diff --git a/src/main/scala/coupledL2/Common.scala b/src/main/scala/coupledL2/Common.scala
index e3b4d09f..05e024fb 100644
--- a/src/main/scala/coupledL2/Common.scala
+++ b/src/main/scala/coupledL2/Common.scala
@@ -51,9 +51,8 @@ class TaskBundle(implicit p: Parameters) extends L2Bundle with HasChannelBits {
   val size = UInt(msgSizeBits.W)
   val sourceId = UInt(sourceIdBits.W) // tilelink sourceID
   val bufIdx = UInt(bufIdxBits.W) // idx of SinkC buffer
-  val needProbeAckData = Bool() // only used for SinkB reqs
+  val needProbeAckData = Bool() // only used for SinkB reqs, whether L3 needs probeAckData
 
-  // val mshrOpType = UInt(mshrOpTypeBits.W) // type of the MSHR task operation
   // MSHR may send Release(Data) or Grant(Data) or ProbeAck(Data) through Main Pipe
   val mshrTask = Bool() // is task from mshr
   val mshrId = UInt(mshrBits.W) // mshr entry index (used only in mshr-task)
@@ -78,9 +77,14 @@ class TaskBundle(implicit p: Parameters) extends L2Bundle with HasChannelBits {
   val tagWen = Bool()
   val dsWen = Bool()
 
-  // for Dir to choose a way not occupied by some unfinished MSHR task
+  // for Dir to choose a way inside wayMask
   val wayMask = UInt(cacheParams.ways.W)
 
+  // for Grant to read replacer to choose a replaced way
+  // for Release to read refillBuf and write to DS
+  val replTask = Bool()
+
+  // for TopDown Monitor (# TopDown)
   val reqSource = UInt(MemReqSource.reqSourceBits.W)
 
   def hasData = opcode(0)
@@ -89,39 +93,44 @@ class TaskBundle(implicit p: Parameters) extends L2Bundle with HasChannelBits {
 
 class PipeStatus(implicit p: Parameters) extends L2Bundle with HasChannelBits
 
 class PipeEntranceStatus(implicit p: Parameters) extends L2Bundle {
-  val tags = Vec(3, UInt(tagBits.W))
-  val sets = Vec(3, UInt(setBits.W))
+  val tags = Vec(4, UInt(tagBits.W))
+  val sets = Vec(4, UInt(setBits.W))
 
   def c_tag = tags(0)
   def b_tag = tags(1)
   def a_tag = tags(2)
+  def g_tag = tags(3) // replRead-Grant
 
   def c_set = sets(0)
   def b_set = sets(1)
   def a_set = sets(2)
+  def g_set = sets(3)
 }
 
 // MSHR exposes signals to MSHRCtl
 class MSHRStatus(implicit p: Parameters) extends L2Bundle with HasChannelBits {
-  val set = UInt(setBits.W)
-  val tag = UInt(tagBits.W)
-  val way = UInt(wayBits.W)
-  val off = UInt(offsetBits.W)
-  val opcode = UInt(3.W)
-  val param = UInt(3.W)
-  val size = UInt(msgSizeBits.W)
-  val source = UInt(sourceIdBits.W)
-  val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W))
-  val aliasTask = aliasBitsOpt.map(_ => Bool())
-  val nestB = Bool()
-  val needProbeAckData = Bool() // only for B reqs
-  val pbIdx = UInt(mshrBits.W)
+  val set = UInt(setBits.W)
+  val reqTag = UInt(tagBits.W)
+  val metaTag = UInt(tagBits.W)
+  val needsRepl = Bool()
   val w_c_resp = Bool()
   val w_d_resp = Bool()
   val w_e_resp = Bool()
-  val fromL2pft = prefetchOpt.map(_ => Bool())
-  val needHint = prefetchOpt.map(_ => Bool())
   val will_free = Bool()
+
+  // val way = UInt(wayBits.W)
+//  val off = UInt(offsetBits.W)
+//  val opcode = UInt(3.W)
+//  val param = UInt(3.W)
+//  val size = UInt(msgSizeBits.W)
+//  val source = UInt(sourceIdBits.W)
+//  val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W))
+//  val aliasTask = aliasBitsOpt.map(_ => Bool())
+//  val needProbeAckData = Bool() // only for B reqs
+//  val pbIdx = UInt(mshrBits.W)
+//  val fromL2pft = prefetchOpt.map(_ => Bool())
+//  val needHint = prefetchOpt.map(_ => Bool())
+
   // for TopDown usage
   val reqSource = UInt(MemReqSource.reqSourceBits.W)
   val is_miss = Bool()
@@ -135,16 +144,25 @@ class MSHRRequest(implicit p: Parameters) extends L2Bundle {
   val task = new TaskBundle()
 }
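
Reviewer note (added for illustration, not part of the patch): `def hasData = opcode(0)` in `TaskBundle` works because TileLink deliberately encodes its C/D-channel opcodes so that the LSB marks a data-carrying message. A minimal plain-Scala check of that convention, using opcode values from the TileLink spec:

```scala
object TLOpcodeSketch extends App {
  // C/D-channel opcode values from the TileLink spec
  val opcodes = Map(
    "AccessAck"     -> 0, "AccessAckData" -> 1,
    "ProbeAck"      -> 4, "ProbeAckData"  -> 5,
    "Release"       -> 6, "ReleaseData"   -> 7,
    "Grant"         -> 4, "GrantData"     -> 5
  )
  def hasData(opcode: Int): Boolean = (opcode & 1) == 1 // same test as opcode(0)
  opcodes.foreach { case (name, op) => println(f"$name%-14s hasData = ${hasData(op)}") }
}
```
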
-// MSHR to ReqBuf for block info
-class MSHRBlockAInfo(implicit p: Parameters) extends L2Bundle {
+// MSHR info to ReqBuf and SinkB
+class MSHRInfo(implicit p: Parameters) extends L2Bundle {
   val set = UInt(setBits.W)
   val way = UInt(wayBits.W)
   val reqTag = UInt(tagBits.W)
   val willFree = Bool()
 
-  // to block Acquire for data about to be replaced until Release done
+  // to block Acquire for to-be-replaced data until Release is done (indicated by receiving ReleaseAck)
   val needRelease = Bool()
+  // MSHR needs to send a ReleaseTask but has not sent it yet
+  // PS: the ReleaseTask is also responsible for writing refillData to DS when an A req misses
+  val releaseNotSent = Bool()
+
+  val metaTag = UInt(tagBits.W)
+  val dirHit = Bool()
+
+  // decide whether B can be nested (same addr as req) or merged with the release (same addr as meta)
+  val nestB = Bool()
+  val mergeB = Bool()
 
   // to drop duplicate prefetch reqs
   val isAcqOrPrefetch = Bool()
@@ -174,8 +192,8 @@ class FSMState(implicit p: Parameters) extends L2Bundle {
   val s_release = Bool() // release downwards
   val s_probeack = Bool() // respond probeack downwards
   val s_refill = Bool() // respond grant upwards
-  // val s_grantack = Bool() // respond grantack downwards
-  // val s_writeback = Bool()// writeback tag/dir
+  val s_merge_probeack = Bool() // respond probeack downwards, Probe merged into A-replacement-Release
+  // val s_grantack = Bool() // respond grantack downwards, moved to GrantBuf
   // val s_triggerprefetch = prefetchOpt.map(_ => Bool())
 
   // wait
@@ -189,8 +207,7 @@ class FSMState(implicit p: Parameters) extends L2Bundle {
   val w_grant = Bool()
   val w_releaseack = Bool()
   val w_grantack = Bool()
-
-  val w_release_sent = Bool()
+  val w_replResp = Bool()
 }
 
 class SourceAReq(implicit p: Parameters) extends L2Bundle {
@@ -215,17 +232,16 @@ class SourceBReq(implicit p: Parameters) extends L2Bundle {
 }
 
 class BlockInfo(implicit p: Parameters) extends L2Bundle {
+  val blockG_s1 = Bool()
   val blockA_s1 = Bool()
   val blockB_s1 = Bool()
   val blockC_s1 = Bool()
 }
 
+// used for nested C Release
 class NestedWriteback(implicit p: Parameters) extends L2Bundle {
   val set = UInt(setBits.W)
   val tag = UInt(tagBits.W)
-  val b_toN = Bool()
-  val b_toB = Bool()
-  val b_clr_dirty = Bool()
   val c_set_dirty = Bool()
 }
diff --git a/src/main/scala/coupledL2/CoupledL2.scala b/src/main/scala/coupledL2/CoupledL2.scala
index 7a376a2b..069e4e84 100644
--- a/src/main/scala/coupledL2/CoupledL2.scala
+++ b/src/main/scala/coupledL2/CoupledL2.scala
@@ -49,12 +49,9 @@ trait HasCoupledL2Parameters {
     else cacheParams.clientCaches.head.aliasBitsOpt
   val pageOffsetBits = log2Ceil(cacheParams.pageBytes)
 
-  val bufBlocks = 8 // hold data that flows in MainPipe
+  val bufBlocks = 4 // hold data that flows in MainPipe
   val bufIdxBits = log2Up(bufBlocks)
 
-  // 1 cycle for sram read, and latch for another cycle
-  val sramLatency = 2
-
   val releaseBufWPorts = 3 // sinkC and mainpipe s5, s6
 
   // Prefetch
diff --git a/src/main/scala/coupledL2/DataStorage.scala b/src/main/scala/coupledL2/DataStorage.scala
index 20ca1e56..3329ebc1 100644
--- a/src/main/scala/coupledL2/DataStorage.scala
+++ b/src/main/scala/coupledL2/DataStorage.scala
@@ -59,5 +59,7 @@ class DataStorage(implicit p: Parameters) extends L2Module {
   array.io.w.apply(wen, io.wdata, arrayIdx, 1.U)
   array.io.r.apply(ren, arrayIdx)
 
-  io.rdata := RegNextN(array.io.r.resp.data(0), sramLatency - 1)
+  // TODO: timing: we should not use a reg here; make this a multicycle path instead
+  // (read at s3, pass through s4, reach the destination at s5)
+  io.rdata := RegNextN(array.io.r.resp.data(0), 1)
 }
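
Aside (illustrative, not part of the patch): `RegNextN(x, 1)` above delays the SRAM output by one register stage; the deleted `sramLatency` constant made it two. Assuming `utility.RegNextN` behaves like a chain of `RegNext` stages (an assumption on my part; the helper also appears to take an optional init, as in `RegNextN(chnl_fire_s3, 2, Some(false.B))` later in this patch), a minimal equivalent would be:

```scala
import chisel3._

object RegNextNSketch {
  // Delay `in` by n register stages; the init value, if given, seeds every stage.
  def apply[T <: Data](in: T, n: Int, init: Option[T] = None): T =
    (0 until n).foldLeft(in) { (data, _) =>
      init match {
        case Some(i) => RegNext(data, i)
        case None    => RegNext(data)
      }
    }
}
```
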
diff --git a/src/main/scala/coupledL2/Directory.scala b/src/main/scala/coupledL2/Directory.scala
index 0352cb60..26d1ec91 100644
--- a/src/main/scala/coupledL2/Directory.scala
+++ b/src/main/scala/coupledL2/Directory.scala
@@ -21,9 +21,9 @@ import chisel3._
 import chisel3.util._
 import freechips.rocketchip.util.SetAssocLRU
 import coupledL2.utils._
-import utility.ParallelPriorityMux
+import utility.{ParallelPriorityMux, RegNextN}
 import chipsalliance.rocketchip.config.Parameters
-import freechips.rocketchip.tilelink.TLMessages
+import freechips.rocketchip.tilelink.TLMessages._
 
 class MetaEntry(implicit p: Parameters) extends L2Bundle {
   val dirty = Bool()
@@ -60,8 +60,12 @@ object MetaEntry {
 class DirRead(implicit p: Parameters) extends L2Bundle {
   val tag = UInt(tagBits.W)
   val set = UInt(setBits.W)
+  // dirResult.way must only be chosen within wayMask
   val wayMask = UInt(cacheParams.ways.W)
   val replacerInfo = new ReplacerInfo()
+  // dirRead issued on refill
+  val refill = Bool()
+  val mshrId = UInt(mshrBits.W)
 }
 
 class DirResult(implicit p: Parameters) extends L2Bundle {
@@ -74,6 +78,15 @@ class DirResult(implicit p: Parameters) extends L2Bundle {
   val replacerInfo = new ReplacerInfo() // for TopDown usage
 }
 
+class ReplacerResult(implicit p: Parameters) extends L2Bundle {
+  val tag = UInt(tagBits.W)
+  val set = UInt(setBits.W)
+  val way = UInt(wayBits.W)
+  val meta = new MetaEntry()
+  val mshrId = UInt(mshrBits.W)
+  val retry = Bool()
+}
+
 class MetaWrite(implicit p: Parameters) extends L2Bundle {
   val set = UInt(setBits.W)
   val wayOH = UInt(cacheParams.ways.W)
@@ -93,6 +106,9 @@ class Directory(implicit p: Parameters) extends L2Module {
     val resp = Output(new DirResult)
     val metaWReq = Flipped(ValidIO(new MetaWrite))
     val tagWReq = Flipped(ValidIO(new TagWrite))
+    val replResp = ValidIO(new ReplacerResult)
+    // used to count occupied ways, for Grant retry
+    val msInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo)))
   })
 
   def invalid_way_sel(metaVec: Seq[MetaEntry], repl: UInt) = {
@@ -108,17 +124,36 @@ class Directory(implicit p: Parameters) extends L2Module {
   val tagWen  = io.tagWReq.valid
   val metaWen = io.metaWReq.valid
-  val replacerWen = RegInit(false.B)
+  val replacerWen = WireInit(false.B)
 
   val tagArray  = Module(new BankedSRAM(UInt(tagBits.W), sets, ways, banks, singlePort = true))
   val metaArray = Module(new BankedSRAM(new MetaEntry, sets, ways, banks, singlePort = true))
   val tagRead = Wire(Vec(ways, UInt(tagBits.W)))
   val metaRead = Wire(Vec(ways, new MetaEntry()))
 
-  val reqValidReg = RegNext(io.read.fire, false.B)
-
   val resetFinish = RegInit(false.B)
   val resetIdx = RegInit((sets - 1).U)
 
+  // Replacer
+  val repl = ReplacementPolicy.fromString(cacheParams.replacement, ways)
+  val random_repl = cacheParams.replacement == "random"
+  val replacer_sram_opt = if(random_repl) None else
+    Some(Module(new BankedSRAM(UInt(repl.nBits.W), sets, 1, banks, singlePort = true, shouldReset = true)))
+
+  /* ====== Generate response signals ====== */
+  // hit/way calculation is done at stage 3, because SRAM latency is high at high frequency
+  /* stage 1: io.read.fire, access Tag/Meta
+     stage 2: get Tag/Meta, latch
+     stage 3: calculate hit/way and the chosen meta/tag by way
+   */
+  val reqValid_s2 = RegNext(io.read.fire, false.B)
+  val reqValid_s3 = RegNext(reqValid_s2, false.B)
+  val req_s2 = RegEnable(io.read.bits, 0.U.asTypeOf(io.read.bits), io.read.fire)
+  val req_s3 = RegEnable(req_s2, 0.U.asTypeOf(req_s2), reqValid_s2)
+
+  val refillReqValid_s2 = RegNext(io.read.fire && io.read.bits.refill, false.B)
+  val refillReqValid_s3 = RegNext(refillReqValid_s2, false.B)
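
Reviewer note (sketch, not part of the patch): the comment block above describes the new three-stage directory read. A stripped-down standalone module with illustrative names, showing just the s1-fire / s2-latch / s3-compare structure this hunk introduces:

```scala
import chisel3._
import chisel3.util._

class DirPipeSketch(tagBits: Int, ways: Int) extends Module {
  val io = IO(new Bundle {
    val fire_s1 = Input(Bool())                      // s1: SRAM read fires
    val tag_s1  = Input(UInt(tagBits.W))
    val tags_s2 = Input(Vec(ways, UInt(tagBits.W)))  // raw SRAM output, valid at s2
    val hit_s3  = Output(Bool())
    val way_s3  = Output(UInt(log2Up(ways).W))
  })
  val valid_s2 = RegNext(io.fire_s1, false.B)
  val tag_s2   = RegEnable(io.tag_s1, io.fire_s1)
  val valid_s3 = RegNext(valid_s2, false.B)          // s2 only latches
  val tag_s3   = RegEnable(tag_s2, valid_s2)
  val tags_s3  = RegEnable(io.tags_s2, valid_s2)
  val hitVec   = tags_s3.map(_ === tag_s3)           // s3 does the comparison
  io.hit_s3 := valid_s3 && Cat(hitVec).orR
  io.way_s3 := OHToUInt(hitVec)
}
```
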
+
   // Tag R/W
   tagRead := tagArray.io.r(io.read.fire, io.read.bits.set).resp.data
   tagArray.io.w(
@@ -137,138 +172,141 @@ class Directory(implicit p: Parameters) extends L2Module {
     io.metaWReq.bits.wayOH
   )
 
-  // Generate response signals
-  /* stage 1: io.read.fire, access Tag/Meta
-     stage 2: get Tag/Meta, calculate hit/way
-     stage 3: output latched hit/way and chosen meta/tag by way
-   */
-  // TODO: how about moving hit/way calculation to stage 2? Cuz SRAM latency can be high under high frequency
-  val reqReg = RegEnable(io.read.bits, 0.U.asTypeOf(io.read.bits), enable = io.read.fire)
-  val hit_s2 = Wire(Bool())
-  val way_s2 = Wire(UInt(wayBits.W))
+  val metaAll_s3 = RegEnable(metaRead, 0.U.asTypeOf(metaRead), reqValid_s2)
+  val tagAll_s3 = RegEnable(tagRead, 0.U.asTypeOf(tagRead), reqValid_s2)
 
-  // Replacer
-  val repl = ReplacementPolicy.fromString(cacheParams.replacement, ways)
-  val random_repl = cacheParams.replacement == "random"
-  val replacer_sram_opt = if(random_repl) None else
-    Some(Module(new BankedSRAM(UInt(repl.nBits.W), sets, 1, banks, singlePort = true, shouldReset = true)))
+  val tagMatchVec = tagAll_s3.map(_ (tagBits - 1, 0) === req_s3.tag)
+  val metaValidVec = metaAll_s3.map(_.state =/= MetaData.INVALID)
+  val hitVec = tagMatchVec.zip(metaValidVec).map(x => x._1 && x._2)
+
+  val hitWay = OHToUInt(hitVec)
+  val replaceWay = WireInit(UInt(wayBits.W), 0.U)
+  val (inv, invalidWay) = invalid_way_sel(metaAll_s3, replaceWay)
+  val chosenWay = Mux(inv, invalidWay, replaceWay)
+  // if chosenWay is not in wayMask, choose a way inside wayMask instead
+  // TODO: consider removing this if it turns out unused, for better timing
+  val finalWay = Mux(
+    req_s3.wayMask(chosenWay),
+    chosenWay,
+    PriorityEncoder(req_s3.wayMask) // can be optimized
+  )
+
+  val hit_s3 = Cat(hitVec).orR
+  val way_s3 = Mux(hit_s3, hitWay, finalWay)
+  val meta_s3 = metaAll_s3(way_s3)
+  val tag_s3 = tagAll_s3(way_s3)
+  val set_s3 = req_s3.set
+  val replacerInfo_s3 = req_s3.replacerInfo
+
+  io.resp.hit := hit_s3
+  io.resp.way := way_s3
+  io.resp.meta := meta_s3
+  io.resp.tag := tag_s3
+  io.resp.set := set_s3
+  io.resp.error := false.B // depends on ECC
+  io.resp.replacerInfo := replacerInfo_s3
+
+  dontTouch(io)
+  dontTouch(metaArray.io)
+  dontTouch(tagArray.io)
+
+  //[deprecated] io.read.ready := !io.metaWReq.valid && !io.tagWReq.valid && !replacerWen
+  val replacerRready = if(cacheParams.replacement == "random") true.B else replacer_sram_opt.get.io.r.req.ready
+  io.read.ready := tagArray.io.r.req.ready && metaArray.io.r.req.ready && replacerRready
 
-  val repl_state = if(random_repl){
+  /* ====== refill retry ====== */
+  // if refill chooses a way that has not finished writing its refillData back to DS (in MSHR Release),
+  // or a way that is being used by an Alias-Acquire,
+  // we cancel the Grant and let it retry
+  // TODO: timing?
+  val wayConflictMask = VecInit(io.msInfo.map(s =>
+    s.valid && s.bits.set === req_s3.set && (s.bits.releaseNotSent || s.bits.dirHit) && s.bits.way === finalWay
+  )).asUInt
+  val refillRetry = wayConflictMask.orR
+
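
Sketch (illustrative, not the patch itself) of the way-conflict check just added: a refill must not claim a way that a live MSHR is still releasing (`releaseNotSent`) or is holding through a directory hit (`dirHit`); port names below are simplified stand-ins for the `msInfo` fields:

```scala
import chisel3._
import chisel3.util._

class WayConflictSketch(setBits: Int, wayBits: Int, nMshrs: Int) extends Module {
  val io = IO(new Bundle {
    val set      = Input(UInt(setBits.W))     // set of the refilling req
    val finalWay = Input(UInt(wayBits.W))     // way picked by the replacer
    val msValid  = Input(Vec(nMshrs, Bool()))
    val msSet    = Input(Vec(nMshrs, UInt(setBits.W)))
    val msWay    = Input(Vec(nMshrs, UInt(wayBits.W)))
    val msBusy   = Input(Vec(nMshrs, Bool())) // releaseNotSent || dirHit
    val retry    = Output(Bool())
  })
  val conflictVec = (0 until nMshrs).map { i =>
    io.msValid(i) && io.msSet(i) === io.set && io.msBusy(i) && io.msWay(i) === io.finalWay
  }
  io.retry := VecInit(conflictVec).asUInt.orR // any conflict cancels the Grant
}
```
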
+  /* ======!! Replacement logic !!====== */
+  /* ====== Read, choose replaceWay ====== */
+  val repl_state_s3 = if(random_repl) {
     when(io.tagWReq.fire){
       repl.miss
     }
     0.U
-  } else if(cacheParams.replacement == "srrip"){
-    val repl_sram_r = replacer_sram_opt.get.io.r(io.read.fire(), io.read.bits.set).resp.data(0)
-    val repl_state_hold = WireInit(0.U(repl.nBits.W))
-    repl_state_hold := HoldUnless(repl_sram_r, RegNext(io.read.fire(), false.B))
-    val next_state = repl.get_next_state(repl_state_hold, way_s2, hit_s2)
+  } else {
+    val repl_sram_r = replacer_sram_opt.get.io.r(io.read.fire, io.read.bits.set).resp.data(0)
+    val repl_state = RegEnable(repl_sram_r, 0.U(repl.nBits.W), reqValid_s2)
+    repl_state
+  }
+
+  replaceWay := repl.get_replace_way(repl_state_s3)
+
+  io.replResp.valid := refillReqValid_s3
+  io.replResp.bits.tag := tagAll_s3(finalWay)
+  io.replResp.bits.set := req_s3.set
+  io.replResp.bits.way := finalWay
+  io.replResp.bits.meta := metaAll_s3(finalWay)
+  io.replResp.bits.mshrId := req_s3.mshrId
+  io.replResp.bits.retry := refillRetry
+
+  /* ====== Update ====== */
+  // update the replacer only on A-channel hits or refills, at stage 3
+  val updateHit = reqValid_s3 && hit_s3 && req_s3.replacerInfo.channel(0) &&
+    (req_s3.replacerInfo.opcode === AcquirePerm || req_s3.replacerInfo.opcode === AcquireBlock)
+  val updateRefill = refillReqValid_s3 && !refillRetry
+  replacerWen := updateHit || updateRefill
+
+  // !!![TODO]!!! check this @CLS
+  // hit-Promotion, miss-Insertion for RRIP, so refill should use hit = false.B
+  val touch_way_s3 = Mux(refillReqValid_s3, replaceWay, way_s3)
+  val rrip_hit_s3 = Mux(refillReqValid_s3, false.B, hit_s3)
+
+  if(cacheParams.replacement == "srrip"){
+    val next_state_s3 = repl.get_next_state(repl_state_s3, touch_way_s3, rrip_hit_s3)
     val repl_init = Wire(Vec(ways, UInt(2.W)))
     repl_init.foreach(_ := 2.U(2.W))
     replacer_sram_opt.get.io.w(
       !resetFinish || replacerWen,
-      Mux(resetFinish, RegNext(next_state, 0.U.asTypeOf(next_state)), repl_init.asUInt),
-      Mux(resetFinish, RegNext(reqReg.set, 0.U.asTypeOf(reqReg.set)), resetIdx),
+      Mux(resetFinish, next_state_s3, repl_init.asUInt),
+      Mux(resetFinish, set_s3, resetIdx),
       1.U
     )
-
-    repl_state_hold
   } else if(cacheParams.replacement == "drrip"){
     //Set Dueling
     val PSEL = RegInit(512.U(10.W)) //32-monitor sets, 10-bits psel
     // track monitor sets' hit rate for each policy: srrip-0,128...3968;brrip-64,192...4032
-    when(reqValidReg && (reqReg.set(6,0)===0.U) && !hit_s2){ //SDMs_srrip miss
+    when(refillReqValid_s3 && (set_s3(6,0)===0.U) && !rrip_hit_s3){ //SDMs_srrip miss
       PSEL := PSEL + 1.U
-    } .elsewhen(reqValidReg && (reqReg.set(6,0)===64.U) && !hit_s2){ //SDMs_brrip miss
+    } .elsewhen(refillReqValid_s3 && (set_s3(6,0)===64.U) && !rrip_hit_s3){ //SDMs_brrip miss
       PSEL := PSEL - 1.U
     }
-
-    val repl_sram_r = replacer_sram_opt.get.io.r(io.read.fire(), io.read.bits.set).resp.data(0)
-    val repl_state_hold = WireInit(0.U(repl.nBits.W))
-    repl_state_hold := HoldUnless(repl_sram_r, RegNext(io.read.fire(), false.B))
     // decide which policy to use via the policy selection counter, for insertion
-    /*if set -> SDMs: use fix policy
-      else if PSEL(MSB)==0: use srrip
-      else if PSEL(MSB)==1: use brrip*/
+    /* if set -> SDMs: use fixed policy
+       else if PSEL(MSB)==0: use srrip
+       else if PSEL(MSB)==1: use brrip */
     val repl_type = WireInit(false.B)
-    repl_type := Mux(reqReg.set(6,0)===0.U, false.B,
-      Mux(reqReg.set(6,0)===64.U, true.B,
-        Mux(PSEL(9)===0.U, false.B, true.B))) // false.B - srrip, true.B - brrip
-    val next_state = repl.get_next_state(repl_state_hold, way_s2, hit_s2, repl_type)
+    repl_type := Mux(set_s3(6,0)===0.U, false.B,
+      Mux(set_s3(6,0)===64.U, true.B,
+        Mux(PSEL(9)===0.U, false.B, true.B))) // false.B - srrip, true.B - brrip
+    val next_state_s3 = repl.get_next_state(repl_state_s3, touch_way_s3, rrip_hit_s3, repl_type)
 
     val repl_init = Wire(Vec(ways, UInt(2.W)))
     repl_init.foreach(_ := 2.U(2.W))
     replacer_sram_opt.get.io.w(
       !resetFinish || replacerWen,
-      Mux(resetFinish, RegNext(next_state, 0.U.asTypeOf(next_state)), repl_init.asUInt),
-      Mux(resetFinish, RegNext(reqReg.set, 0.U.asTypeOf(reqReg.set)), resetIdx),
+      Mux(resetFinish, next_state_s3, repl_init.asUInt),
+      Mux(resetFinish, set_s3, resetIdx),
       1.U
     )
-
-    repl_state_hold
   } else {
-    val repl_sram_r = replacer_sram_opt.get.io.r(io.read.fire, io.read.bits.set).resp.data(0)
-    val repl_state_hold = WireInit(0.U(repl.nBits.W))
-    repl_state_hold := HoldUnless(repl_sram_r, RegNext(io.read.fire, false.B))
-    val next_state = repl.get_next_state(repl_state_hold, way_s2)
+    val next_state_s3 = repl.get_next_state(repl_state_s3, touch_way_s3)
     replacer_sram_opt.get.io.w(
       !resetFinish || replacerWen,
-      Mux(resetFinish, RegNext(next_state, 0.U.asTypeOf(next_state)), 0.U),
-      Mux(resetFinish, RegNext(reqReg.set, 0.U.asTypeOf(reqReg.set)), resetIdx),
+      Mux(resetFinish, next_state_s3, 0.U),
+      Mux(resetFinish, set_s3, resetIdx),
       1.U
     )
-    repl_state_hold
-  }
-
-  val tagMatchVec = tagRead.map(_ (tagBits - 1, 0) === reqReg.tag)
-  val metaValidVec = metaRead.map(_.state =/= MetaData.INVALID)
-  val hitVec = tagMatchVec.zip(metaValidVec).map(x => x._1 && x._2)
-  val hitWay = OHToUInt(hitVec)
-  val replaceWay = repl.get_replace_way(repl_state)
-  val (inv, invalidWay) = invalid_way_sel(metaRead, replaceWay)
-  val chosenWay = Mux(inv, invalidWay, replaceWay)
-  // if chosenWay not in wayMask, then choose a way in wayMask
-  val finalWay = Mux(
-    reqReg.wayMask(chosenWay),
-    chosenWay,
-    PriorityEncoder(reqReg.wayMask)
-  )
-
-  hit_s2 := Cat(hitVec).orR
-  way_s2 := Mux(hit_s2, hitWay, finalWay)
-
-  val hit_s3 = RegEnable(hit_s2, false.B, reqValidReg)
-  val way_s3 = RegEnable(way_s2, 0.U, reqValidReg)
-  val metaAll_s3 = RegEnable(metaRead, 0.U.asTypeOf(metaRead), reqValidReg)
-  val tagAll_s3 = RegEnable(tagRead, 0.U.asTypeOf(tagRead), reqValidReg)
-  val meta_s3 = metaAll_s3(way_s3)
-  val tag_s3 = tagAll_s3(way_s3)
-  val set_s3 = RegEnable(reqReg.set, reqValidReg)
-  val replacerInfo_s3 = RegEnable(reqReg.replacerInfo, reqValidReg)
-
-  io.resp.hit := hit_s3
-  io.resp.way := way_s3
-  io.resp.meta := meta_s3
-  io.resp.tag := tag_s3
-  io.resp.set := set_s3
-  io.resp.error := false.B // depends on ECC
-  io.resp.replacerInfo := replacerInfo_s3
-
-  dontTouch(io)
-  dontTouch(metaArray.io)
-  dontTouch(tagArray.io)
-
-  // io.read.ready := !io.metaWReq.valid && !io.tagWReq.valid && !replacerWen
-  val replacerRready = if(cacheParams.replacement == "random") true.B else replacer_sram_opt.get.io.r.req.ready
-  io.read.ready := tagArray.io.r.req.ready && metaArray.io.r.req.ready && replacerRready
-
-  val update = reqReg.replacerInfo.channel(0) && (reqReg.replacerInfo.opcode === TLMessages.AcquirePerm || reqReg.replacerInfo.opcode === TLMessages.AcquireBlock)
-  when(reqValidReg && update) {
-    replacerWen := true.B
-  }.otherwise {
-    replacerWen := false.B
   }
 
+  /* ====== Reset ====== */
   when(resetIdx === 0.U) {
     resetFinish := true.B
   }
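
Aside (software model, not RTL; added for illustration): the DRRIP branch above implements set dueling. Sets 0, 128, ..., 3968 always run SRRIP, sets 64, 192, ..., 4032 always run BRRIP, and a 10-bit PSEL counter, trained on sample-set refill misses, picks the policy for every follower set:

```scala
object SetDuelingSketch {
  private var psel = 512                 // 10-bit counter, starts mid-range

  def onRefillMiss(set: Int): Unit = (set & 127) match {
    case 0  => psel = (psel + 1) & 1023  // SRRIP sample set missed (wraps, like the RTL counter)
    case 64 => psel = (psel - 1) & 1023  // BRRIP sample set missed
    case _  =>                           // follower sets do not train PSEL
  }

  // false = SRRIP, true = BRRIP, mirroring repl_type in the patch
  def useBrrip(set: Int): Boolean = (set & 127) match {
    case 0  => false                     // sample sets keep their fixed policy
    case 64 => true
    case _  => (psel >> 9) == 1          // followers follow PSEL's MSB
  }
}
```
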
@@ -276,6 +314,6 @@ class Directory(implicit p: Parameters) extends L2Module {
     resetIdx := resetIdx - 1.U
   }
 
-  XSPerfAccumulate(cacheParams, "dirRead_cnt", reqValidReg)
-  XSPerfAccumulate(cacheParams, "choose_busy_way", reqValidReg && !reqReg.wayMask(chosenWay))
+  XSPerfAccumulate(cacheParams, "dirRead_cnt", io.read.fire)
+  XSPerfAccumulate(cacheParams, "choose_busy_way", reqValid_s3 && !req_s3.wayMask(chosenWay))
 }
diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala
index f05ddcb3..97874a3e 100644
--- a/src/main/scala/coupledL2/GrantBuffer.scala
+++ b/src/main/scala/coupledL2/GrantBuffer.scala
@@ -120,6 +120,7 @@ class GrantBuffer(implicit p: Parameters) extends BaseGrantBuffer {
   //TODO: or should we still Stall B req?
   // A-replace related rprobe is handled in SourceB
   io.toReqArb.blockSinkReqEntrance.blockC_s1 := noSpaceForSinkReq
+  io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B
   io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq
 
   selectOH.asBools.zipWithIndex.foreach {
diff --git a/src/main/scala/coupledL2/GrantBufferFIFO.scala b/src/main/scala/coupledL2/GrantBufferFIFO.scala
index 4e8a6044..d7fa1f41 100644
--- a/src/main/scala/coupledL2/GrantBufferFIFO.scala
+++ b/src/main/scala/coupledL2/GrantBufferFIFO.scala
@@ -143,6 +143,7 @@ class GrantBufferFIFO(implicit p: Parameters) extends BaseGrantBuffer with HasCircularQueuePtrHelper {
   //TODO: or should we still Stall B req?
   // A-replace related rprobe is handled in SourceB
   io.toReqArb.blockSinkReqEntrance.blockC_s1 := noSpaceForSinkReq
+  io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B
   io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq
 
   when(io.d_task.fire() && !(io.d_task.bits.task.opcode === HintAck && !io.d_task.bits.task.fromL2pft.getOrElse(false.B))) {
diff --git a/src/main/scala/coupledL2/MSHR.scala b/src/main/scala/coupledL2/MSHR.scala
index 1401c27f..e571b10c 100644
--- a/src/main/scala/coupledL2/MSHR.scala
+++ b/src/main/scala/coupledL2/MSHR.scala
@@ -49,18 +49,16 @@ class MSHR(implicit p: Parameters) extends L2Module {
   val io = IO(new Bundle() {
     val id = Input(UInt(mshrBits.W))
     val status = ValidIO(new MSHRStatus)
-    val toReqBuf = ValidIO(new MSHRBlockAInfo)
+    val msInfo = ValidIO(new MSHRInfo)
     val alloc = Flipped(ValidIO(new MSHRRequest))
     val tasks = new MSHRTasks()
     val resps = new MSHRResps()
     val nestedwb = Input(new NestedWriteback)
     val nestedwbData = Output(Bool())
+    val bMergeTask = Flipped(ValidIO(new BMergeTask))
+    val replResp = Flipped(ValidIO(new ReplacerResult))
   })
 
-  val initState = Wire(new FSMState())
-  val state = RegInit(new FSMState(), initState)
-  initState.elements.foreach(_._2 := true.B)
-  val dirResult = RegInit(0.U.asTypeOf(new DirResult()))
   val gotT = RegInit(false.B) // L3 might return T even though L2 wants B
   val gotDirty = RegInit(false.B)
   val gotGrantData = RegInit(false.B)
@@ -70,30 +68,19 @@ class MSHR(implicit p: Parameters) extends L2Module {
   val timer = RegInit(0.U(64.W)) // for performance analysis
 
   /* MSHR Allocation */
-  val status_reg = RegInit(0.U.asTypeOf(Valid(new MSHRStatus())))
-  val req = status_reg.bits
-  val meta = dirResult.meta
+  val req_valid = RegInit(false.B)
+  val req = RegInit(0.U.asTypeOf(new TaskBundle()))
+  val dirResult = RegInit(0.U.asTypeOf(new DirResult()))
+  val meta = dirResult.meta
+  val initState = Wire(new FSMState())
+  initState.elements.foreach(_._2 := true.B)
+  val state = RegInit(new FSMState(), initState)
 
   when(io.alloc.valid) {
-    status_reg.valid := true.B
-    state := io.alloc.bits.state
-    dirResult := io.alloc.bits.dirResult
-    val msTask = io.alloc.bits.task
-    req.channel := msTask.channel
-    req.tag := msTask.tag
-    req.set := msTask.set
-    req.off := msTask.off
-    req.way := msTask.way
-    req.opcode := msTask.opcode
-    req.param := msTask.param
-    req.size := msTask.size
-    req.source := msTask.sourceId
-    req.needProbeAckData := msTask.needProbeAckData
-    req.alias.foreach(_ := msTask.alias.getOrElse(0.U))
-    req.aliasTask.foreach(_ := msTask.aliasTask.getOrElse(false.B))
-    req.pbIdx := msTask.pbIdx
-    req.fromL2pft.foreach(_ := msTask.fromL2pft.get)
-    req.reqSource := msTask.reqSource
+    req_valid := true.B
+    state := io.alloc.bits.state
+    dirResult := io.alloc.bits.dirResult
+    req := io.alloc.bits.task
     gotT := false.B
     gotDirty := false.B
     gotGrantData := false.B
@@ -121,14 +108,19 @@ class MSHR(implicit p: Parameters) extends L2Module {
 
   /* ======== Task allocation ======== */
   // Theoretically, data to be released is saved in ReleaseBuffer, so Acquire can be sent as soon as req enters mshr
-  io.tasks.source_a.valid := !state.s_acquire && state.s_release && state.w_release_sent
+  io.tasks.source_a.valid := !state.s_acquire
   io.tasks.source_b.valid := !state.s_pprobe || !state.s_rprobe
-  val mp_release_valid = !state.s_release && state.w_rprobeacklast
+  val mp_release_valid = !state.s_release && state.w_rprobeacklast && !io.bMergeTask.valid &&
+    state.w_grantlast &&
+    state.w_replResp // release after the Grant to L1 is sent and replRead returns
+
   val mp_probeack_valid = !state.s_probeack && state.w_pprobeacklast
-  val mp_grant_valid = !state.s_refill && state.w_grantlast && state.w_rprobeacklast && state.s_release // [Alias] grant after rprobe done
-  io.tasks.mainpipe.valid := mp_release_valid || mp_probeack_valid || mp_grant_valid
+  val mp_merge_probeack_valid = !state.s_merge_probeack && state.w_rprobeacklast
+  val mp_grant_valid = !state.s_refill && state.w_grantlast && state.w_rprobeacklast // [Alias] grant after rprobe done
+  io.tasks.mainpipe.valid := mp_release_valid || mp_probeack_valid || mp_merge_probeack_valid || mp_grant_valid
   // io.tasks.prefetchTrain.foreach(t => t.valid := !state.s_triggerprefetch.getOrElse(true.B))
+
   val a_task = {
     val oa = io.tasks.source_a.bits
     oa.tag := req.tag
@@ -158,7 +150,6 @@ class MSHR(implicit p: Parameters) extends L2Module {
     ob.set := dirResult.set
     ob.off := 0.U
     ob.opcode := Probe
-    // ob.param := Mux(!state.s_pprobe, req.param, toN)
     ob.param := Mux(
       !state.s_pprobe,
       req.param,
@@ -171,7 +162,7 @@ class MSHR(implicit p: Parameters) extends L2Module {
     ob.alias.foreach(_ := meta.alias.getOrElse(0.U))
     ob
   }
-  val mp_release, mp_probeack, mp_grant = Wire(new TaskBundle)
+  val mp_release, mp_probeack, mp_merge_probeack, mp_grant = Wire(new TaskBundle)
   val mp_release_task = {
     mp_release.channel := req.channel
     mp_release.tag := dirResult.tag
@@ -197,16 +188,19 @@ class MSHR(implicit p: Parameters) extends L2Module {
     mp_release.mshrTask := true.B
     mp_release.mshrId := io.id
     mp_release.aliasTask.foreach(_ := false.B)
-    mp_release.useProbeData := true.B // read ReleaseBuf when useProbeData && opcode(0) is true
+    // mp_release definitely reads releaseBuf and refillBuf at ReqArb,
+    // and it needs to write refillData to DS, so useProbeData is set false per the DS.wdata logic
+    mp_release.useProbeData := false.B
+    mp_release.way := dirResult.way
     mp_release.pbIdx := 0.U(mshrBits.W)
     mp_release.fromL2pft.foreach(_ := false.B)
     mp_release.needHint.foreach(_ := false.B)
-    mp_release.way := req.way
     mp_release.dirty := meta.dirty && meta.state =/= INVALID || probeDirty
-    mp_release.metaWen := true.B
+    mp_release.metaWen := false.B
     mp_release.meta := MetaEntry()
     mp_release.tagWen := false.B
-    mp_release.dsWen := false.B
+    mp_release.dsWen := true.B
+    mp_release.replTask := true.B
     mp_release.wayMask := 0.U(cacheParams.ways.W)
     mp_release.reqSource := 0.U(MemReqSource.reqSourceBits.W)
     mp_release
   }
@@ -227,7 +221,6 @@ class MSHR(implicit p: Parameters) extends L2Module {
       Cat(isT(meta.state), req.param(bdWidth - 1, 0)),
       Seq(
         Cat(false.B, toN) -> BtoN,
-        Cat(false.B, toB) -> BtoB, // TODO: make sure that this req will not enter mshr in this situation
         Cat(true.B, toN) -> TtoN,
         Cat(true.B, toB) -> TtoB
       )
@@ -239,11 +232,11 @@ class MSHR(implicit p: Parameters) extends L2Module {
     mp_probeack.mshrTask := true.B
     mp_probeack.mshrId := io.id
     mp_probeack.aliasTask.foreach(_ := false.B)
-    mp_probeack.useProbeData := true.B // read ReleaseBuf when useProbeData && opcode(0) is true
+    mp_probeack.useProbeData := true.B // write [probeAckData] to DS, if not probed toN
+    mp_probeack.way := dirResult.way
     mp_probeack.pbIdx := 0.U(mshrBits.W)
     mp_probeack.fromL2pft.foreach(_ := false.B)
     mp_probeack.needHint.foreach(_ := false.B)
-    mp_probeack.way := req.way
     mp_probeack.dirty := meta.dirty && meta.state =/= INVALID || probeDirty
     mp_probeack.meta := MetaEntry(
       dirty = false.B,
@@ -266,16 +259,70 @@ class MSHR(implicit p: Parameters) extends L2Module {
     mp_probeack.dsWen := req.param =/= toN && probeDirty
     mp_probeack.wayMask := 0.U(cacheParams.ways.W)
     mp_probeack.reqSource := 0.U(MemReqSource.reqSourceBits.W)
+    mp_probeack.replTask := false.B
     mp_probeack
   }
 
+  val mp_merge_probeack_task = {
+    val task = RegEnable(io.bMergeTask.bits.task, 0.U.asTypeOf(new TaskBundle), io.bMergeTask.valid)
+    mp_merge_probeack.channel := task.channel
+    mp_merge_probeack.tag := task.tag
+    mp_merge_probeack.set := task.set
+    mp_merge_probeack.off := task.off
+    mp_merge_probeack.opcode := Mux(
+      meta.dirty && isT(meta.state) || probeDirty || task.needProbeAckData,
+      ProbeAckData,
+      ProbeAck
+    )
+    mp_merge_probeack.param := ParallelLookUp(
+      Cat(isT(meta.state), task.param(bdWidth - 1, 0)),
+      Seq(
+        Cat(false.B, toN) -> BtoN,
+        Cat(true.B, toN) -> TtoN,
+        Cat(true.B, toB) -> TtoB
+      )
+    )
+    mp_merge_probeack.mshrTask := true.B
+    mp_merge_probeack.mshrId := io.id
+    // mp_merge_probeack definitely reads releaseBuf and refillBuf at ReqArb,
+    // and it needs to write refillData to DS, so useProbeData is set false per the DS.wdata logic
+    mp_merge_probeack.useProbeData := false.B
+    mp_merge_probeack.way := dirResult.way
+    mp_merge_probeack.dirty := meta.dirty && meta.state =/= INVALID || probeDirty
+    mp_merge_probeack.meta := MetaEntry(
+      dirty = false.B,
+      state = Mux(task.param === toN, INVALID, Mux(task.param === toB, BRANCH, meta.state)),
+      clients = Fill(clientBits, !probeGotN),
+      alias = meta.alias,
+      prefetch = task.param =/= toN && meta_pft,
+      accessed = task.param =/= toN && meta.accessed
+    )
+    mp_merge_probeack.metaWen := true.B
+    mp_merge_probeack.tagWen := false.B
+    mp_merge_probeack.dsWen := task.param =/= toN && probeDirty
+
+    // unused, set to default
+    mp_merge_probeack.alias.foreach(_ := 0.U)
+    mp_merge_probeack.aliasTask.foreach(_ := false.B)
+    mp_merge_probeack.size := offsetBits.U
+    mp_merge_probeack.sourceId := 0.U
+    mp_merge_probeack.bufIdx := 0.U
+    mp_merge_probeack.needProbeAckData := false.B
+    mp_merge_probeack.pbIdx := 0.U
+    mp_merge_probeack.fromL2pft.foreach(_ := false.B)
+    mp_merge_probeack.needHint.foreach(_ := false.B)
+    mp_merge_probeack.wayMask := Fill(cacheParams.ways, "b1".U)
+    mp_merge_probeack.replTask := true.B
+    mp_merge_probeack.reqSource := MemReqSource.NoWhere.id.U
+  }
+
   val mp_grant_task = {
     mp_grant.channel := req.channel
     mp_grant.tag := req.tag
     mp_grant.set := req.set
     mp_grant.off := req.off
+    mp_grant.sourceId := req.sourceId
     mp_grant.alias.foreach(_ := 0.U)
-    mp_grant.sourceId := req.source
     mp_grant.opcode := odOpGen(req.opcode)
     mp_grant.param := Mux(
       req_get || req_put || req_prefetch,
@@ -295,8 +342,8 @@ class MSHR(implicit p: Parameters) extends L2Module {
     mp_grant.needProbeAckData := false.B
     mp_grant.mshrTask := true.B
     mp_grant.mshrId := io.id
+    mp_grant.way := dirResult.way
     mp_grant.aliasTask.foreach(_ := false.B)
-    mp_grant.way := req.way
     // if it is a Get or Prefetch, then we must keep alias bits unchanged
     // in case future probes gets the wrong alias bits
     val aliasFinal = Mux(req_get || req_prefetch, meta.alias.getOrElse(0.U), req.alias.getOrElse(0.U))
@@ -337,6 +384,7 @@ class MSHR(implicit p: Parameters) extends L2Module {
     mp_grant.dsWen := !dirResult.hit && !req_put && gotGrantData || probeDirty && (req_get || req.aliasTask.getOrElse(false.B))
     mp_grant.fromL2pft.foreach(_ := req.fromL2pft.get)
     mp_grant.needHint.foreach(_ := false.B)
+    mp_grant.replTask := !dirResult.hit // Get and Alias reqs are hits, so they need no replacement
     mp_grant.wayMask := 0.U(cacheParams.ways.W)
     mp_grant.reqSource := 0.U(MemReqSource.reqSourceBits.W)
     mp_grant
@@ -345,7 +393,8 @@ class MSHR(implicit p: Parameters) extends L2Module {
     Seq(
       mp_grant_valid -> mp_grant,
       mp_release_valid -> mp_release,
-      mp_probeack_valid -> mp_probeack
+      mp_probeack_valid -> mp_probeack,
+      mp_merge_probeack_valid -> mp_merge_probeack
     )
   )
   io.tasks.mainpipe.bits.reqSource := req.reqSource
@@ -367,7 +416,9 @@ class MSHR(implicit p: Parameters) extends L2Module {
     state.s_rprobe := true.B
   }
   when (io.tasks.mainpipe.ready) {
-    when (mp_grant_valid) {
+    when (mp_merge_probeack_valid) {
+      state.s_merge_probeack := true.B
+    }.elsewhen (mp_grant_valid) {
       state.s_refill := true.B
     }.elsewhen (mp_release_valid) {
       state.s_release := true.B
@@ -383,7 +434,7 @@ class MSHR(implicit p: Parameters) extends L2Module {
   //   }
   // }
 
-  /* ======== Refill response ======== */
+  /* ======== Handling response ======== */
   val c_resp = io.resps.sink_c
   val d_resp = io.resps.sink_d
   val e_resp = io.resps.sink_e
@@ -425,27 +476,62 @@ class MSHR(implicit p: Parameters) extends L2Module {
     state.w_grantack := true.B
   }
 
-  when (io.resps.source_c.valid) {
-    state.w_release_sent := true.B
+  val replResp = io.replResp.bits
+  when (io.replResp.valid && replResp.retry) {
+    state.s_refill := false.B
+  }
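
Sketch (illustrative names, not the actual MSHR code) of the retry loop implemented by the two `when` blocks around here: a retried `replResp` clears `s_refill`, so the Grant task is re-issued through MainPipe and the replacer is read again until a usable way comes back:

```scala
import chisel3._

class RefillRetrySketch extends Module {
  val io = IO(new Bundle {
    val allocate  = Input(Bool())  // MSHR allocated for an A-channel miss
    val taskFire  = Input(Bool())  // MainPipe accepted the refill task
    val replValid = Input(Bool())  // replacer result returned
    val replRetry = Input(Bool())  // chosen way unusable this time
    val taskValid = Output(Bool()) // refill task requests the pipe
  })
  val s_refill = RegInit(true.B)                        // low-as-valid, as in FSMState
  when (io.allocate)                  { s_refill := false.B }
  when (io.taskFire)                  { s_refill := true.B }
  when (io.replValid && io.replRetry) { s_refill := false.B } // replay the Grant
  io.taskValid := !s_refill
}
```
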
+  when (io.replResp.valid && !replResp.retry) {
+    state.w_replResp := true.B
+
+    // update meta (no need to update hit/set/error/replacerInfo of dirResult)
+    dirResult.tag := replResp.tag
+    dirResult.way := replResp.way
+    dirResult.meta := replResp.meta
+
+    // the replacer may choose:
+    // 1. an invalid way: the release is no longer needed
+    // 2. the same way: just release as normal (only now do we set s_release)
+    // 3. a different way: we need to update meta and release that way;
+    //    if meta has clients, rprobe the clients
+    when (replResp.meta.state =/= INVALID) {
+      // set release flags
+      state.s_release := false.B
+      state.w_releaseack := false.B
+      // rprobe clients if any
+      when(replResp.meta.clients.orR) {
+        state.s_rprobe := false.B
+        state.w_rprobeackfirst := false.B
+        state.w_rprobeacklast := false.B
+      }
+    }
   }
 
-  when (status_reg.valid) {
+  when (req_valid) {
     timer := timer + 1.U
   }
 
-  val no_schedule = state.s_refill && state.s_probeack// && state.s_triggerprefetch.getOrElse(true.B)
-  val no_wait = state.w_rprobeacklast && state.w_pprobeacklast && state.w_grantlast && state.w_releaseack && state.w_grantack
+  val no_schedule = state.s_refill && state.s_probeack && state.s_merge_probeack && state.s_release // && state.s_triggerprefetch.getOrElse(true.B)
+  val no_wait = state.w_rprobeacklast && state.w_pprobeacklast && state.w_grantlast && state.w_releaseack && state.w_grantack && state.w_replResp
   val will_free = no_schedule && no_wait
-  when (will_free && status_reg.valid) {
-    status_reg.valid := false.B
+  when (will_free && req_valid) {
+    req_valid := false.B
     timer := 0.U
   }
 
-  io.status.valid := status_reg.valid
-  io.status.bits <> status_reg.bits
-  // For A reqs, we only concern about the tag to be replaced
-  io.status.bits.tag := Mux(state.w_release_sent, req.tag, dirResult.tag) // s_release is low-as-valid
-  io.status.bits.nestB := status_reg.valid && state.w_releaseack && state.w_rprobeacklast && state.w_pprobeacklast && !state.w_grantfirst
+  // while the grant has not been received, B can nest A
+  val nestB = !state.w_grantfirst
+
+  // mergeB is only allowed while the release has not been sent
+  // (TODO: or we could just block B, since the Release will be sent to MP very shortly and poses no deadlock problem)
+  val mergeB = !state.s_release
+  // alias: should protect meta from being accessed or occupied
+  val releaseNotSent = !state.s_release || !state.s_merge_probeack || io.bMergeTask.valid
+
+  io.status.valid := req_valid
+  io.status.bits.channel := req.channel
+  io.status.bits.set := req.set
+  io.status.bits.reqTag := req.tag
+  io.status.bits.metaTag := dirResult.tag
+  io.status.bits.needsRepl := releaseNotSent
   // wait for resps, high as valid
   io.status.bits.w_c_resp := !state.w_rprobeacklast || !state.w_pprobeacklast || !state.w_pprobeack
   io.status.bits.w_d_resp := !state.w_grantlast || !state.w_grant || !state.w_releaseack
@@ -453,38 +539,52 @@ class MSHR(implicit p: Parameters) extends L2Module {
   io.status.bits.will_free := will_free
   io.status.bits.is_miss := !dirResult.hit
   io.status.bits.is_prefetch := req_prefetch
-
-  io.toReqBuf.valid := status_reg.valid
-  io.toReqBuf.bits.set := req.set
-  io.toReqBuf.bits.way := req.way
-  io.toReqBuf.bits.reqTag := req.tag
-  io.toReqBuf.bits.needRelease := !state.w_releaseack
-  io.toReqBuf.bits.metaTag := dirResult.tag
-  io.toReqBuf.bits.willFree := will_free
-  io.toReqBuf.bits.isAcqOrPrefetch := req_acquire || req_prefetch
+  io.status.bits.reqSource := req.reqSource
+
+  io.msInfo.valid := req_valid
+  io.msInfo.bits.set := req.set
+  io.msInfo.bits.way := dirResult.way
+  io.msInfo.bits.reqTag := req.tag
+  io.msInfo.bits.needRelease := !state.w_releaseack
+  io.msInfo.bits.releaseNotSent := releaseNotSent
+  io.msInfo.bits.dirHit := dirResult.hit
+  io.msInfo.bits.metaTag := dirResult.tag
+  io.msInfo.bits.willFree := will_free
+  io.msInfo.bits.nestB := nestB
+  io.msInfo.bits.mergeB := mergeB
+  io.msInfo.bits.isAcqOrPrefetch := req_acquire || req_prefetch
 
   assert(!(c_resp.valid && !io.status.bits.w_c_resp))
   assert(!(d_resp.valid && !io.status.bits.w_d_resp))
   assert(!(e_resp.valid && !io.status.bits.w_e_resp))
 
-  val nestedwb_match = status_reg.valid && meta.state =/= INVALID &&
+  /* ======== Handling Nested B ======== */
+  when (io.bMergeTask.valid) {
+    state.s_merge_probeack := false.B
+    state.s_release := true.B
+    state.w_releaseack := true.B
+    when (meta.clients.orR) {
+      state.s_rprobe := false.B
+      state.w_rprobeackfirst := false.B
+      state.w_rprobeacklast := false.B
+    }
+  }
+
+  /* ======== Handling Nested C ======== */
+  // for an A miss, a way is finally chosen only when replResp arrives, which then allows nested C
+  // for A-alias, nested C is always allowed (state.w_replResp === true.B)
+  val nestedwb_match = req_valid && meta.state =/= INVALID &&
     dirResult.set === io.nestedwb.set &&
-    dirResult.tag === io.nestedwb.tag
+    dirResult.tag === io.nestedwb.tag &&
+    state.w_replResp
+
   when (nestedwb_match) {
-    when (io.nestedwb.b_toN) {
-      dirResult.hit := false.B
-    }
-    when (io.nestedwb.b_toB) {
-      meta.state := BRANCH
-    }
-    when (io.nestedwb.b_clr_dirty) {
-      meta.dirty := false.B
-    }
     when (io.nestedwb.c_set_dirty) {
       meta.dirty := true.B
    }
  }
-
+  // let nested C write its ReleaseData to the MSHRBuffer entry of this MSHR id
+  // this is the VALID signal for releaseBuf.io.w(2)
   io.nestedwbData := nestedwb_match && io.nestedwb.c_set_dirty
 
   dontTouch(state)
diff --git a/src/main/scala/coupledL2/MSHRBuffer.scala b/src/main/scala/coupledL2/MSHRBuffer.scala
index 808789a5..76949c79 100644
--- a/src/main/scala/coupledL2/MSHRBuffer.scala
+++ b/src/main/scala/coupledL2/MSHRBuffer.scala
@@ -78,7 +78,7 @@ class MSHRBuffer(wPorts: Int = 1)(implicit p: Parameters) extends L2Module {
   buffer.zipWithIndex.foreach {
     case (block, i) =>
       val wens = VecInit(io.w.map(w => w.valid && w.id === i.U)).asUInt
-      assert(PopCount(wens) <= 1.U, "multiple write to the same MSHR buffer entry")
+      assert(PopCount(wens) <= 2.U, "triple write to the same MSHR buffer entry")
 
       val w_beat_sel = PriorityMux(wens, io.w.map(_.beat_sel))
       val w_data = PriorityMux(wens, io.w.map(_.data))
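
Sketch (simplified to one data word per entry, not the actual MSHRBuffer) of the write-port merge that the relaxed assert above now permits: up to two ports may target one entry in a cycle, and the lower-indexed port wins via `PriorityMux`:

```scala
import chisel3._
import chisel3.util._

class MultiWriteBufSketch(nEntries: Int, nPorts: Int, w: Int) extends Module {
  val io = IO(new Bundle {
    val wen   = Input(Vec(nPorts, Bool()))
    val wid   = Input(Vec(nPorts, UInt(log2Up(nEntries).W)))
    val wdata = Input(Vec(nPorts, UInt(w.W)))
  })
  val buffer = Reg(Vec(nEntries, UInt(w.W)))
  for (i <- 0 until nEntries) {
    val wens = VecInit(io.wen.zip(io.wid).map { case (en, id) => en && id === i.U })
    assert(PopCount(wens.asUInt) <= 2.U, "more than two writers to one entry")
    when (wens.asUInt.orR) {
      buffer(i) := PriorityMux(wens, io.wdata) // lower port index takes priority
    }
  }
}
```
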
diff --git a/src/main/scala/coupledL2/MSHRCtl.scala b/src/main/scala/coupledL2/MSHRCtl.scala
index 6721ccad..6cd2bd6e 100644
--- a/src/main/scala/coupledL2/MSHRCtl.scala
+++ b/src/main/scala/coupledL2/MSHRCtl.scala
@@ -84,8 +84,14 @@ class MSHRCtl(implicit p: Parameters) extends L2Module {
     /* status of s2 and s3 */
     val pipeStatusVec = Flipped(Vec(2, ValidIO(new PipeStatus)))
 
-    /* to ReqBuffer, to solve conflict */
-    val toReqBuf = Vec(mshrsAll, ValidIO(new MSHRBlockAInfo))
+    /* MSHR info to Sinks */
+    /* to ReqBuffer, to calculate conflicts */
+    /* to SinkB, to merge nested B reqs */
+    val msInfo = Vec(mshrsAll, ValidIO(new MSHRInfo))
+    val bMergeTask = Flipped(ValidIO(new BMergeTask))
+
+    /* replacer result of the refill read */
+    val replResp = Flipped(ValidIO(new ReplacerResult))
 
     /* for TopDown Monitor */
     val msStatus = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRStatus)))
@@ -103,11 +109,11 @@ class MSHRCtl(implicit p: Parameters) extends L2Module {
   val selectedMSHROH = mshrSelector.io.out.bits
   io.toMainPipe.mshr_alloc_ptr := OHToUInt(selectedMSHROH)
 
-  val resp_sinkC_match_vec = mshrs.map(mshr =>
-    mshr.io.status.valid && mshr.io.status.bits.w_c_resp &&
-    io.resps.sinkC.set === mshr.io.status.bits.set &&
-    io.resps.sinkC.tag === mshr.io.status.bits.tag
-  )
+  val resp_sinkC_match_vec = mshrs.map { mshr =>
+    val status = mshr.io.status.bits
+    val tag = Mux(status.needsRepl, status.metaTag, status.reqTag)
+    mshr.io.status.valid && status.w_c_resp &&
+    io.resps.sinkC.set === status.set &&
+    io.resps.sinkC.tag === tag
+  }
 
   mshrs.zipWithIndex.foreach {
     case (m, i) =>
@@ -123,17 +129,19 @@ class MSHRCtl(implicit p: Parameters) extends L2Module {
       m.io.resps.sink_e.bits := io.resps.sinkE.respInfo
       m.io.resps.source_c.valid := m.io.status.valid && io.resps.sourceC.valid && io.resps.sourceC.mshrId === i.U
       m.io.resps.source_c.bits := io.resps.sourceC.respInfo
-
-      m.io.nestedwb := io.nestedwb
+      m.io.replResp.valid := io.replResp.valid && io.replResp.bits.mshrId === i.U
+      m.io.replResp.bits := io.replResp.bits
 
-      io.toReqBuf(i) := m.io.toReqBuf
+      io.msInfo(i) := m.io.msInfo
+      m.io.nestedwb := io.nestedwb
+      m.io.bMergeTask.valid := io.bMergeTask.valid && io.bMergeTask.bits.id === i.U
+      m.io.bMergeTask.bits := io.bMergeTask.bits
   }
 
-  val setMatchVec_b = mshrs.map(m => m.io.status.valid && m.io.status.bits.set === io.fromReqArb.status_s1.b_set)
-  val setConflictVec_b = (setMatchVec_b zip mshrs.map(_.io.status.bits.nestB)).map(x => x._1 && !x._2)
   io.toReqArb.blockC_s1 := false.B
-  io.toReqArb.blockB_s1 := mshrFull || Cat(setConflictVec_b).orR
-  io.toReqArb.blockA_s1 := a_mshrFull // conflict logic moved to ReqBuf
+  io.toReqArb.blockB_s1 := mshrFull // conflict logic in SinkB
+  io.toReqArb.blockA_s1 := a_mshrFull // conflict logic in ReqBuf
+  io.toReqArb.blockG_s1 := false.B
 
   /* Acquire downwards */
   val acquireUnit = Module(new AcquireUnit())
@@ -163,6 +171,7 @@ class MSHRCtl(implicit p: Parameters) extends L2Module {
   io.nestedwbDataId.bits := ParallelPriorityMux(mshrs.zipWithIndex.map {
     case (mshr, i) => (mshr.io.nestedwbData, i.U)
   })
+  assert(RegNext(PopCount(mshrs.map(_.io.nestedwbData)) <= 1.U), "should only be one nestedwbData")
 
   dontTouch(io.sourceA)
@@ -174,8 +183,6 @@ class MSHRCtl(implicit p: Parameters) extends L2Module {
   // Performance counters
   XSPerfAccumulate(cacheParams, "capacity_conflict_to_sinkA", a_mshrFull)
   XSPerfAccumulate(cacheParams, "capacity_conflict_to_sinkB", mshrFull)
-  // XSPerfAccumulate(cacheParams, "set_conflict_to_sinkA", Cat(setMatchVec_a).orR) //TODO: move this to ReqBuf
-  XSPerfAccumulate(cacheParams, "set_conflict_to_sinkB", Cat(setConflictVec_b).orR)
   XSPerfHistogram(cacheParams, "mshr_alloc", io.toMainPipe.mshr_alloc_ptr,
     enable = io.fromMainPipe.mshr_alloc_s3.valid,
     start = 0, stop = mshrsAll, step = 1)
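
Sketch (standalone and illustrative) of the fan-out pattern MSHRCtl uses above for `replResp` and `bMergeTask`: one incoming valid+id pair is dispatched to exactly the matching MSHR slot:

```scala
import chisel3._
import chisel3.util._

class RespDispatchSketch(nMshrs: Int, w: Int) extends Module {
  val io = IO(new Bundle {
    val respValid = Input(Bool())
    val respId    = Input(UInt(log2Up(nMshrs).W))
    val respData  = Input(UInt(w.W))
    val toMshr    = Output(Vec(nMshrs, Valid(UInt(w.W))))
  })
  for (i <- 0 until nMshrs) {
    io.toMshr(i).valid := io.respValid && io.respId === i.U
    io.toMshr(i).bits  := io.respData
  }
  // by construction, at most one slot sees valid in any cycle
  assert(PopCount(io.toMshr.map(_.valid)) <= 1.U)
}
```
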
diff --git a/src/main/scala/coupledL2/MainPipe.scala b/src/main/scala/coupledL2/MainPipe.scala
index c351447f..46d56e5e 100644
--- a/src/main/scala/coupledL2/MainPipe.scala
+++ b/src/main/scala/coupledL2/MainPipe.scala
@@ -51,6 +51,7 @@ class MainPipe(implicit p: Parameters) extends L2Module {
 
     /* get dir result at stage 3 */
     val dirResp_s3 = Input(new DirResult)
+    val replResp = Flipped(ValidIO(new ReplacerResult))
 
     /* send task to MSHRCtl at stage 3 */
     val toMSHRCtl = new Bundle() {
@@ -152,7 +153,7 @@ class MainPipe(implicit p: Parameters) extends L2Module {
   val mshr_grant_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode(2, 1) === Grant(2, 1) // Grant or GrantData from mshr
   val mshr_grantdata_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === GrantData
   val mshr_accessackdata_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === AccessAckData
-  val mshr_accessack_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === AccessAck
+  val mshr_accessack_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === AccessAck // response for Put, unused
   val mshr_hintack_s3 = mshr_req_s3 && req_s3.fromA && req_s3.opcode === HintAck
   val mshr_probeack_s3 = mshr_req_s3 && req_s3.fromB && req_s3.opcode(2, 1) === ProbeAck(2, 1) // ProbeAck or ProbeAckData from mshr
   val mshr_probeackdata_s3 = mshr_req_s3 && req_s3.fromB && req_s3.opcode === ProbeAckData
@@ -160,12 +161,16 @@ class MainPipe(implicit p: Parameters) extends L2Module {
   val meta_has_clients_s3 = meta_s3.clients.orR
   val req_needT_s3 = needT(req_s3.opcode, req_s3.param) // require T status to handle req
 
-  val a_need_replacement = sinkA_req_s3 && !dirResult_s3.hit && meta_s3.state =/= INVALID // b and c do not need replacement
+//  val a_need_replacement = sinkA_req_s3 && !dirResult_s3.hit && meta_s3.state =/= INVALID // b and c do not need replacement
 
   //[Alias] TODO: consider 1 client for now
   val cache_alias = req_acquire_s3 && dirResult_s3.hit && meta_s3.clients(0) &&
     meta_s3.alias.getOrElse(0.U) =/= req_s3.alias.getOrElse(0.U)
 
+  val mshr_refill_s3 = (mshr_accessackdata_s3 || mshr_hintack_s3 || mshr_grant_s3) // needs refill to L2 DS
+  val retry = io.replResp.valid && io.replResp.bits.retry
+  val need_repl = io.replResp.valid && io.replResp.bits.meta.state =/= INVALID && req_s3.replTask // Grant needs replacement
+
   /* ======== Interact with MSHR ======== */
   val acquire_on_miss_s3 = req_acquire_s3 || req_prefetch_s3 || req_get_s3 // TODO: remove this cause always acquire on miss?
   val acquire_on_hit_s3 = meta_s3.state === BRANCH && req_needT_s3
@@ -214,47 +219,30 @@ class MainPipe(implicit p: Parameters) extends L2Module {
     ms_task.fromL2pft.foreach(_ := req_s3.fromL2pft.get)
     ms_task.needHint.foreach(_ := req_s3.needHint.get)
     ms_task.dirty := false.B
-    ms_task.way := dirResult_s3.way
+    ms_task.way := req_s3.way
     ms_task.meta := 0.U.asTypeOf(new MetaEntry)
     ms_task.metaWen := false.B
     ms_task.tagWen := false.B
     ms_task.dsWen := false.B
     ms_task.wayMask := 0.U(cacheParams.ways.W)
+    ms_task.replTask := false.B
     ms_task.reqSource := req_s3.reqSource
 
   /* ======== Resps to SinkA/B/C Reqs ======== */
   val sink_resp_s3 = WireInit(0.U.asTypeOf(Valid(new TaskBundle))) // resp for sinkA/B/C request that does not need to alloc mshr
-  val mainpipe_release = a_need_replacement && !meta_has_clients_s3
   val sink_resp_s3_a_promoteT = dirResult_s3.hit && isT(meta_s3.state)
 
-  sink_resp_s3.valid := task_s3.valid && !mshr_req_s3 && (!need_mshr_s3 || mainpipe_release)
+  sink_resp_s3.valid := task_s3.valid && !mshr_req_s3 && !need_mshr_s3
   sink_resp_s3.bits := task_s3.bits
   sink_resp_s3.bits.mshrId := (1 << (mshrBits-1)).U + sink_resp_s3.bits.sourceId // extra id for reqs that do not enter mshr
 
   when(req_s3.fromA) {
-    when(mainpipe_release){ // replacement-Release for A-miss
-      sink_resp_s3.bits.opcode := {
-        cacheParams.releaseData match {
-          case 0 => Mux(meta_s3.dirty, ReleaseData, Release)
-          case 1 => Mux(meta_s3.dirty && meta_s3.accessed, ReleaseData, Release)
-          case 2 => ReleaseData
-          case 3 => ReleaseData
-        }
-      }
-      sink_resp_s3.bits.param := Mux(isT(meta_s3.state), TtoN, BtoN)
-      // sink_resp_s3.bits.mshrId is changed to mshr_alloc_ptr at stage 4
-      // so source of C-Release is correct
-      sink_resp_s3.bits.tag := dirResult_s3.tag
-      sink_resp_s3.bits.dirty := meta_s3.dirty
-
-    }.otherwise { // Grant for A-hit
-      sink_resp_s3.bits.opcode := odOpGen(req_s3.opcode)
-      sink_resp_s3.bits.param := Mux(
-        req_acquire_s3,
-        Mux(req_s3.param === NtoB && !sink_resp_s3_a_promoteT, toB, toT),
-        0.U // reserved
-      )
-    }
+    sink_resp_s3.bits.opcode := odOpGen(req_s3.opcode)
+    sink_resp_s3.bits.param := Mux(
+      req_acquire_s3,
+      Mux(req_s3.param === NtoB && !sink_resp_s3_a_promoteT, toB, toT),
+      0.U // reserved
+    )
   }.elsewhen(req_s3.fromB) {
     sink_resp_s3.bits.opcode := Mux(
       dirResult_s3.hit && (meta_s3.state === TIP && meta_s3.dirty || req_s3.needProbeAckData),
@@ -278,27 +266,31 @@ class MainPipe(implicit p: Parameters) extends L2Module {
   source_req_s3 := Mux(sink_resp_s3.valid, sink_resp_s3.bits, req_s3)
 
   /* ======== Interact with DS ======== */
-  val data_s3 = Mux(io.refillBufResp_s3.valid, io.refillBufResp_s3.bits.data, io.releaseBufResp_s3.bits.data)
+  val data_s3 = Mux(io.releaseBufResp_s3.valid, io.releaseBufResp_s3.bits.data, io.refillBufResp_s3.bits.data) // releaseBuf takes priority
+  val c_releaseData_s3 = RegNext(io.bufResp.data.asUInt)
   val hasData_s3 = source_req_s3.opcode(0)
 
-  val wen_c = sinkC_req_s3 && isParamFromT(req_s3.param) && req_s3.opcode(0)
-  val wen = wen_c || req_s3.dsWen && (mshr_grant_s3 || mshr_accessackdata_s3 || mshr_probeack_s3 || mshr_hintack_s3)
+  val need_data_a = dirResult_s3.hit && (req_get_s3 || req_acquireBlock_s3)
+  val need_data_b = sinkB_req_s3 && dirResult_s3.hit &&
+    (meta_s3.state === TRUNK || meta_s3.state === TIP && meta_s3.dirty || req_s3.needProbeAckData)
+  val need_data_mshr_repl = mshr_refill_s3 && need_repl && !retry
+  val ren = need_data_a || need_data_b || need_data_mshr_repl
 
-  val need_data_on_hit_a = req_get_s3 || req_acquireBlock_s3
-  val need_data_on_miss_a = a_need_replacement // read data ahead of time to prepare for ReleaseData later
-  val need_data_b = sinkB_req_s3 && dirResult_s3.hit &&
-    (meta_s3.state === TRUNK || meta_s3.state === TIP && meta_s3.dirty || req_s3.needProbeAckData)
-  val ren = Mux(dirResult_s3.hit, need_data_on_hit_a, need_data_on_miss_a) || need_data_b
-  val bufResp_s3 = RegNext(io.bufResp.data.asUInt) // for Release from C-channel
+  val wen_c = sinkC_req_s3 && isParamFromT(req_s3.param) && req_s3.opcode(0) && dirResult_s3.hit
+  val wen_mshr = req_s3.dsWen && (
+    mshr_probeack_s3 || mshr_release_s3 ||
+    mshr_refill_s3 && !need_repl && !retry
+  )
+  val wen = wen_c || wen_mshr
 
   io.toDS.req_s3.valid := task_s3.valid && (ren || wen)
-  io.toDS.req_s3.bits.way := Mux(mshr_req_s3, req_s3.way, dirResult_s3.way)
+  io.toDS.req_s3.bits.way := Mux(mshr_refill_s3 && req_s3.replTask, io.replResp.bits.way,
+    Mux(mshr_req_s3, req_s3.way, dirResult_s3.way))
   io.toDS.req_s3.bits.set := Mux(mshr_req_s3, req_s3.set, dirResult_s3.set)
   io.toDS.req_s3.bits.wen := wen
-  //[Alias] TODO: may change this according to four || signals of wen, use ParallelPriorityMux
   io.toDS.wdata_s3.data := Mux(
     !mshr_req_s3,
-    bufResp_s3,
+    c_releaseData_s3, // among all sinkTasks, only C-Release writes DS
     Mux(
       req_s3.useProbeData,
       io.releaseBufResp_s3.bits.data,
@@ -312,19 +304,19 @@ class MainPipe(implicit p: Parameters) extends L2Module {
 
   // inner clients' data is needed, but whether the client will ack data is uncertain, so DS data is also needed, or
   val need_write_releaseBuf = need_probe_s3_a || cache_alias ||
-    a_need_replacement && meta_has_clients_s3 ||
-    need_data_b && need_mshr_s3_b
+    need_data_b && need_mshr_s3_b ||
+    need_data_mshr_repl
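
Sketch (illustrative module; signal names mirror MainPipe but are simplified) of the DS write-data selection this hunk settles on: C-channel Release data for sink tasks, and releaseBuf versus refillBuf for MSHR tasks, steered by `useProbeData`:

```scala
import chisel3._

class DsWdataSelSketch(w: Int) extends Module {
  val io = IO(new Bundle {
    val mshrTask       = Input(Bool())
    val useProbeData   = Input(Bool())
    val cReleaseData   = Input(UInt(w.W))  // data that came with a C Release
    val releaseBufData = Input(UInt(w.W))  // probe data parked in releaseBuf
    val refillBufData  = Input(UInt(w.W))  // grant data parked in refillBuf
    val wdata          = Output(UInt(w.W))
  })
  io.wdata := Mux(!io.mshrTask,
    io.cReleaseData, // only C-Release writes DS among sink tasks
    Mux(io.useProbeData, io.releaseBufData, io.refillBufData))
}
```
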
 
   // B: need_write_refillBuf when L1 AcquireBlock BtoT
   // L2 sends AcquirePerm to L3, so GrantData to L1 needs to read DS ahead of time and store in RefillBuffer
   // TODO: how about AcquirePerm BtoT interaction with refill buffer?
+  // !!TODO June 22nd: this is no longer useful, because we only send AcquirePerm when L1 sends AcquirePerm (see MSHR)
   val need_write_refillBuf = sinkA_req_s3 && req_needT_s3 && dirResult_s3.hit && meta_s3.state === BRANCH && !req_put_s3 && !req_prefetch_s3
 
   /* ======== Write Directory ======== */
   val metaW_valid_s3_a = sinkA_req_s3 && !need_mshr_s3_a && !req_get_s3 && !req_prefetch_s3 // get & prefetch that hit will not write meta
   val metaW_valid_s3_b = sinkB_req_s3 && !need_mshr_s3_b && dirResult_s3.hit && (meta_s3.state === TIP || meta_s3.state === BRANCH && req_s3.param === toN)
-  val metaW_valid_s3_c = sinkC_req_s3
-  val metaW_valid_s3_repl = mainpipe_release
-  val metaW_valid_s3_mshr = mshr_req_s3 && req_s3.metaWen
+  val metaW_valid_s3_c = sinkC_req_s3 && dirResult_s3.hit
+  val metaW_valid_s3_mshr = mshr_req_s3 && req_s3.metaWen && !(mshr_refill_s3 && retry)
   require(clientBits == 1)
 
   // Get and Prefetch should not change alias bit
@@ -350,44 +342,48 @@ class MainPipe(implicit p: Parameters) extends L2Module {
     meta_s3.alias,
     accessed = meta_s3.accessed
   )
-  val metaW_s3_repl = MetaEntry()
   val metaW_s3_mshr = req_s3.meta
 
-  io.metaWReq.valid := !resetFinish || task_s3.valid && (metaW_valid_s3_a || metaW_valid_s3_b || metaW_valid_s3_c || metaW_valid_s3_mshr || metaW_valid_s3_repl)
+  val metaW_way = Mux(mshr_refill_s3 && req_s3.replTask, io.replResp.bits.way, // grant always uses the replResp way
+    Mux(mshr_req_s3, req_s3.way, dirResult_s3.way))
+
+  io.metaWReq.valid := !resetFinish || task_s3.valid && (metaW_valid_s3_a || metaW_valid_s3_b || metaW_valid_s3_c || metaW_valid_s3_mshr)
   io.metaWReq.bits.set := Mux(resetFinish, req_s3.set, resetIdx)
-  io.metaWReq.bits.wayOH := Mux(resetFinish, UIntToOH(Mux(mshr_req_s3, req_s3.way, dirResult_s3.way)), Fill(cacheParams.ways, true.B))
+  io.metaWReq.bits.wayOH := Mux(resetFinish, UIntToOH(metaW_way), Fill(cacheParams.ways, true.B))
   io.metaWReq.bits.wmeta := Mux(
     resetFinish,
     ParallelPriorityMux(
-      Seq(metaW_valid_s3_a, metaW_valid_s3_b, metaW_valid_s3_c, metaW_valid_s3_repl, metaW_valid_s3_mshr),
-      Seq(metaW_s3_a, metaW_s3_b, metaW_s3_c, metaW_s3_repl, metaW_s3_mshr)
+      Seq(metaW_valid_s3_a, metaW_valid_s3_b, metaW_valid_s3_c, metaW_valid_s3_mshr),
+      Seq(metaW_s3_a, metaW_s3_b, metaW_s3_c, metaW_s3_mshr)
     ),
     MetaEntry()
   )
 
-  io.tagWReq.valid := task_s3.valid && (mshr_grant_s3 || mshr_accessack_s3 || mshr_accessackdata_s3 || mshr_hintack_s3) && req_s3.tagWen
+  io.tagWReq.valid := task_s3.valid && req_s3.tagWen && mshr_refill_s3 && !retry
   io.tagWReq.bits.set := req_s3.set
-  io.tagWReq.bits.way := req_s3.way
+  io.tagWReq.bits.way := Mux(mshr_refill_s3 && req_s3.replTask, io.replResp.bits.way, req_s3.way)
   io.tagWReq.bits.wtag := req_s3.tag
 
   /* ======== Interact with Channels (C & D) ======== */
-  val task_ready_s3 = !hasData_s3 || req_s3.fromC || (need_mshr_s3 && !a_need_replacement) || mshr_req_s3 // do not need s4 & s5
-  val req_drop_s3 = (!mshr_req_s3 && need_mshr_s3 && !need_write_releaseBuf && !need_write_refillBuf && !mainpipe_release) ||
-    (task_ready_s3 && (c_s3.fire || d_s3.fire))
+  val chnl_fire_s3 = c_s3.fire || d_s3.fire
+  val req_drop_s3 = !need_write_releaseBuf && !need_write_refillBuf && (
+    !mshr_req_s3 && need_mshr_s3 || chnl_fire_s3
+  ) || (mshr_refill_s3 && retry)
 
-  //[Alias] TODO: may change this to ren?
   val data_unready_s3 = hasData_s3 && !mshr_req_s3
-  c_s3.valid := task_s3.valid && Mux(
+  val isC_s3 = Mux(
     mshr_req_s3,
     mshr_release_s3 || mshr_probeack_s3,
     req_s3.fromB && !need_mshr_s3 && !data_unready_s3
   )
-  d_s3.valid := task_s3.valid && Mux(
+  val isD_s3 = Mux(
     mshr_req_s3,
-    mshr_grant_s3 || mshr_accessackdata_s3 || mshr_accessack_s3 || mshr_hintack_s3,
+    mshr_refill_s3 && !retry,
     req_s3.fromC || req_s3.fromA && !need_mshr_s3 && !data_unready_s3
   )
+  c_s3.valid := task_s3.valid && isC_s3
+  d_s3.valid := task_s3.valid && isD_s3
   c_s3.bits.task := source_req_s3
   c_s3.bits.data.data := data_s3
   d_s3.bits.task := source_req_s3
@@ -396,13 +392,11 @@ class MainPipe(implicit p: Parameters) extends L2Module {
 
   /* ======== nested & prefetch ======== */
   io.nestedwb.set := req_s3.set
   io.nestedwb.tag := req_s3.tag
-  io.nestedwb.b_toN := task_s3.valid && metaW_valid_s3_b && req_s3.param === toN
-  io.nestedwb.b_toB := task_s3.valid && metaW_valid_s3_b && req_s3.param === toB // assume L3 won't send Probe toT
-  io.nestedwb.b_clr_dirty := task_s3.valid && metaW_valid_s3_b && meta_s3.dirty
+  // this serves as the VALID signal
   // c_set_dirty is true iff Release has Data
-  io.nestedwb.c_set_dirty := task_s3.valid && metaW_valid_s3_c && wen_c
+  io.nestedwb.c_set_dirty := task_s3.valid && task_s3.bits.fromC && task_s3.bits.opcode === ReleaseData
 
-  io.nestedwbData := bufResp_s3.asTypeOf(new DSBlock)
+  io.nestedwbData := c_releaseData_s3.asTypeOf(new DSBlock)
 
   io.prefetchTrain.foreach {
     train =>
@@ -421,6 +415,7 @@ class MainPipe(implicit p: Parameters) extends L2Module {
   val ren_s4 = RegInit(false.B)
   val need_write_releaseBuf_s4 = RegInit(false.B)
   val need_write_refillBuf_s4 = RegInit(false.B)
+  val isC_s4, isD_s4 = RegInit(false.B)
   task_s4.valid := task_s3.valid && !req_drop_s3
   when (task_s3.valid && !req_drop_s3) {
     task_s4.bits := source_req_s3
@@ -430,18 +425,26 @@ class MainPipe(implicit p: Parameters) extends L2Module {
     ren_s4 := ren
     need_write_releaseBuf_s4 := need_write_releaseBuf
     need_write_refillBuf_s4 := need_write_refillBuf
+    isC_s4 := isC_s3
+    isD_s4 := isD_s3
   }
-  val isC_s4 = task_s4.bits.opcode(2, 1) === Release(2, 1) && task_s4.bits.fromA ||
-    task_s4.bits.opcode(2, 1) === ProbeAck(2, 1) && task_s4.bits.fromB
-  val isD_s4 = task_s4.bits.fromC || task_s4.bits.fromA && (
-    task_s4.bits.opcode(2, 1) === Grant(2, 1) ||
-    task_s4.bits.opcode(2, 1) === AccessAck(2, 1) ||
-    task_s4.bits.opcode === HintAck)
+  // A-alias-Acquire should send neither C nor D
+//  val isC_s4 = task_s4.bits.opcode(2, 1) === Release(2, 1) && task_s4.bits.fromA && !RegNext(cache_alias, false.B) ||
+//    task_s4.bits.opcode(2, 1) === ProbeAck(2, 1) && task_s4.bits.fromB
+//  val isD_s4 = task_s4.bits.fromC || task_s4.bits.fromA && (
+//    task_s4.bits.opcode(2, 1) === Grant(2, 1) ||
+//    task_s4.bits.opcode(2, 1) === AccessAck(2, 1) ||
+//    task_s4.bits.opcode === HintAck)
+
+  // for reqs that CANNOT get their response in MainPipe but need to write releaseBuf/refillBuf,
+  // we cannot drop them at s3; we must let them go on to s4/s5
   val chnl_fire_s4 = c_s4.fire() || d_s4.fire()
+  val req_drop_s4 = !need_write_releaseBuf_s4 && !need_write_refillBuf_s4 && chnl_fire_s4
 
-  c_s4.valid := task_s4.valid && !data_unready_s4 && isC_s4 && !need_write_releaseBuf_s4 && !need_write_refillBuf_s4
-  d_s4.valid := task_s4.valid && !data_unready_s4 && isD_s4 && !need_write_releaseBuf_s4 && !need_write_refillBuf_s4
+  val c_d_valid_s4 = task_s4.valid && !RegNext(chnl_fire_s3, false.B)
+  c_s4.valid := c_d_valid_s4 && isC_s4
+  d_s4.valid := c_d_valid_s4 && isD_s4
c_s4.bits.task := task_s4.bits c_s4.bits.data.data := data_s4 d_s4.bits.task := task_s4.bits @@ -454,18 +457,19 @@ class MainPipe(implicit p: Parameters) extends L2Module { val need_write_releaseBuf_s5 = RegInit(false.B) val need_write_refillBuf_s5 = RegInit(false.B) val isC_s5, isD_s5 = RegInit(false.B) - task_s5.valid := task_s4.valid && !chnl_fire_s4 - when (task_s4.valid && !chnl_fire_s4) { + task_s5.valid := task_s4.valid && !req_drop_s4 + when (task_s4.valid && !req_drop_s4) { task_s5.bits := task_s4.bits ren_s5 := ren_s4 data_s5 := data_s4 need_write_releaseBuf_s5 := need_write_releaseBuf_s4 need_write_refillBuf_s5 := need_write_refillBuf_s4 - isC_s5 := isC_s4 - isD_s5 := isD_s4 + isC_s5 := isC_s4 || task_s4.bits.fromB && !task_s4.bits.mshrTask && task_s4.bits.opcode === ProbeAckData + isD_s5 := isD_s4 || task_s4.bits.fromA && !task_s4.bits.mshrTask && + (task_s4.bits.opcode === GrantData || task_s4.bits.opcode === AccessAckData) } val rdata_s5 = io.toDS.rdata_s5.data - val merged_data_s5 = Mux(ren_s5, rdata_s5, data_s5) + val out_data_s5 = Mux(!task_s5.bits.mshrTask, rdata_s5, data_s5) val chnl_fire_s5 = c_s5.fire() || d_s5.fire() val customL1Hint = Module(new CustomL1Hint) @@ -492,54 +496,64 @@ class MainPipe(implicit p: Parameters) extends L2Module { io.releaseBufWrite.valid := task_s5.valid && need_write_releaseBuf_s5 io.releaseBufWrite.beat_sel := Fill(beatSize, 1.U(1.W)) - io.releaseBufWrite.data.data := merged_data_s5 + io.releaseBufWrite.data.data := rdata_s5 io.releaseBufWrite.id := task_s5.bits.mshrId assert(!(io.releaseBufWrite.valid && !io.releaseBufWrite.ready), "releaseBuf should be ready when given valid") io.refillBufWrite.valid := task_s5.valid && need_write_refillBuf_s5 io.refillBufWrite.beat_sel := Fill(beatSize, 1.U(1.W)) - io.refillBufWrite.data.data := merged_data_s5 + io.refillBufWrite.data.data := rdata_s5 io.refillBufWrite.id := task_s5.bits.mshrId - assert(!(io.refillBufWrite.valid && !io.refillBufWrite.ready), "releaseBuf should be ready when given valid") + assert(!(io.refillBufWrite.valid && !io.refillBufWrite.ready), "refillBuf should be ready when given valid") - c_s5.valid := task_s5.valid && isC_s5 && !need_write_releaseBuf_s5 && !need_write_refillBuf_s5 - d_s5.valid := task_s5.valid && isD_s5 && !need_write_releaseBuf_s5 && !need_write_refillBuf_s5 + val c_d_valid_s5 = task_s5.valid && !RegNext(chnl_fire_s4, false.B) && !RegNextN(chnl_fire_s3, 2, Some(false.B)) + c_s5.valid := c_d_valid_s5 && isC_s5 + d_s5.valid := c_d_valid_s5 && isD_s5 c_s5.bits.task := task_s5.bits - c_s5.bits.data.data := merged_data_s5 + c_s5.bits.data.data := out_data_s5 d_s5.bits.task := task_s5.bits - d_s5.bits.data.data := merged_data_s5 + d_s5.bits.data.data := out_data_s5 /* ======== BlockInfo ======== */ - def pipelineBlock(chn: Char, s: TaskBundle, allTask: Boolean = false, tag: Boolean = false): Bool = { + // if s2/s3 might write Dir, we must block s1 sink entrance + // TODO:[Check] it seems that s3 Dir write will naturally block all s1 by dirRead.ready + // (an even stronger blocking than set blocking) + // so we might not need s3 blocking here + def s23Block(chn: Char, s: TaskBundle): Bool = { val s1 = io.fromReqArb.status_s1 - val s1_tag = if(chn == 'a') s1.a_tag else s1.b_tag - val s1_set = if(chn == 'a') s1.a_set else s1.b_set - - // allTask false: only !mshrTask (SinkReq) blocks Entrance - // allTask true : all tasks with the same set at s2 block Entrance - // tag true : compare tag+set - // tag false: compare set alone - s.set === s1_set && (if(allTask) true.B 
else !s.mshrTask) && (if(tag) s.tag === s1_tag else true.B) + val s1_set = chn match { + case 'a' => s1.a_set + case 'b' => s1.b_set + case 'c' => s1.c_set + case 'g' => s1.g_set + } + s.set === s1_set && !(s.mshrTask && !s.metaWen) // if guaranteed not to write meta, no blocking needed + } + def bBlock(s: TaskBundle, tag: Boolean = false): Bool = { + val s1 = io.fromReqArb.status_s1 + // tag true: compare tag + set + s.set === s1.b_set && (if(tag) s.tag === s1.b_tag else true.B) } - io.toReqBuf(0) := task_s2.valid && pipelineBlock('a', task_s2.bits, allTask = true) - io.toReqBuf(1) := task_s3.valid && pipelineBlock('a', task_s3.bits) + io.toReqBuf(0) := task_s2.valid && s23Block('a', task_s2.bits) + io.toReqBuf(1) := task_s3.valid && s23Block('a', task_s3.bits) + + io.toReqArb.blockC_s1 := task_s2.valid && s23Block('c', task_s2.bits) - io.toReqArb.blockC_s1 := - task_s2.valid && task_s2.bits.set === io.fromReqArb.status_s1.c_set || - io.toMSHRCtl.mshr_alloc_s3.valid && task_s3.bits.set === io.fromReqArb.status_s1.c_set io.toReqArb.blockB_s1 := - task_s2.valid && pipelineBlock('b', task_s2.bits, allTask = true) || - task_s3.valid && pipelineBlock('b', task_s3.bits) || - task_s4.valid && pipelineBlock('b', task_s4.bits, tag = true) || - task_s5.valid && pipelineBlock('b', task_s5.bits, tag = true) + task_s2.valid && bBlock(task_s2.bits) || + task_s3.valid && bBlock(task_s3.bits) || + task_s4.valid && bBlock(task_s4.bits, tag = true) || + task_s5.valid && bBlock(task_s5.bits, tag = true) + io.toReqArb.blockA_s1 := io.toReqBuf(0) || io.toReqBuf(1) + io.toReqArb.blockG_s1 := task_s2.valid && s23Block('g', task_s2.bits) /* ======== Pipeline Status ======== */ require(io.status_vec.size == 3) io.status_vec(0).valid := task_s3.valid && Mux( mshr_req_s3, - mshr_grant_s3 || mshr_accessackdata_s3 || mshr_accessack_s3, + mshr_refill_s3 && !retry, true.B // TODO: To consider grantBuffer capacity conflict, // only " req_s3.fromC || req_s3.fromA && !need_mshr_s3 " is needed @@ -557,20 +571,7 @@ class MainPipe(implicit p: Parameters) extends L2Module { when(req_s3.fromA) { alloc_state.s_refill := false.B alloc_state.w_grantack := req_prefetch_s3 || req_get_s3 || req_put_s3 - // need replacement - when(a_need_replacement) { - alloc_state.w_releaseack := false.B - alloc_state.w_release_sent := false.B - // need rprobe for release - when(meta_has_clients_s3) { - alloc_state.s_release := false.B // release when rprobe is sent in MSHR - alloc_state.s_rprobe := false.B - alloc_state.w_rprobeackfirst := false.B - alloc_state.w_rprobeacklast := false.B - } - }.otherwise { - alloc_state.w_release_sent := alloc_state.s_acquire || alloc_state.s_release - } + alloc_state.w_replResp := dirResult_s3.hit // need replRead when NOT dirHit // need Acquire downwards when(need_acquire_s3_a || req_put_s3) { alloc_state.s_acquire := false.B @@ -614,11 +615,11 @@ class MainPipe(implicit p: Parameters) extends L2Module { /* ===== Performance counters ===== */ // num of mshr req - XSPerfAccumulate(cacheParams, "mshr_grant_req", task_s3.valid && mshr_grant_s3) - XSPerfAccumulate(cacheParams, "mshr_grantdata_req", task_s3.valid && mshr_grantdata_s3) - XSPerfAccumulate(cacheParams, "mshr_accessackdata_req", task_s3.valid && mshr_accessackdata_s3) - XSPerfAccumulate(cacheParams, "mshr_accessack_req", task_s3.valid && mshr_accessack_s3) - XSPerfAccumulate(cacheParams, "mshr_hintack_req", task_s3.valid && mshr_hintack_s3) + XSPerfAccumulate(cacheParams, "mshr_grant_req", task_s3.valid && mshr_grant_s3 && !retry) + 
XSPerfAccumulate(cacheParams, "mshr_grantdata_req", task_s3.valid && mshr_grantdata_s3 && !retry) + XSPerfAccumulate(cacheParams, "mshr_accessackdata_req", task_s3.valid && mshr_accessackdata_s3 && !retry) + XSPerfAccumulate(cacheParams, "mshr_accessack_req", task_s3.valid && mshr_accessack_s3 && !retry) + XSPerfAccumulate(cacheParams, "mshr_hintack_req", task_s3.valid && mshr_hintack_s3 && !retry) XSPerfAccumulate(cacheParams, "mshr_probeack_req", task_s3.valid && mshr_probeack_s3) XSPerfAccumulate(cacheParams, "mshr_probeackdata_req", task_s3.valid && mshr_probeackdata_s3) XSPerfAccumulate(cacheParams, "mshr_release_req", task_s3.valid && mshr_release_s3) @@ -630,6 +631,7 @@ class MainPipe(implicit p: Parameters) extends L2Module { XSPerfAccumulate(cacheParams, "acquire_hit", hit_s3 && req_s3.fromA && (req_s3.opcode === AcquireBlock || req_s3.opcode === AcquirePerm)) XSPerfAccumulate(cacheParams, "get_hit", hit_s3 && req_s3.fromA && req_s3.opcode === Get) + XSPerfAccumulate(cacheParams, "retry", mshr_refill_s3 && retry) XSPerfAccumulate(cacheParams, "a_req_miss", miss_s3 && req_s3.fromA) XSPerfAccumulate(cacheParams, "acquire_miss", miss_s3 && req_s3.fromA && @@ -637,7 +639,7 @@ class MainPipe(implicit p: Parameters) extends L2Module { XSPerfAccumulate(cacheParams, "get_miss", miss_s3 && req_s3.fromA && req_s3.opcode === Get) XSPerfAccumulate(cacheParams, "a_req_need_replacement", - io.toMSHRCtl.mshr_alloc_s3.valid && !alloc_state.s_release || task_s3.valid && mainpipe_release) + io.toMSHRCtl.mshr_alloc_s3.valid && !alloc_state.s_release) XSPerfAccumulate(cacheParams, "b_req_hit", hit_s3 && req_s3.fromB) XSPerfAccumulate(cacheParams, "b_req_miss", miss_s3 && req_s3.fromB) diff --git a/src/main/scala/coupledL2/RequestArb.scala b/src/main/scala/coupledL2/RequestArb.scala index bfc79429..5868e478 100644 --- a/src/main/scala/coupledL2/RequestArb.scala +++ b/src/main/scala/coupledL2/RequestArb.scala @@ -31,12 +31,11 @@ class RequestArb(implicit p: Parameters) extends L2Module { val sinkA = Flipped(DecoupledIO(new TaskBundle)) val ATag = Input(UInt(tagBits.W)) // !TODO: very dirty, consider optimize structure val ASet = Input(UInt(setBits.W)) // To pass A entrance status to MP for blockA-info of ReqBuf - val sinkEntrance = ValidIO(new L2Bundle { - val tag = UInt(tagBits.W) + val s1Entrance = ValidIO(new L2Bundle { val set = UInt(setBits.W) }) - val sinkB = Flipped(DecoupledIO(new TLBundleB(edgeOut.bundle))) + val sinkB = Flipped(DecoupledIO(new TaskBundle)) val sinkC = Flipped(DecoupledIO(new TaskBundle)) val mshrTask = Flipped(DecoupledIO(new TaskBundle)) @@ -56,7 +55,7 @@ class RequestArb(implicit p: Parameters) extends L2Module { val status_s1 = Output(new PipeEntranceStatus) // set & tag of entrance status val status_vec = Vec(2, ValidIO(new PipeStatus)) // whether this stage will flow into SourceD - /* handle set conflict, capacity conflict and nestB */ + /* handle set conflict, capacity conflict */ val fromMSHRCtl = Input(new BlockInfo()) val fromMainPipe = Input(new BlockInfo()) val fromGrantBuffer = Input(new Bundle() { @@ -75,57 +74,35 @@ class RequestArb(implicit p: Parameters) extends L2Module { when(resetIdx === 0.U) { resetFinish := true.B } - // val valids = RegInit(0.U(8.W)) // 7 stages - /* ======== Stage 0 ======== */ - io.mshrTask.ready := !io.fromGrantBuffer.blockMSHRReqEntrance val mshr_task_s0 = Wire(Valid(new TaskBundle())) + val mshr_task_s1 = RegInit(0.U.asTypeOf(Valid(new TaskBundle()))) + + val s1_needs_replRead = mshr_task_s1.valid && mshr_task_s1.bits.fromA && 
mshr_task_s1.bits.replTask && (
+ mshr_task_s1.bits.opcode(2, 1) === Grant(2, 1) ||
+ mshr_task_s1.bits.opcode === AccessAckData ||
+ mshr_task_s1.bits.opcode === HintAck && mshr_task_s1.bits.dsWen
+ )
+
+ /* ======== Stage 0 ======== */
+ // if mshr_task_s1 is replRead, it might stall and wait for dirRead.ready, so we block new mshrTask from entering
+ // TODO: this leaves the mshrTask path vacant for one cycle after replRead, since we do not use Flow here in order to avoid ready propagation
io.mshrTask.ready := !io.fromGrantBuffer.blockMSHRReqEntrance && !s1_needs_replRead
mshr_task_s0.valid := io.mshrTask.fire()
mshr_task_s0.bits := io.mshrTask.bits

/* ======== Stage 1 ======== */
- /* Task generation and pipelining */
- def fromTLBtoTaskBundle(b: TLBundleB): TaskBundle = {
- val task = Wire(new TaskBundle)
- task.channel := "b010".U
- task.tag := parseAddress(b.address)._1
- task.set := parseAddress(b.address)._2
- task.off := parseAddress(b.address)._3
- task.alias.foreach(_ := 0.U)
- task.opcode := b.opcode
- task.param := b.param
- task.size := b.size
- task.sourceId := 0.U(sourceIdBits.W)
- task.bufIdx := 0.U(bufIdxBits.W)
- task.needProbeAckData := b.data(0) // TODO: parameterize this
- task.mshrTask := false.B
- task.mshrId := 0.U(mshrBits.W)
- task.aliasTask.foreach(_ := false.B)
- task.useProbeData := false.B
- task.pbIdx := 0.U(mshrBits.W)
- task.fromL2pft.foreach(_ := false.B)
- task.needHint.foreach(_ := false.B)
- task.dirty := false.B
- task.way := 0.U(wayBits.W)
- task.meta := 0.U.asTypeOf(new MetaEntry)
- task.metaWen := false.B
- task.tagWen := false.B
- task.dsWen := false.B
- task.wayMask := Fill(cacheParams.ways, "b1".U)
- task.reqSource := MemReqSource.NoWhere.id.U // Ignore
- task
- }
- /* latch mshr_task from s0 to s1 */
- val mshr_task_s1 = RegInit(0.U.asTypeOf(Valid(new TaskBundle())))
- mshr_task_s1.valid := mshr_task_s0.valid
- when(mshr_task_s0.valid) {
+ val mshr_replRead_stall = mshr_task_s1.valid && s1_needs_replRead && (!io.dirRead_s1.ready || io.fromMainPipe.blockG_s1)
+
+ mshr_task_s1.valid := mshr_task_s0.valid || mshr_replRead_stall
+ when(mshr_task_s0.valid && !mshr_replRead_stall) {
mshr_task_s1.bits := mshr_task_s0.bits
}

/* Channel interaction from s1 */
val A_task = io.sinkA.bits
- val B_task = fromTLBtoTaskBundle(io.sinkB.bits)
+ val B_task = io.sinkB.bits
val C_task = io.sinkC.bits
val block_A = io.fromMSHRCtl.blockA_s1 || io.fromMainPipe.blockA_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockA_s1
val block_B = io.fromMSHRCtl.blockB_s1 || io.fromMainPipe.blockB_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockB_s1
@@ -154,24 +131,29 @@ class RequestArb(implicit p: Parameters) extends L2Module {
/* Meta read request */
// ^ only sinkA/B/C tasks need to read directory
- io.dirRead_s1.valid := chnl_task_s1.valid && !mshr_task_s1.valid
+ io.dirRead_s1.valid := chnl_task_s1.valid && !mshr_task_s1.valid || s1_needs_replRead && !io.fromMainPipe.blockG_s1
io.dirRead_s1.bits.set := task_s1.bits.set
io.dirRead_s1.bits.tag := task_s1.bits.tag
- io.dirRead_s1.bits.wayMask := task_s1.bits.wayMask
+ io.dirRead_s1.bits.wayMask := Fill(cacheParams.ways, "b1".U) //[deprecated]
io.dirRead_s1.bits.replacerInfo.opcode := task_s1.bits.opcode
io.dirRead_s1.bits.replacerInfo.channel := task_s1.bits.channel
io.dirRead_s1.bits.replacerInfo.reqSource := task_s1.bits.reqSource
+ io.dirRead_s1.bits.refill := s1_needs_replRead
+ io.dirRead_s1.bits.mshrId := task_s1.bits.mshrId

- // probe block same-set A req for s2/s3
- io.sinkEntrance.valid := io.sinkB.fire || io.sinkC.fire
-
io.sinkEntrance.bits.tag := Mux(io.sinkC.fire, C_task.tag, B_task.tag) - io.sinkEntrance.bits.set := Mux(io.sinkC.fire, C_task.set, B_task.set) + // block same-set A req + io.s1Entrance.valid := mshr_task_s1.valid && mshr_task_s1.bits.metaWen || io.sinkC.fire || io.sinkB.fire + io.s1Entrance.bits.set := Mux( + mshr_task_s1.valid && mshr_task_s1.bits.metaWen, + mshr_task_s1.bits.set, + Mux(io.sinkC.fire, C_task.set, B_task.set) + ) /* ======== Stage 2 ======== */ val task_s2 = RegInit(0.U.asTypeOf(task_s1)) - task_s2.valid := task_s1.valid - when(task_s1.valid) { task_s2.bits := task_s1.bits } - + task_s2.valid := task_s1.valid && !mshr_replRead_stall + when(task_s1.valid && !mshr_replRead_stall) { task_s2.bits := task_s1.bits } + io.taskToPipe_s2 := task_s2 // MSHR task @@ -181,14 +163,19 @@ class RequestArb(implicit p: Parameters) extends L2Module { task_s2.bits.opcode === AccessAckData || task_s2.bits.opcode === HintAck && task_s2.bits.dsWen) // For GrantData, read refillBuffer // Caution: GrantData-alias may read DataStorage or ReleaseBuf instead - io.refillBufRead_s2.valid := mshrTask_s2 && !task_s2.bits.useProbeData && mshrTask_s2_a_upwards + // Release-replTask also read refillBuf and then write to DS + io.refillBufRead_s2.valid := mshrTask_s2 && ( + task_s2.bits.fromB && task_s2.bits.opcode(2, 1) === ProbeAck(2, 1) && task_s2.bits.replTask || + task_s2.bits.opcode(2, 1) === Release(2, 1) && task_s2.bits.replTask || + mshrTask_s2_a_upwards && !task_s2.bits.useProbeData) io.refillBufRead_s2.id := task_s2.bits.mshrId - // For ReleaseData or ProbeAckData, read releaseBuffer + + // ReleaseData and ProbeAckData read releaseBuffer // channel is used to differentiate GrantData and ProbeAckData io.releaseBufRead_s2.valid := mshrTask_s2 && ( task_s2.bits.opcode === ReleaseData || task_s2.bits.fromB && task_s2.bits.opcode === ProbeAckData || - task_s2.bits.fromA && task_s2.bits.useProbeData && mshrTask_s2_a_upwards) + mshrTask_s2_a_upwards && task_s2.bits.useProbeData) io.releaseBufRead_s2.id := task_s2.bits.mshrId assert(!io.refillBufRead_s2.valid || io.refillBufRead_s2.ready) assert(!io.releaseBufRead_s2.valid || io.releaseBufRead_s2.ready) @@ -196,8 +183,8 @@ class RequestArb(implicit p: Parameters) extends L2Module { require(beatSize == 2) /* status of each pipeline stage */ - io.status_s1.sets := VecInit(Seq(C_task.set, B_task.set, io.ASet)) - io.status_s1.tags := VecInit(Seq(C_task.tag, B_task.tag, io.ATag)) + io.status_s1.sets := VecInit(Seq(C_task.set, B_task.set, io.ASet, mshr_task_s1.bits.set)) + io.status_s1.tags := VecInit(Seq(C_task.tag, B_task.tag, io.ATag, mshr_task_s1.bits.tag)) require(io.status_vec.size == 2) io.status_vec.zip(Seq(task_s1, task_s2)).foreach { case (status, task) => diff --git a/src/main/scala/coupledL2/RequestBuffer.scala b/src/main/scala/coupledL2/RequestBuffer.scala index 46d8e9b3..7f2f3d90 100644 --- a/src/main/scala/coupledL2/RequestBuffer.scala +++ b/src/main/scala/coupledL2/RequestBuffer.scala @@ -36,8 +36,6 @@ class ReqEntry(entries: Int = 4)(implicit p: Parameters) extends L2Bundle() { */ // val depMask = Vec(entries, Bool()) - /* ways in the set that are occupied by unfinished MSHR task */ - val occWays = UInt(cacheParams.ways.W) } class ChosenQBundle(idWIdth: Int = 2)(implicit p: Parameters) extends L2Bundle { @@ -50,15 +48,14 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete val io = IO(new Bundle() { val in = Flipped(DecoupledIO(new TaskBundle)) val out = DecoupledIO(new TaskBundle) - val mshrStatus = 
Vec(mshrsAll, Flipped(ValidIO(new MSHRBlockAInfo))) + val mshrInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo))) val mainPipeBlock = Input(Vec(2, Bool())) val ATag = Output(UInt(tagBits.W)) val ASet = Output(UInt(setBits.W)) - // when Probe/Release enters MainPipe, we need also to block A req - val sinkEntrance = Flipped(ValidIO(new L2Bundle { - val tag = UInt(tagBits.W) + // when Probe/Release/MSHR enters MainPipe, we need also to block A req + val s1Entrance = Flipped(ValidIO(new L2Bundle { val set = UInt(setBits.W) })) }) @@ -77,45 +74,39 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete val NWay = cacheParams.ways // count conflict def sameAddr(a: TaskBundle, b: TaskBundle): Bool = Cat(a.tag, a.set) === Cat(b.tag, b.set) - def sameAddr(a: TaskBundle, b: MSHRBlockAInfo): Bool = Cat(a.tag, a.set) === Cat(b.reqTag, b.set) + def sameAddr(a: TaskBundle, b: MSHRInfo): Bool = Cat(a.tag, a.set) === Cat(b.reqTag, b.set) def sameSet (a: TaskBundle, b: TaskBundle): Bool = a.set === b.set - def sameSet (a: TaskBundle, b: MSHRBlockAInfo): Bool = a.set === b.set - def addrConflict(a: TaskBundle, s: MSHRBlockAInfo): Bool = { + def sameSet (a: TaskBundle, b: MSHRInfo): Bool = a.set === b.set + def addrConflict(a: TaskBundle, s: MSHRInfo): Bool = { a.set === s.set && (a.tag === s.reqTag || a.tag === s.metaTag && s.needRelease) } - def conflictMask(a: TaskBundle): UInt = VecInit(io.mshrStatus.map(s => + def conflictMask(a: TaskBundle): UInt = VecInit(io.mshrInfo.map(s => s.valid && addrConflict(a, s.bits) && !s.bits.willFree)).asUInt def conflict(a: TaskBundle): Bool = conflictMask(a).orR // count ways - def countWaysOH(cond: (MSHRBlockAInfo => Bool)): UInt = { - VecInit(io.mshrStatus.map(s => - Mux( - s.valid && cond(s.bits), - UIntToOH(s.bits.way, NWay), - 0.U(NWay.W) - ) - )).reduceTree(_ | _) - } - def occWays (a: TaskBundle): UInt = countWaysOH(s => !s.willFree && sameSet(a, s)) - def willFreeWays(a: TaskBundle): UInt = countWaysOH(s => s.willFree && sameSet(a, s)) - - def noFreeWay(a: TaskBundle): Bool = !Cat(~occWays(a)).orR - def noFreeWay(occWays: UInt): Bool = !Cat(~occWays).orR +// def countWaysOH(cond: (MSHRInfo => Bool)): UInt = { +// VecInit(io.mshrInfo.map(s => +// Mux( +// s.valid && cond(s.bits), +// UIntToOH(s.bits.way, NWay), +// 0.U(NWay.W) +// ) +// )).reduceTree(_ | _) +// } // other flags val in = io.in.bits val full = Cat(buffer.map(_.valid)).andR // flow not allowed when full, or entries might starve - val canFlow = flow.B && !full && - !conflict(in) && !chosenQValid && !Cat(io.mainPipeBlock).orR && !noFreeWay(in) + val canFlow = flow.B && !full && !conflict(in) && !chosenQValid && !Cat(io.mainPipeBlock).orR val doFlow = canFlow && io.out.ready // val depMask = buffer.map(e => e.valid && sameAddr(io.in.bits, e.task)) // remove duplicate prefetch if same-addr A req in MSHR or ReqBuf val isPrefetch = in.fromA && in.opcode === Hint val dupMask = VecInit( - io.mshrStatus.map(s => + io.mshrInfo.map(s => s.valid && s.bits.isAcqOrPrefetch && sameAddr(in, s.bits)) ++ buffer.map(e => e.valid && sameAddr(in, e.task) @@ -134,12 +125,12 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete val entry = buffer(insertIdx) val mpBlock = Cat(io.mainPipeBlock).orR val pipeBlockOut = io.out.fire && sameSet(in, io.out.bits) - val probeBlock = io.sinkEntrance.valid && io.sinkEntrance.bits.set === in.set // wait for same-addr req to enter MSHR + val probeBlock = io.s1Entrance.valid && io.s1Entrance.bits.set === in.set // wait for 
same-addr req to enter MSHR val s1Block = pipeBlockOut || probeBlock entry.valid := true.B // when Addr-Conflict / Same-Addr-Dependent / MainPipe-Block / noFreeWay-in-Set, entry not ready - entry.rdy := !conflict(in) && !mpBlock && !noFreeWay(in) && !s1Block // && !Cat(depMask).orR + entry.rdy := !conflict(in) && !mpBlock && !s1Block // && !Cat(depMask).orR entry.task := io.in.bits entry.waitMP := Cat( s1Block, @@ -147,7 +138,6 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete io.mainPipeBlock(1), 0.U(1.W)) entry.waitMS := conflictMask(in) - entry.occWays := Mux(mpBlock, 0.U, occWays(in)) // entry.depMask := depMask assert(PopCount(conflictMask(in)) <= 2.U) @@ -172,20 +162,15 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete chosenQ.io.enq.bits.id := issueArb.io.chosen issueArb.io.out.ready := chosenQ.io.enq.ready - //TODO: if i use occWays when update, - // does this mean that every entry has occWays logic? - /* ======== Update rdy and masks ======== */ for (e <- buffer) { when(e.valid) { val waitMSUpdate = WireInit(e.waitMS) // val depMaskUpdate = WireInit(e.depMask) - val occWaysUpdate = WireInit(e.occWays) - // when mshr will_free, clear it in other reqs' waitMS and occWays - val willFreeMask = VecInit(io.mshrStatus.map(s => s.valid && s.bits.willFree)).asUInt + // when mshr will_free, clear it in other reqs' waitMS + val willFreeMask = VecInit(io.mshrInfo.map(s => s.valid && s.bits.willFree)).asUInt waitMSUpdate := e.waitMS & (~willFreeMask).asUInt - occWaysUpdate := e.occWays & (~willFreeWays(e.task)).asUInt // Initially, // waitMP(2) = s2 blocking, wait 2 cycles @@ -197,7 +182,6 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete e.waitMP := e.waitMP >> 1.U when(e.waitMP(1) === 0.U && e.waitMP(0) === 1.U) { waitMSUpdate := conflictMask(e.task) - occWaysUpdate := occWays(e.task) } // when request is sent, clear it in other reqs' depMask @@ -207,7 +191,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete // set waitMP if fired-s1-req is the same set val s1A_Block = io.out.fire && sameSet(e.task, io.out.bits) - val s1B_Block = io.sinkEntrance.valid && io.sinkEntrance.bits.set === e.task.set + val s1B_Block = io.s1Entrance.valid && io.s1Entrance.bits.set === e.task.set val s1_Block = s1A_Block || s1B_Block when(s1_Block) { e.waitMP := e.waitMP | "b0100".U // fired-req at s2 next cycle @@ -216,8 +200,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete // update info e.waitMS := waitMSUpdate // e.depMask := depMaskUpdate - e.occWays := occWaysUpdate - e.rdy := !waitMSUpdate.orR && !e.waitMP && !noFreeWay(occWaysUpdate) && !s1_Block + e.rdy := !waitMSUpdate.orR && !e.waitMP && !s1_Block } } @@ -235,8 +218,8 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete buffer(chosenQ.io.deq.bits.id).valid := false.B } - // for Dir to choose a way not occupied by some unfinished MSHR task - io.out.bits.wayMask := Mux(canFlow, ~occWays(io.in.bits), ~chosenQ.io.deq.bits.bits.occWays) + // for Dir to choose a free way + io.out.bits.wayMask := Fill(cacheParams.ways, 1.U(1.W)) // add XSPerf to see how many cycles the req is held in Buffer if(cacheParams.enablePerf) { @@ -249,6 +232,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete XSPerfAccumulate(cacheParams, "recv_prefetch", io.in.fire && isPrefetch) XSPerfAccumulate(cacheParams, "recv_normal", io.in.fire && !isPrefetch) 
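// ---- editor's note (illustrative sketch, not part of the patch) ----
// With occWays gone, entry readiness in RequestBuffer rests on the address-conflict
// check defined above: an A request conflicts with an MSHR either on that MSHR's own
// request address, or on the address of the victim block it still has to release.
// A standalone sketch of that check over the MSHRInfo vector; field names follow
// MSHRInfo, widths and module names are assumptions.
import chisel3._
import chisel3.util._

class MSHRInfoLite(setBits: Int, tagBits: Int) extends Bundle {
  val set         = UInt(setBits.W)
  val reqTag      = UInt(tagBits.W)
  val metaTag     = UInt(tagBits.W)
  val needRelease = Bool()
  val willFree    = Bool()
}

class ConflictSketch(n: Int, setBits: Int, tagBits: Int) extends Module {
  val io = IO(new Bundle {
    val set      = Input(UInt(setBits.W))
    val tag      = Input(UInt(tagBits.W))
    val ms       = Vec(n, Flipped(ValidIO(new MSHRInfoLite(setBits, tagBits))))
    val conflict = Output(Bool())
  })
  // match on the MSHR's request address, or on its replacement victim if unreleased
  val hit = io.ms.map { s =>
    s.valid && !s.bits.willFree && s.bits.set === io.set &&
      (s.bits.reqTag === io.tag || (s.bits.metaTag === io.tag && s.bits.needRelease))
  }
  io.conflict := VecInit(hit).asUInt.orR
}
// ---- end editor's note ----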
XSPerfAccumulate(cacheParams, "chosenQ_cancel", chosenQValid && cancel) + // TODO: count conflict for(i <- 0 until entries){ val cntEnable = PopCount(buffer.map(_.valid)) === i.U XSPerfAccumulate(cacheParams, s"req_buffer_util_$i", cntEnable) @@ -258,7 +242,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete case (e, t) => when(e.valid) { t := t + 1.U } when(RegNext(RegNext(e.valid) && !e.valid)) { t := 0.U } - assert(t < 10000.U, "ReqBuf Leak") + assert(t < 20000.U, "ReqBuf Leak") val enable = RegNext(e.valid) && !e.valid XSPerfHistogram(cacheParams, "reqBuf_timer", t, enable, 0, 20, 1, right_strict = true) diff --git a/src/main/scala/coupledL2/SinkA.scala b/src/main/scala/coupledL2/SinkA.scala index 77b36b99..a3b8634b 100644 --- a/src/main/scala/coupledL2/SinkA.scala +++ b/src/main/scala/coupledL2/SinkA.scala @@ -30,7 +30,7 @@ class SinkA(implicit p: Parameters) extends L2Module { val io = IO(new Bundle() { val a = Flipped(DecoupledIO(new TLBundleA(edgeIn.bundle))) val prefetchReq = prefetchOpt.map(_ => Flipped(DecoupledIO(new PrefetchReq))) - val toReqArb = DecoupledIO(new TaskBundle) + val task = DecoupledIO(new TaskBundle) val pbRead = Flipped(DecoupledIO(new PutBufferRead)) val pbResp = ValidIO(new PutBufferEntry) }) @@ -63,8 +63,8 @@ class SinkA(implicit p: Parameters) extends L2Module { beatValids(io.pbRead.bits.idx)(io.pbRead.bits.count) := false.B } - val commonReq = Wire(io.toReqArb.cloneType) - val prefetchReq = prefetchOpt.map(_ => Wire(io.toReqArb.cloneType)) + val commonReq = Wire(io.task.cloneType) + val prefetchReq = prefetchOpt.map(_ => Wire(io.task.cloneType)) io.a.ready := !first || commonReq.ready && !noSpace @@ -96,6 +96,7 @@ class SinkA(implicit p: Parameters) extends L2Module { task.dsWen := false.B task.wayMask := 0.U(cacheParams.ways.W) task.reqSource := a.user.lift(utility.ReqSourceKey).getOrElse(MemReqSource.NoWhere.id.U) + task.replTask := false.B task } def fromPrefetchReqtoTaskBundle(req: PrefetchReq): TaskBundle = { @@ -135,9 +136,9 @@ class SinkA(implicit p: Parameters) extends L2Module { prefetchReq.get.valid := io.prefetchReq.get.valid prefetchReq.get.bits := fromPrefetchReqtoTaskBundle(io.prefetchReq.get.bits) io.prefetchReq.get.ready := prefetchReq.get.ready - fastArb(Seq(commonReq, prefetchReq.get), io.toReqArb) + fastArb(Seq(commonReq, prefetchReq.get), io.task) } else { - io.toReqArb <> commonReq + io.task <> commonReq } io.pbRead.ready := beatValids(io.pbRead.bits.idx)(io.pbRead.bits.count) @@ -148,26 +149,26 @@ class SinkA(implicit p: Parameters) extends L2Module { // Performance counters // num of reqs - XSPerfAccumulate(cacheParams, "sinkA_req", io.toReqArb.fire()) + XSPerfAccumulate(cacheParams, "sinkA_req", io.task.fire()) XSPerfAccumulate(cacheParams, "sinkA_acquire_req", io.a.fire() && io.a.bits.opcode(2, 1) === AcquireBlock(2, 1)) XSPerfAccumulate(cacheParams, "sinkA_acquireblock_req", io.a.fire() && io.a.bits.opcode === AcquireBlock) XSPerfAccumulate(cacheParams, "sinkA_acquireperm_req", io.a.fire() && io.a.bits.opcode === AcquirePerm) XSPerfAccumulate(cacheParams, "sinkA_get_req", io.a.fire() && io.a.bits.opcode === Get) - XSPerfAccumulate(cacheParams, "sinkA_put_req", io.toReqArb.fire() && - (io.toReqArb.bits.opcode === PutFullData || io.toReqArb.bits.opcode === PutPartialData)) + XSPerfAccumulate(cacheParams, "sinkA_put_req", io.task.fire() && + (io.task.bits.opcode === PutFullData || io.task.bits.opcode === PutPartialData)) XSPerfAccumulate(cacheParams, "sinkA_put_beat", io.a.fire() && (io.a.bits.opcode 
=== PutFullData || io.a.bits.opcode === PutPartialData))
prefetchOpt.foreach { _ => XSPerfAccumulate(cacheParams, "sinkA_prefetch_req", io.prefetchReq.get.fire()) }

// cycles stalled by mainpipe
- val stall = io.toReqArb.valid && !io.toReqArb.ready
+ val stall = io.task.valid && !io.task.ready
XSPerfAccumulate(cacheParams, "sinkA_stall_by_mainpipe", stall)
XSPerfAccumulate(cacheParams, "sinkA_acquire_stall_by_mainpipe", stall &&
- (io.toReqArb.bits.opcode === AcquireBlock || io.toReqArb.bits.opcode === AcquirePerm))
- XSPerfAccumulate(cacheParams, "sinkA_get_stall_by_mainpipe", stall && io.toReqArb.bits.opcode === Get)
+ (io.task.bits.opcode === AcquireBlock || io.task.bits.opcode === AcquirePerm))
+ XSPerfAccumulate(cacheParams, "sinkA_get_stall_by_mainpipe", stall && io.task.bits.opcode === Get)
XSPerfAccumulate(cacheParams, "sinkA_put_stall_by_mainpipe", stall &&
- (io.toReqArb.bits.opcode === PutFullData || io.toReqArb.bits.opcode === PutPartialData))
- prefetchOpt.foreach { _ => XSPerfAccumulate(cacheParams, "sinkA_prefetch_stall_by_mainpipe", stall && io.toReqArb.bits.opcode === Hint) }
+ (io.task.bits.opcode === PutFullData || io.task.bits.opcode === PutPartialData))
+ prefetchOpt.foreach { _ => XSPerfAccumulate(cacheParams, "sinkA_prefetch_stall_by_mainpipe", stall && io.task.bits.opcode === Hint) }

// cycles stalled for no space
XSPerfAccumulate(cacheParams, "sinkA_put_stall_for_noSpace", io.a.valid && first && noSpace)
diff --git a/src/main/scala/coupledL2/SinkB.scala b/src/main/scala/coupledL2/SinkB.scala
new file mode 100644
index 00000000..26631b83
--- /dev/null
+++ b/src/main/scala/coupledL2/SinkB.scala
@@ -0,0 +1,107 @@
+/** *************************************************************************************
+ * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
+ * Copyright (c) 2020-2021 Peng Cheng Laboratory
+ *
+ * XiangShan is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ * http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ *
+ * See the Mulan PSL v2 for more details.
+ * ************************************************************************************* + */ + +package coupledL2 + +import chisel3._ +import chisel3.util._ +import chipsalliance.rocketchip.config.Parameters +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.tilelink.TLMessages._ +import coupledL2.utils.XSPerfAccumulate + +class BMergeTask(implicit p: Parameters) extends L2Bundle { + val id = UInt(mshrBits.W) + val task = new TaskBundle() +} + +class SinkB(implicit p: Parameters) extends L2Module { + val io = IO(new Bundle() { + val b = Flipped(DecoupledIO(new TLBundleB(edgeIn.bundle))) + val task = DecoupledIO(new TaskBundle) + val msInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo))) + val bMergeTask = ValidIO(new BMergeTask) + }) + + def fromTLBtoTaskBundle(b: TLBundleB): TaskBundle = { + val task = Wire(new TaskBundle) + task.channel := "b010".U + task.tag := parseAddress(b.address)._1 + task.set := parseAddress(b.address)._2 + task.off := parseAddress(b.address)._3 + task.alias.foreach(_ := 0.U) + task.opcode := b.opcode + task.param := b.param + task.size := b.size + task.sourceId := 0.U(sourceIdBits.W) + task.bufIdx := 0.U(bufIdxBits.W) + task.needProbeAckData := b.data(0) // TODO: parameterize this + task.mshrTask := false.B + task.mshrId := 0.U(mshrBits.W) + task.aliasTask.foreach(_ := false.B) + task.useProbeData := false.B + task.pbIdx := 0.U(mshrBits.W) + task.fromL2pft.foreach(_ := false.B) + task.needHint.foreach(_ := false.B) + task.dirty := false.B + task.way := 0.U(wayBits.W) + task.meta := 0.U.asTypeOf(new MetaEntry) + task.metaWen := false.B + task.tagWen := false.B + task.dsWen := false.B + task.wayMask := Fill(cacheParams.ways, "b1".U) + task.reqSource := MemReqSource.NoWhere.id.U // Ignore + task.replTask := false.B + task + } + val task = fromTLBtoTaskBundle(io.b.bits) + + /* ======== Merge Nested-B req ======== */ + // unable to accept incoming B req because same-addr as some MSHR REQ + val addrConflict = VecInit(io.msInfo.map(s => + s.valid && s.bits.set === task.set && s.bits.reqTag === task.tag && !s.bits.willFree && !s.bits.nestB + )).asUInt.orR + + // unable to accept incoming B req because same-addr as some MSHR replaced block and cannot nest + val replaceConflictMask = VecInit(io.msInfo.map(s => + s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && s.bits.releaseNotSent && !s.bits.mergeB + )).asUInt + val replaceConflict = replaceConflictMask.orR + + // incoming B can be merged with some MSHR replaced block and able to be accepted + val mergeBMask = VecInit(io.msInfo.map(s => + s.valid && s.bits.set === task.set && s.bits.metaTag === task.tag && s.bits.mergeB + )).asUInt + + assert(PopCount(replaceConflictMask) <= 1.U) + assert(PopCount(mergeBMask) <= 1.U) + + val mergeB = mergeBMask.orR + val mergeBId = OHToUInt(mergeBMask) + + // when conflict, we block B req from entering SinkB + // when !conflict and mergeB , we merge B req to MSHR + io.task.valid := io.b.valid && !addrConflict && !replaceConflict && !mergeB + io.task.bits := task + io.b.ready := mergeB || (io.task.ready && !addrConflict && !replaceConflict) + + io.bMergeTask.valid := io.b.valid && mergeB + io.bMergeTask.bits.id := mergeBId + io.bMergeTask.bits.task := task + + // TODO: add conflict XSPerf counter +} diff --git a/src/main/scala/coupledL2/SinkC.scala b/src/main/scala/coupledL2/SinkC.scala index 2fca5613..c28cb68d 100644 --- a/src/main/scala/coupledL2/SinkC.scala +++ b/src/main/scala/coupledL2/SinkC.scala @@ -39,11 +39,13 @@ class 
PipeBufferResp(implicit p: Parameters) extends L2Bundle {
class SinkC(implicit p: Parameters) extends L2Module {
val io = IO(new Bundle() {
val c = Flipped(DecoupledIO(new TLBundleC(edgeIn.bundle)))
- val toReqArb = DecoupledIO(new TaskBundle) // Release/ReleaseData
+ val task = DecoupledIO(new TaskBundle) // Release/ReleaseData
val resp = Output(new RespBundle)
val releaseBufWrite = Flipped(new MSHRBufWrite)
val bufRead = Input(ValidIO(new PipeBufferRead))
val bufResp = Output(new PipeBufferResp)
+ val refillBufWrite = Flipped(new MSHRBufWrite)
+ val msInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo)))
})

val (first, last, _, beat) = edgeIn.count(io.c)
@@ -93,6 +95,7 @@ class SinkC(implicit p: Parameters) extends L2Module {
task.dsWen := false.B
task.wayMask := Fill(cacheParams.ways, "b1".U)
task.reqSource := MemReqSource.NoWhere.id.U // Ignore
+ task.replTask := false.B
task
}
@@ -109,7 +112,7 @@ class SinkC(implicit p: Parameters) extends L2Module {
}
}

- when (io.c.fire() && isRelease && last && (!io.toReqArb.ready || taskArb.io.out.valid)) {
+ when (io.c.fire() && isRelease && last && (!io.task.ready || taskArb.io.out.valid)) {
when (hasData) {
taskValids(nextPtrReg) := true.B
taskBuf(nextPtrReg) := toTaskBundle(io.c.bits)
@@ -121,7 +124,7 @@ class SinkC(implicit p: Parameters) extends L2Module {
}
}

- taskArb.io.out.ready := io.toReqArb.ready
+ taskArb.io.out.ready := io.task.ready
taskArb.io.in.zipWithIndex.foreach {
case (in, i) =>
in.valid := taskValids(i)
@@ -136,9 +139,9 @@ class SinkC(implicit p: Parameters) extends L2Module {
}

val cValid = io.c.valid && isRelease && last
- io.toReqArb.valid := cValid || taskArb.io.out.valid
- io.toReqArb.bits := Mux(taskArb.io.out.valid, taskArb.io.out.bits, toTaskBundle(io.c.bits))
- io.toReqArb.bits.bufIdx := Mux(taskArb.io.out.valid, taskArb.io.out.bits.bufIdx, nextPtrReg)
+ io.task.valid := cValid || taskArb.io.out.valid
+ io.task.bits := Mux(taskArb.io.out.valid, taskArb.io.out.bits, toTaskBundle(io.c.bits))
+ io.task.bits.bufIdx := Mux(taskArb.io.out.valid, taskArb.io.out.bits.bufIdx, nextPtrReg)

io.resp.valid := io.c.valid && (first || last) && !isRelease
io.resp.mshrId := 0.U // DontCare
@@ -155,8 +158,23 @@ class SinkC(implicit p: Parameters) extends L2Module {
io.releaseBufWrite.data.data := Fill(beatSize, io.c.bits.data)
io.releaseBufWrite.id := 0.U(mshrBits.W) // id is given by MSHRCtl by comparing address to the MSHRs

- // io.c.ready := !first || !noSpace && !(isRelease && !io.toReqArb.ready)
- io.c.ready := !isRelease || !first || !full || !hasData && io.toReqArb.ready
+ // C-Release writing new data to refillBuffer, for repl-Release to write to DS
+ val newdataMask = VecInit(io.msInfo.map(s =>
+ s.valid && s.bits.set === io.task.bits.set && s.bits.reqTag === io.task.bits.tag && s.bits.releaseNotSent
+ )).asUInt
+
+ // we must wait until the 2nd beat is written into dataBuf(idx) before we can read it,
+ // so we use RegNext (alternatively, Cat(dataBuf(idx)(0), io.c.bits.data) would also work)
+
+ // the hazard we guard against is C-Release arriving first and MSHR-Release arriving later,
+ // so the MSHR-Release is guaranteed to be able to read this refillBufWrite
+ io.refillBufWrite.valid := RegNext(io.task.fire && newdataMask.orR, false.B)
+ io.refillBufWrite.beat_sel := Fill(beatSize, 1.U(1.W))
+ io.refillBufWrite.id := RegNext(OHToUInt(newdataMask))
+ io.refillBufWrite.data.data := dataBuf(RegNext(io.task.bits.bufIdx)).asUInt
+
+ io.c.ready := !isRelease || !first || !full || !hasData && io.task.ready

io.bufResp.data :=
dataBuf(io.bufRead.bits.bufIdx) @@ -164,6 +182,6 @@ class SinkC(implicit p: Parameters) extends L2Module { val stall = io.c.valid && isRelease && !io.c.ready XSPerfAccumulate(cacheParams, "sinkC_c_stall", stall) XSPerfAccumulate(cacheParams, "sinkC_c_stall_for_noSpace", stall && hasData && first && full) - XSPerfAccumulate(cacheParams, "sinkC_toReqArb_stall", io.toReqArb.valid && !io.toReqArb.ready) + XSPerfAccumulate(cacheParams, "sinkC_toReqArb_stall", io.task.valid && !io.task.ready) XSPerfAccumulate(cacheParams, "sinkC_buf_full", full) } \ No newline at end of file diff --git a/src/main/scala/coupledL2/Slice.scala b/src/main/scala/coupledL2/Slice.scala index 24054beb..d6228233 100644 --- a/src/main/scala/coupledL2/Slice.scala +++ b/src/main/scala/coupledL2/Slice.scala @@ -47,22 +47,29 @@ class Slice()(implicit p: Parameters) extends L2Module { val dataStorage = Module(new DataStorage()) val refillUnit = Module(new RefillUnit()) val sinkA = Module(new SinkA) - val sinkC = Module(new SinkC) // or ReleaseUnit? + val sinkB = Module(new SinkB) + val sinkC = Module(new SinkC) val sourceC = Module(new SourceC) val grantBuf = if (!useFIFOGrantBuffer) Module(new GrantBuffer) else Module(new GrantBufferFIFO) - val refillBuf = Module(new MSHRBuffer(wPorts = 2)) + val refillBuf = Module(new MSHRBuffer(wPorts = 3)) val releaseBuf = Module(new MSHRBuffer(wPorts = 3)) - a_reqBuf.io.in <> sinkA.io.toReqArb - a_reqBuf.io.mshrStatus := mshrCtl.io.toReqBuf + val prbq = Module(new ProbeQueue()) + prbq.io <> DontCare // @XiaBin TODO + + a_reqBuf.io.in <> sinkA.io.task + a_reqBuf.io.mshrInfo := mshrCtl.io.msInfo a_reqBuf.io.mainPipeBlock := mainPipe.io.toReqBuf - a_reqBuf.io.sinkEntrance := reqArb.io.sinkEntrance + a_reqBuf.io.s1Entrance := reqArb.io.s1Entrance + sinkB.io.msInfo := mshrCtl.io.msInfo + sinkC.io.msInfo := mshrCtl.io.msInfo reqArb.io.sinkA <> a_reqBuf.io.out reqArb.io.ATag := a_reqBuf.io.ATag reqArb.io.ASet := a_reqBuf.io.ASet - reqArb.io.sinkC <> sinkC.io.toReqArb + reqArb.io.sinkB <> sinkB.io.task + reqArb.io.sinkC <> sinkC.io.task reqArb.io.dirRead_s1 <> directory.io.read reqArb.io.taskToPipe_s2 <> mainPipe.io.taskFromArb_s2 reqArb.io.mshrTask <> mshrCtl.io.mshrTask @@ -78,12 +85,16 @@ class Slice()(implicit p: Parameters) extends L2Module { mshrCtl.io.resps.sinkE := grantBuf.io.e_resp mshrCtl.io.resps.sourceC := sourceC.io.resp mshrCtl.io.nestedwb := mainPipe.io.nestedwb + mshrCtl.io.bMergeTask := sinkB.io.bMergeTask mshrCtl.io.pbRead <> sinkA.io.pbRead mshrCtl.io.pbResp <> sinkA.io.pbResp + mshrCtl.io.replResp <> directory.io.replResp + mainPipe.io.replResp <> directory.io.replResp directory.io.resp <> mainPipe.io.dirResp_s3 directory.io.metaWReq <> mainPipe.io.metaWReq directory.io.tagWReq <> mainPipe.io.tagWReq + directory.io.msInfo <> mshrCtl.io.msInfo dataStorage.io.req <> mainPipe.io.toDS.req_s3 dataStorage.io.wdata := mainPipe.io.toDS.wdata_s3 @@ -102,16 +113,19 @@ class Slice()(implicit p: Parameters) extends L2Module { mainPipe.io.globalCounter := grantBuf.io.globalCounter mainPipe.io.taskInfo_s1 <> reqArb.io.taskInfo_s1 - releaseBuf.io.w(0) <> sinkC.io.releaseBufWrite - releaseBuf.io.w(0).id := mshrCtl.io.releaseBufWriteId - releaseBuf.io.w(1) <> mainPipe.io.releaseBufWrite - releaseBuf.io.w(2).valid := mshrCtl.io.nestedwbDataId.valid - releaseBuf.io.w(2).beat_sel := Fill(beatSize, 1.U(1.W)) - releaseBuf.io.w(2).data := mainPipe.io.nestedwbData - releaseBuf.io.w(2).id := mshrCtl.io.nestedwbDataId.bits + // priority: nested-ReleaseData / probeAckData [NEW] > mainPipe DS rdata 
[OLD] + // 0/1 might happen at the same cycle with 2 + releaseBuf.io.w(0).valid := mshrCtl.io.nestedwbDataId.valid + releaseBuf.io.w(0).beat_sel := Fill(beatSize, 1.U(1.W)) + releaseBuf.io.w(0).data := mainPipe.io.nestedwbData + releaseBuf.io.w(0).id := mshrCtl.io.nestedwbDataId.bits + releaseBuf.io.w(1) <> sinkC.io.releaseBufWrite + releaseBuf.io.w(1).id := mshrCtl.io.releaseBufWriteId + releaseBuf.io.w(2) <> mainPipe.io.releaseBufWrite refillBuf.io.w(0) <> refillUnit.io.refillBufWrite - refillBuf.io.w(1) <> mainPipe.io.refillBufWrite + refillBuf.io.w(1) <> sinkC.io.refillBufWrite + refillBuf.io.w(2) <> mainPipe.io.refillBufWrite sourceC.io.in <> mainPipe.io.toSourceC @@ -146,7 +160,7 @@ class Slice()(implicit p: Parameters) extends L2Module { /* connect downward channels */ io.out.a <> outBuf.a(mshrCtl.io.sourceA) - reqArb.io.sinkB <> outBuf.b(io.out.b) + sinkB.io.b <> outBuf.b(io.out.b) io.out.c <> outBuf.c(sourceC.io.out) refillUnit.io.sinkD <> outBuf.d(io.out.d) io.out.e <> outBuf.e(refillUnit.io.sourceE) @@ -184,6 +198,9 @@ class Slice()(implicit p: Parameters) extends L2Module { if (cacheParams.enableMonitor) { val monitor = Module(new Monitor()) - mainPipe.io.toMonitor <> monitor.io.fromMainPipe + monitor.io.fromMainPipe <> mainPipe.io.toMonitor +// monitor.io.nestedWBValid := mshrCtl.io.nestedwbDataId.valid + } else { + mainPipe.io.toMonitor <> DontCare } } diff --git a/src/main/scala/coupledL2/SourceC.scala b/src/main/scala/coupledL2/SourceC.scala index 3aff4d7f..fa259ce0 100644 --- a/src/main/scala/coupledL2/SourceC.scala +++ b/src/main/scala/coupledL2/SourceC.scala @@ -175,7 +175,7 @@ class SourceC(implicit p: Parameters) extends L2Module { } } - TLArbiter.lowest(edgeIn, io.out, out_bundles:_*) + TLArbiter.robin(edgeIn, io.out, out_bundles:_*) io.in.ready := !full assert(!full, "SourceC should never be full") @@ -183,6 +183,7 @@ class SourceC(implicit p: Parameters) extends L2Module { val (first, last, done, count) = edgeOut.count(io.out) val isRelease = io.out.bits.opcode === TLMessages.Release val isReleaseData = io.out.bits.opcode === TLMessages.ReleaseData + // [LRelease] TODO: resp from SourceC indicating w_release_sent may be deprecated io.resp.valid := io.out.fire() && first && (isRelease || isReleaseData) io.resp.mshrId := io.out.bits.source io.resp.set := parseFullAddress(io.out.bits.address)._2 diff --git a/src/main/scala/coupledL2/TopDownMonitor.scala b/src/main/scala/coupledL2/TopDownMonitor.scala index ac1960ab..443f8cae 100644 --- a/src/main/scala/coupledL2/TopDownMonitor.scala +++ b/src/main/scala/coupledL2/TopDownMonitor.scala @@ -25,8 +25,8 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { case(slice, i) => slice.map { ms => - val msBlockAddr = if(bankBits == 0) Cat(ms.bits.tag, ms.bits.set) - else Cat(ms.bits.tag, ms.bits.set, i.U(bankBits-1, 0)) + val msBlockAddr = if(bankBits == 0) Cat(ms.bits.reqTag, ms.bits.set) + else Cat(ms.bits.reqTag, ms.bits.set, i.U(bankBits-1, 0)) val pBlockAddr = (pAddr.bits >> 6.U).asUInt val isMiss = ms.valid && ms.bits.is_miss @@ -38,6 +38,7 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module { XSPerfAccumulate(cacheParams, perfName, addrMatch) ExcitingUtils.addSource(addrMatch, perfName, ExcitingUtils.Perf) + ExcitingUtils.addSink(WireDefault(addrMatch), perfName, ExcitingUtils.Perf) } /* ====== PART TWO ====== diff --git a/src/main/scala/coupledL2/debug/Monitor.scala b/src/main/scala/coupledL2/debug/Monitor.scala index 7ddac1cc..4405c7f6 100644 --- 
a/src/main/scala/coupledL2/debug/Monitor.scala
+++ b/src/main/scala/coupledL2/debug/Monitor.scala
@@ -41,9 +41,12 @@ class CPL2S3Info(implicit p: Parameters) extends L2Bundle {
class Monitor(implicit p: Parameters) extends L2Module {
val io = IO(new Bundle() {
val fromMainPipe = Input(new MainpipeMoni())
+// val nestedWBValid = Input(Bool())
})

val mp = io.fromMainPipe
+ val s2_valid = mp.task_s2.valid
+ val req_s2 = mp.task_s2.bits
val s3_valid = mp.task_s3.valid
val req_s3 = mp.task_s3.bits
val mshr_req_s3 = req_s3.mshrTask
@@ -51,24 +54,35 @@ class Monitor(implicit p: Parameters) extends L2Module {
val meta_s3 = mp.dirResult_s3.meta

/* ======== MainPipe Assertions ======== */
- assert(!(s3_valid && req_s3.fromC && !dirResult_s3.hit),
- "C Release should always hit, Tag %x Set %x",
- req_s3.tag, req_s3.set)
+ // ! Release w/o data will not trigger nestedWBValid, either
+ // ! consider using mshrs.map(_.io.nestedwb_match) and passing it to Monitor, if necessary
+// val c_notHit = s3_valid && req_s3.fromC && !dirResult_s3.hit
+// val c_noNested = !io.nestedWBValid
+// assert(RegNext(!(c_notHit && c_noNested)),
+// "C Release should always hit or have some MSHR meta nested, Tag %x Set %x",
+// req_s3.tag, req_s3.set)

assert(RegNext(!(s3_valid && !mshr_req_s3 && dirResult_s3.hit &&
meta_s3.state === TRUNK && !meta_s3.clients.orR)),
"Trunk should have some client hit")

+ assert(RegNext(!(s3_valid && req_s3.fromC && dirResult_s3.hit &&
+ !meta_s3.clients.orR)),
+ "Invalid Client should not send Release")
+
// assertion for set blocking
- // make sure we don't send two reqs continuously with the same set
- assert(!(mp.task_s2.bits.set === mp.task_s3.bits.set &&
- mp.task_s2.valid && !mp.task_s2.bits.mshrTask && mp.task_s2.bits.fromA &&
- mp.task_s3.valid && !mp.task_s3.bits.mshrTask && mp.task_s3.bits.fromA),
- "s2 and s3 task same set, failed in blocking")
-
- assert(!(mp.task_s2.bits.set === mp.task_s4.bits.set &&
- mp.task_s2.valid && !mp.task_s2.bits.mshrTask && mp.task_s2.bits.fromA &&
- mp.task_s4.valid && !mp.task_s4.bits.mshrTask && mp.task_s4.bits.fromA))
+ // an A-channel task @s1 never has a same-set task @s2/s3,
+ // to ensure that meta written can be read by the chnTask
+// assert(RegNext(!(mp.task_s2.bits.set === mp.task_s3.bits.set &&
+// s2_valid && !req_s2.mshrTask && s3_valid)),
+// "chnTask-s2 and s3 same set, failed in blocking")
+//
+// assert(RegNext(!(mp.task_s2.bits.set === RegNext(mp.task_s3.bits.set) &&
+// s2_valid && !req_s2.mshrTask && RegNext(s3_valid))),
+// "chosen-chnTask-s1 and s3 task same set, failed in blocking")
+
+// TODO: whether mshrGrant also needs such blocking, since it reads dir as well

/* ======== ChiselDB ======== */
// assert(cacheParams.hartIds.length == 1, "private L2 should have one and only one hartId")
diff --git a/src/test/scala/TestTop.scala b/src/test/scala/TestTop.scala
index 95135589..92a76a27 100644
--- a/src/test/scala/TestTop.scala
+++ b/src/test/scala/TestTop.scala
@@ -8,7 +8,8 @@ import freechips.rocketchip.diplomacy._
import freechips.rocketchip.tilelink._
import huancun._
import coupledL2.prefetch._
-import utility.{ChiselDB, FileRegisters}
+import utility.{ChiselDB, FileRegisters, TLLogger}
+
import scala.collection.mutable.ArrayBuffer
@@ -135,7 +136,7 @@ class TestTop_L2L3()(implicit p: Parameters) extends LazyModule {
CacheParameters(
name = s"l2",
sets = 128,
- ways = 4,
+ ways = 4 + 2,
blockGranularity = log2Ceil(128)
),
),
@@ -292,7 +293,8 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule {
ways = 4,
sets = 128,
clientCaches = Seq(L1Param(aliasBitsOpt = Some(2))),
clientCaches = Seq(L1Param(aliasBitsOpt = Some(2))), - echoField = Seq(DirtyField()) + echoField = Seq(DirtyField()), + hartIds = Seq{i} ) }))).node) @@ -307,7 +309,7 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { CacheParameters( name = s"l2", sets = 128, - ways = 4, + ways = 4 + 2, blockGranularity = log2Ceil(128) ), ), @@ -319,12 +321,12 @@ class TestTop_L2L3L2()(implicit p: Parameters) extends LazyModule { val xbar = TLXbar() val ram = LazyModule(new TLRAM(AddressSet(0, 0xffffL), beatBytes = 32)) - l1d_nodes.zip(l2_nodes).map { - case (l1d, l2) => l2 := TLBuffer() := l1d + l1d_nodes.zip(l2_nodes).zipWithIndex map { + case ((l1d, l2), i) => l2 := TLLogger(s"L2_L1_${i}", true) := TLBuffer() := l1d } - for (l2 <- l2_nodes) { - xbar := TLBuffer() := l2 + l2_nodes.zipWithIndex map { + case(l2, i) => xbar := TLLogger(s"L3_L2_${i}", true) := TLBuffer() := l2 } ram.node := @@ -424,7 +426,7 @@ class TestTop_fullSys()(implicit p: Parameters) extends LazyModule { CacheParameters( name = s"l2", sets = 128, - ways = 4, + ways = 4 + 2, blockGranularity = log2Ceil(128) ), ), @@ -545,4 +547,4 @@ object TestTop_fullSys extends App { ChiselDB.init(false) ChiselDB.addToFileRegisters FileRegisters.write("./build") -} \ No newline at end of file +}