Skip to content

Commit 235387a

Browse files
authored
Merge pull request OSCPU#31 from OpenXiangShan/support-put
Support put
2 parents d539a3e + f27efe2 commit 235387a

File tree

7 files changed

+282
-90
lines changed

7 files changed

+282
-90
lines changed

src/main/scala/huancun/Common.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class SourceDReq(implicit p: Parameters) extends InnerTask with HasChannelBits {
6161
val size = UInt(msgSizeBits.W)
6262
val way = UInt(wayBits.W)
6363
val off = UInt(offsetBits.W)
64+
val bufIdx = UInt(bufIdxBits.W)
6465
val denied = Bool()
6566
val sinkId = UInt(mshrBits.W)
6667
val dirty = Bool()
@@ -72,7 +73,9 @@ class SourceAReq(implicit p: Parameters) extends HuanCunBundle {
7273
val opcode = UInt(3.W)
7374
val param = UInt(3.W)
7475
val source = UInt(mshrBits.W)
76+
val bufIdx = UInt(bufIdxBits.W)
7577
val needData = Bool()
78+
val putData = Bool()
7679
}
7780
class SourceCReq(implicit p: Parameters) extends HuanCunBundle {
7881
val opcode = UInt(3.W)
@@ -187,4 +190,16 @@ class SourceDHazard(implicit p: Parameters) extends HuanCunBundle {
187190
class ReplacerInfo() extends Bundle {
188191
val channel = UInt(3.W)
189192
val opcode = UInt(3.W)
193+
}
194+
195+
/** Request bundle used to read ("pop") one beat out of SinkA's put buffer.
  *
  * SinkA uses `bufIdx` and `count` to index its `putBuffer`, and clears the
  * per-beat valid bits of entry `bufIdx` when a request fires with `last` set.
  */
class PutBufferPop(implicit p: Parameters) extends HuanCunBundle {
196+
// Index of the put-buffer entry (block) to read from.
val bufIdx = UInt(bufIdxBits.W)
197+
// Beat number inside the selected entry.
val count = UInt(beatBits.W)
198+
// Set on the final pop of an entry; SinkA then invalidates every beat of that entry.
val last = Bool()
199+
}
200+
201+
/** One beat of Put payload stored in SinkA's put buffer.
  *
  * SinkA fills the fields from the `data`, `mask` and `corrupt` fields of the
  * incoming A-channel beat.
  */
class PutBufferBeatEntry(implicit p: Parameters) extends HuanCunBundle {
202+
// Beat payload, `beatBytes` bytes wide.
val data = UInt((beatBytes * 8).W)
203+
// One mask bit per payload byte.
val mask = UInt(beatBytes.W)
204+
// Corrupt flag copied from the A-channel beat.
val corrupt = Bool()
190205
}

src/main/scala/huancun/SinkA.scala

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,50 @@ class SinkA(implicit p: Parameters) extends HuanCunModule {
2929
val a = Flipped(DecoupledIO(new TLBundleA(edgeIn.bundle)))
3030
val alloc = DecoupledIO(new MSHRRequest)
3131
val task = Flipped(DecoupledIO(new SinkAReq))
32+
// SourceD
33+
val d_pb_pop = Flipped(DecoupledIO(new PutBufferPop))
34+
val d_pb_beat = Output(new PutBufferBeatEntry)
35+
// SourceA
36+
val a_pb_pop = Flipped(DecoupledIO(new PutBufferPop))
37+
val a_pb_beat = Output(new PutBufferBeatEntry)
3238
})
3339

34-
// TODO: Handle task
40+
// TODO: Is the task port necessary for SinkA?
3541
io.task.ready := false.B
3642

3743
val a = io.a
38-
val first = edgeIn.first(a)
44+
val (first, last, done, count) = edgeIn.count(a)
3945
val hasData = edgeIn.hasData(a.bits)
40-
when(a.valid) {
41-
assert(!hasData)
46+
47+
val beats = blockBytes / beatBytes
48+
val putBuffer = Reg(Vec(bufBlocks, Vec(beats, new PutBufferBeatEntry())))
49+
val beatVals = RegInit(VecInit(Seq.fill(bufBlocks) {
50+
VecInit(Seq.fill(beats) { false.B })
51+
}))
52+
val bufVals = VecInit(beatVals.map(_.asUInt().orR())).asUInt()
53+
val full = bufVals.andR()
54+
val noSpace = full && hasData
55+
val insertIdx = PriorityEncoder(~bufVals)
56+
val insertIdxReg = RegEnable(insertIdx, a.fire() && first)
57+
58+
when(a.fire() && hasData) {
59+
when(first) {
60+
putBuffer(insertIdx)(count).data := a.bits.data
61+
putBuffer(insertIdx)(count).mask := a.bits.mask
62+
putBuffer(insertIdx)(count).corrupt := a.bits.corrupt
63+
beatVals(insertIdx)(count) := true.B
64+
}.otherwise({
65+
putBuffer(insertIdxReg)(count).data := a.bits.data
66+
putBuffer(insertIdxReg)(count).mask := a.bits.mask
67+
putBuffer(insertIdxReg)(count).corrupt := a.bits.corrupt
68+
beatVals(insertIdxReg)(count) := true.B
69+
})
4270
}
71+
4372
val (tag, set, offset) = parseAddress(a.bits.address)
4473

45-
io.alloc.valid := a.valid && first
46-
a.ready := io.alloc.ready
74+
io.alloc.valid := a.valid && first && !noSpace
75+
a.ready := Mux(first, io.alloc.ready && !noSpace, true.B)
4776

4877
val allocInfo = io.alloc.bits
4978
allocInfo.channel := 1.U(3.W)
@@ -54,12 +83,24 @@ class SinkA(implicit p: Parameters) extends HuanCunModule {
5483
allocInfo.set := set
5584
allocInfo.tag := tag
5685
allocInfo.off := offset
57-
allocInfo.bufIdx := DontCare
86+
allocInfo.bufIdx := insertIdx
5887
allocInfo.needHint.foreach(_ := a.bits.user.lift(PrefetchKey).getOrElse(false.B))
5988
allocInfo.isPrefetch.foreach(_ := false.B)
6089
allocInfo.alias.foreach(_ := a.bits.user.lift(AliasKey).getOrElse(0.U))
6190
allocInfo.preferCache := a.bits.user.lift(PreferCacheKey).getOrElse(true.B)
6291
allocInfo.dirty := false.B // ignored
6392
allocInfo.fromProbeHelper := false.B
6493
allocInfo.needProbeAckData.foreach(_ := false.B)
94+
95+
io.d_pb_pop.ready := beatVals(io.d_pb_pop.bits.bufIdx).asUInt().andR()
96+
io.d_pb_beat := RegEnable(putBuffer(io.d_pb_pop.bits.bufIdx)(io.d_pb_pop.bits.count), io.d_pb_pop.fire())
97+
when(io.d_pb_pop.fire() && io.d_pb_pop.bits.last) {
98+
beatVals(io.d_pb_pop.bits.bufIdx).foreach(_ := false.B)
99+
}
100+
101+
io.a_pb_pop.ready := beatVals(io.a_pb_pop.bits.bufIdx).asUInt().andR()
102+
io.a_pb_beat := RegEnable(putBuffer(io.a_pb_pop.bits.bufIdx)(io.a_pb_pop.bits.count), io.a_pb_pop.fire())
103+
when(io.a_pb_pop.fire() && io.a_pb_pop.bits.last) {
104+
beatVals(io.a_pb_pop.bits.bufIdx).foreach(_ := false.B)
105+
}
65106
}

src/main/scala/huancun/Slice.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,4 +496,9 @@ class Slice()(implicit p: Parameters) extends HuanCunModule {
496496
sinkC.io.sourceD_r_hazard <> sourceD.io.sourceD_r_hazard
497497
sinkD.io.sourceD_r_hazard <> sourceD.io.sourceD_r_hazard
498498

499+
sinkA.io.d_pb_pop <> sourceD.io.pb_pop
500+
sinkA.io.d_pb_beat <> sourceD.io.pb_beat
501+
502+
sinkA.io.a_pb_pop <> sourceA.io.pb_pop
503+
sinkA.io.a_pb_beat <> sourceA.io.pb_beat
499504
}

src/main/scala/huancun/SourceA.scala

Lines changed: 75 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,88 @@ import chipsalliance.rocketchip.config.Parameters
2323
import chisel3._
2424
import chisel3.util._
2525
import freechips.rocketchip.tilelink._
26+
import huancun.utils.HoldUnless
2627

2728
class SourceA(edge: TLEdgeOut)(implicit p: Parameters) extends HuanCunModule {
2829
val io = IO(new Bundle() {
2930
val a = DecoupledIO(new TLBundleA(edge.bundle))
3031
val task = Flipped(DecoupledIO(new SourceAReq))
32+
// putbuffer interface
33+
val pb_pop = DecoupledIO(new PutBufferPop)
34+
val pb_beat = Input(new PutBufferBeatEntry)
3135
})
3236

3337
val a = io.a
38+
val a_acquire = Wire(a.cloneType)
39+
val a_put = Wire(a.cloneType)
40+
val beats = blockBytes / beatBytes
41+
val busy = RegInit(false.B)
42+
43+
io.task.ready := Mux(io.task.bits.putData, !busy, a_acquire.ready) // TODO: not ready until all beats of Put fire
44+
45+
when (io.task.fire() && io.task.bits.putData) {
46+
busy := true.B
47+
}
48+
49+
a_acquire.bits.opcode := io.task.bits.opcode
50+
a_acquire.bits.param := io.task.bits.param
51+
a_acquire.bits.size := offsetBits.U
52+
a_acquire.bits.source := io.task.bits.source
53+
a_acquire.bits.address := Cat(io.task.bits.tag, io.task.bits.set, 0.U(offsetBits.W))
54+
a_acquire.bits.mask := Fill(edgeOut.manager.beatBytes, 1.U(1.W))
55+
a_acquire.bits.data := DontCare
56+
a_acquire.bits.corrupt := false.B
57+
a_acquire.bits.user.lift(PreferCacheKey).map( _ := false.B)
58+
a_acquire.bits.echo.lift(DirtyKey).map(_ := true.B)
59+
a_acquire.valid := io.task.valid && !io.task.bits.putData
60+
61+
val s1_ready = Wire(Bool())
62+
val s1_full = RegInit(false.B)
63+
64+
// S0: read putBuffer
65+
val s0_task = HoldUnless(io.task.bits, io.task.fire())
66+
val s0_count = RegInit(0.U(beatBits.W))
67+
val s0_last = s0_count === (beats-1).U
68+
val s0_valid = io.pb_pop.fire()
69+
70+
io.pb_pop.valid := (io.task.valid && io.task.bits.putData || busy) && s1_ready
71+
io.pb_pop.bits.bufIdx := s0_task.bufIdx
72+
io.pb_pop.bits.count := s0_count
73+
io.pb_pop.bits.last := s0_last
74+
75+
when (io.pb_pop.fire()) {
76+
s0_count := s0_count + 1.U
77+
when (s0_last) {
78+
busy := false.B
79+
s0_count := 0.U
80+
}
81+
}
82+
83+
// S1: get putBuffer and transfer to outer A
84+
val s1_latch = s0_valid && s1_ready
85+
val s1_count = RegEnable(s0_count, s1_latch)
86+
val s1_task = RegEnable(s0_task, s1_latch)
87+
val s1_cango = Mux(a_put.valid, a_put.ready, false.B)
88+
val s1_doput = RegNext(s1_latch)
89+
val s1_pb_latch = HoldUnless(io.pb_beat, s1_doput)
90+
91+
s1_ready := s1_cango || !s1_full
92+
93+
when(s1_full && s1_cango) { s1_full := false.B }
94+
when(s1_latch) { s1_full := true.B }
95+
96+
a_put.bits.opcode := s1_task.opcode
97+
a_put.bits.param := s1_task.param
98+
a_put.bits.size := offsetBits.U
99+
a_put.bits.source := s1_task.source
100+
a_put.bits.address := Cat(s1_task.tag, s1_task.set, 0.U(offsetBits.W))
101+
a_put.bits.mask := io.pb_beat.mask
102+
a_put.bits.data := io.pb_beat.data
103+
a_put.bits.corrupt := false.B
104+
a_put.bits.user.lift(PreferCacheKey).map( _ := false.B)
105+
a_put.bits.echo.lift(DirtyKey).map(_ := true.B)
106+
a_put.valid := s1_doput
107+
108+
TLArbiter.lowest(edgeIn, io.a, a_put, a_acquire)
34109

35-
io.task.ready := a.ready
36-
a.valid := io.task.valid
37-
38-
a.bits.opcode := io.task.bits.opcode
39-
a.bits.param := io.task.bits.param
40-
a.bits.size := offsetBits.U
41-
a.bits.source := io.task.bits.source
42-
a.bits.address := Cat(io.task.bits.tag, io.task.bits.set, 0.U(offsetBits.W))
43-
a.bits.mask := Fill(edgeOut.manager.beatBytes, 1.U(1.W))
44-
a.bits.data := DontCare
45-
a.bits.corrupt := false.B
46-
a.bits.user.lift(PreferCacheKey).map( _ := false.B)
47-
a.bits.echo.lift(DirtyKey).map(_ := true.B)
48110
}

src/main/scala/huancun/SourceD.scala

Lines changed: 66 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ import chisel3._
2424
import chisel3.util._
2525
import freechips.rocketchip.tilelink._
2626
import freechips.rocketchip.tilelink.TLMessages.{AcquireBlock, AcquirePerm, ReleaseAck}
27-
import huancun.utils.SReg
27+
import huancun.utils._
28+
2829

2930
class SourceD(implicit p: Parameters) extends HuanCunModule {
3031
/*
@@ -47,16 +48,16 @@ class SourceD(implicit p: Parameters) extends HuanCunModule {
4748
val bs_wdata = Output(new DSData)
4849
// data hazards
4950
val sourceD_r_hazard = ValidIO(new SourceDHazard)
51+
// putbuffer interface
52+
val pb_pop = DecoupledIO(new PutBufferPop)
53+
val pb_beat = Input(new PutBufferBeatEntry)
5054
})
5155

52-
io.bs_waddr.valid := false.B
53-
io.bs_waddr.bits := DontCare
54-
io.bs_wdata := DontCare
55-
5656
val d = io.d
5757
val s1_valid = Wire(Bool())
5858
val s2_valid, s2_ready = Wire(Bool())
5959
val s3_valid, s3_ready = Wire(Bool())
60+
val s4_ready = Wire(Bool())
6061

6162
// stage1
6263
val busy = RegInit(false.B)
@@ -65,8 +66,10 @@ class SourceD(implicit p: Parameters) extends HuanCunModule {
6566
val s1_req = Mux(busy, s1_req_reg, io.task.bits)
6667
val s1_needData = s1_req.fromA && (
6768
s1_req.opcode === TLMessages.GrantData ||
68-
s1_req.opcode === TLMessages.AccessAckData
69+
s1_req.opcode === TLMessages.AccessAckData ||
70+
s1_req.opcode === TLMessages.AccessAck // Put should also read data TODO: no need for full-sized PutFullData
6971
)
72+
val s1_need_pb = s1_req.fromA && (s1_req.opcode === TLMessages.AccessAck)
7073
val s1_counter = RegInit(0.U(beatBits.W)) // how many beats have been sent
7174
val s1_total_beats = Mux(s1_needData, totalBeats(s1_req.size), 0.U(beatBits.W))
7275
val s1_beat = startBeat(s1_req.off) | s1_counter
@@ -126,13 +129,29 @@ class SourceD(implicit p: Parameters) extends HuanCunModule {
126129
val s2_latch = s1_valid && s2_ready
127130
val s2_req = RegEnable(s1_req, s2_latch)
128131
val s2_needData = RegEnable(s1_needData, s2_latch)
132+
val s2_last = RegEnable(s1_last, s2_latch)
133+
val s2_counter = RegEnable(s1_counter, s2_latch)
129134
val s2_full = RegInit(false.B)
130135
val s2_releaseAck = s2_req.opcode === ReleaseAck
131136
val s2_bypass_hit = RegEnable(
132137
Mux(busy, s1_bypass_hit_reg, s1_bypass_hit_wire),
133138
false.B, s2_latch
134139
)
135140
val s2_d = Wire(io.d.cloneType)
141+
val s2_need_pb = RegEnable(s1_need_pb, s2_latch)
142+
val s2_need_d = RegEnable(!s1_need_pb || s1_counter === 0.U, s2_latch) // AccessAck for Put should only be fired once
143+
val s2_valid_pb = RegInit(false.B) // put buffer is valid, wait put buffer fire
144+
val s2_pdata_raw = io.pb_beat
145+
val pb_ready = io.pb_pop.ready
146+
val s2_pdata = HoldUnless(s2_pdata_raw, s2_valid_pb)
147+
148+
io.pb_pop.valid := s2_valid_pb && s2_req.fromA
149+
io.pb_pop.bits.bufIdx := s2_req.bufIdx
150+
io.pb_pop.bits.count := s2_counter
151+
io.pb_pop.bits.last := s2_last
152+
153+
when (pb_ready) { s2_valid_pb := false.B }
154+
when (s2_latch) { s2_valid_pb := s1_need_pb }
136155

137156
s1_queue.io.deq.ready := s2_full && s2_bypass_hit && s2_d.ready
138157
s2_d.valid := s2_full && ((s2_bypass_hit && s1_queue.io.deq.valid) || !s2_needData)
@@ -146,33 +165,37 @@ class SourceD(implicit p: Parameters) extends HuanCunModule {
146165
s2_d.bits.corrupt := false.B
147166
s2_d.bits.echo.lift(DirtyKey).foreach(_ := s2_req.dirty)
148167

149-
val s2_can_go = Mux(s2_d.valid, s2_d.ready, s3_ready)
168+
val s2_can_go = Mux(s2_d.valid, s2_d.ready, s3_ready && (!s2_valid_pb || pb_ready))
150169
when(s2_full && s2_can_go) { s2_full := false.B }
151170
when(s2_latch) { s2_full := true.B }
152171

153-
s2_valid := s2_full && !s2_d.valid
172+
s2_valid := s2_full && !s2_d.valid && (!s2_valid_pb || pb_ready)
154173
s2_ready := !s2_full || s2_can_go
155174

156175
// stage3
157176
val s3_latch = s2_valid && s3_ready
158-
val s3_full = RegInit(false.B)
177+
val s3_valid_d = RegInit(false.B)
178+
159179
// wait counter for sram data
160180
val s3_wait = Reg(UInt(log2Ceil(cacheParams.sramCycleFactor).W))
161181
val s3_needData = RegInit(false.B)
162182
val s3_req = RegEnable(s2_req, s3_latch)
183+
val s3_counter = RegEnable(s2_counter, s3_latch)
184+
val s3_pdata = RegEnable(s2_pdata, s3_latch)
185+
val s3_need_pb = RegEnable(s2_need_pb, s3_latch)
163186
val s3_releaseAck = RegEnable(s2_releaseAck, s3_latch)
164187
val s3_d = Wire(io.d.cloneType)
165188
val s3_queue = Module(new Queue(new DSData, 3, flow = true))
166189
val s3_can_go = if(cacheParams.sramCycleFactor == 1) true.B else s3_wait === 0.U
167190

168-
assert(!s3_full || s3_needData, "Only data task can go to stage3!")
191+
assert(!s3_valid_d || s3_needData, "Only data task can go to stage3!")
169192

170193
when(s3_d.ready && s3_can_go) {
171-
s3_full := false.B
194+
s3_valid_d := false.B
172195
s3_needData := false.B
173196
}
174197
when(s3_latch) {
175-
s3_full := true.B
198+
s3_valid_d := s2_need_d
176199
s3_needData := s2_needData
177200
}
178201
s3_wait := Mux(s3_latch,
@@ -181,7 +204,7 @@ class SourceD(implicit p: Parameters) extends HuanCunModule {
181204
)
182205

183206
val s3_rdata = s3_queue.io.deq.bits.data
184-
s3_d.valid := s3_valid && s3_can_go
207+
s3_d.valid := s3_valid_d && s3_can_go
185208
s3_d.bits.opcode := s3_req.opcode
186209
s3_d.bits.param := Mux(s3_releaseAck, 0.U, s3_req.param)
187210
s3_d.bits.sink := s3_req.sinkId
@@ -198,10 +221,37 @@ class SourceD(implicit p: Parameters) extends HuanCunModule {
198221
), false.B)
199222
s3_queue.io.enq.bits := io.bs_rdata
200223
assert(!s3_queue.io.enq.valid || s3_queue.io.enq.ready)
201-
s3_queue.io.deq.ready := s3_d.ready && s3_needData && s3_valid && s3_can_go
224+
s3_queue.io.deq.ready := s3_d.ready && s3_needData && s3_valid && s3_can_go // TODO: inspect this
225+
226+
s3_ready := !s3_valid_d || s3_d.ready && s3_can_go
227+
s3_valid := s3_valid_d
228+
229+
// stage4
230+
val s4_latch = s3_valid && s4_ready
231+
val s4_req = RegEnable(s3_req, s4_latch)
232+
val s4_rdata = RegEnable(s3_rdata, s4_latch)
233+
val s4_pdata = RegEnable(s3_pdata, s4_latch)
234+
val s4_need_pb = RegEnable(s3_need_pb, s4_latch)
235+
val s4_beat = RegEnable(s3_counter, s4_latch)
236+
val s4_full = RegInit(false.B)
237+
238+
when (io.bs_waddr.ready || !s4_need_pb) { s4_full := false.B }
239+
when (s4_latch) { s4_full := true.B }
240+
241+
val selects = s4_pdata.mask.asBools
242+
val mergedData = Cat(selects.zipWithIndex.map { case (s, i) =>
243+
VecInit(Seq(s4_rdata, s4_pdata.data).map(_((i + 1) * 8 - 1, i * 8)))(s)
244+
}.reverse) // merge data according to mask
245+
246+
io.bs_waddr.valid := s4_full && s4_need_pb
247+
io.bs_waddr.bits.noop := false.B
248+
io.bs_waddr.bits.way := s4_req.way
249+
io.bs_waddr.bits.set := s4_req.set
250+
io.bs_waddr.bits.beat := s4_beat
251+
io.bs_waddr.bits.write := true.B
252+
io.bs_wdata.data := mergedData
202253

203-
s3_ready := !s3_valid || s3_d.ready && s3_can_go
204-
s3_valid := s3_full
254+
s4_ready := !s4_full || io.bs_waddr.ready || !s4_need_pb
205255

206256
TLArbiter.lowest(edgeIn, io.d, s3_d, s2_d)
207257

0 commit comments

Comments
 (0)