diff --git a/cic.py b/cic.py
index 26bb0d2..2906fae 100644
--- a/cic.py
+++ b/cic.py
@@ -18,6 +18,7 @@
 # ([ ] possibly rate change/reset sequencing with output hold and settling)
 # ([ ] possibly n-by-m channels (iter-by-parallel) and a single BRAM)
 
+
 class CIC(Module):
     def __init__(self, width=16, rate_width=9, order=3, channels=4):
         """
@@ -30,7 +31,7 @@ def __init__(self, width=16, rate_width=9, order=3, channels=4):
         # rate change, rate ratio is `r_output/r_input = rate + 1`
         self.rate = Signal(rate_width)
         # output right shift to account for filter gain
-        # should be ceil(order*log2(rate))
+        # should be ceil(order*log2(rate + 1))
         self.gain_shift = Signal(max=order*rate_width + 1)
         # clear combs and integrators to establish new rate
         self.rate_stb = Signal()
@@ -42,7 +43,7 @@ def __init__(self, width=16, rate_width=9, order=3, channels=4):
         self.xi = Signal(max=channels)
         # rate cycle complete
         self.x_ack = Signal()
-        
+
         # output sample for given output channel
         self.y = Signal((width, True), reset_less=True)
         # output channel
@@ -51,30 +52,30 @@ def __init__(self, width=16, rate_width=9, order=3, channels=4):
         self.y_stb = Signal()
 
         ###
-        
+
         channel = Signal(max=channels)
         rate_cnt = Signal(rate_width)
-        stb = Signal(order)
-        rst = Signal(2*order + 1)
+        we = Signal(order)
+        rst = Signal(2*order)
 
         self.sync += [
             channel.eq(channel + 1),
-            stb[1:].eq(stb),
+            we[1:].eq(we),
             rst[1:].eq(rst),
             If(channel == channels - 1,
                 channel.eq(0),
                 rate_cnt.eq(rate_cnt - 1),
-                stb[0].eq(0),
+                we[0].eq(0),
                 If(rate_cnt == 0,
                     rate_cnt.eq(self.rate),
-                    stb[0].eq(1),
+                    we[0].eq(1),
                     rst[0].eq(0),
                 ),
             ),
             If(self.rate_stb,
                 channel.eq(0),
                 rate_cnt.eq(0),
-                stb[0].eq(1),
+                we[0].eq(1),
                 rst[0].eq(1),
             )
         ]
@@ -83,8 +84,8 @@ def __init__(self, width=16, rate_width=9, order=3, channels=4):
         comb = [width + n for n in range(order)]
         integ = [width + order + (rate_width - 1)*(n + 1) for n in range(order)]
 
-        comb_r = [Signal((w, True), reset_less=True) for w in comb]
-        integ_r = [Signal((w, True), reset_less=True) for w in integ]
+        comb_r = [Signal((w, True)) for w in comb]
+        integ_r = [Signal((w, True)) for w in integ]
         comb_w = [Signal((w, True), reset_less=True) for w in comb]
         integ_w = [Signal((w, True), reset_less=True) for w in integ]
 
@@ -92,28 +93,26 @@ def __init__(self, width=16, rate_width=9, order=3, channels=4):
         mem_r = mem.get_port()
         mem_w = mem.get_port(write_capable=True, we_granularity=1)
         self.specials += mem, mem_r, mem_w
-        
-        # for the integrators for a given channel, read is 2 cycles ahead of write:
+
+        # for the integrators for a given channel, read is 2 cycles ahead of
+        # write:
         #   0: read addr; 1: integ_r and old z, 2; new z and comb_w write-back
         # for the combs there would only be one cycle:
         #   0: read addr; 1: comb_r, old z, and comb_w write-back; 2: new z
         # add one delay register at the read port to match the integrator
         # read-write pointer spacing:
-        #   0: read addr; 1: mem dat_r; 2: comb_r, old z, and comb_w write-back; 3: new z
-        # alternatively try:
-        #   0: read addr; 1: comb_r, old z, and comb_w1; 3: new z and comb_w write-back
-        self.sync += [
-            Cat(comb_r).eq(mem_r.dat_r[:sum(comb)]),  
-        ]
+        #   0: read addr; 1: mem_dat_r; 2: comb_r, old z, and comb_w write-back; 3: new z
+        # or delay at the write port:
+        #   0: read addr; 1: comb_r, old z, and z1 store; 2: new z and comb_w write-back
         self.comb += [
-            Cat(integ_r).eq(mem_r.dat_r[sum(comb):]),
+            Cat(comb_r, integ_r).eq(mem_r.dat_r),
             mem_r.adr.eq(channel + 2),
             mem_w.dat_w.eq(Cat(comb_w, integ_w)),
             mem_w.adr.eq(channel),
-            mem_w.we.eq(Cat([Replicate(stb[n], w) for n, w in enumerate(comb)],
+            mem_w.we.eq(Cat([Replicate(we[n], w) for n, w in enumerate(comb)],
                             Replicate(1, sum(integ)))),
             self.xi.eq(channel),
-            self.x_ack.eq(stb[0]),
+            self.x_ack.eq(we[0]),
             self.yi.eq(channel - 2*order),  # 2*order pipeline latency
             self.y_stb.eq(1),
         ]
@@ -122,8 +121,10 @@ def __init__(self, width=16, rate_width=9, order=3, channels=4):
         for i, (cr, cw) in enumerate(zip(comb_r, comb_w)):
             self.comb += cw.eq(z)
             z = Signal((len(cw) + 1, True), reset_less=True)
+            z0 = Signal((len(cr), True), reset_less=True)
             self.sync += [
-                z.eq(cw - cr),
+                z0.eq(cr),
+                z.eq(cw - z0),
                 If(rst[i],
                     z.eq(0),
                 ),
@@ -131,9 +132,9 @@ def __init__(self, width=16, rate_width=9, order=3, channels=4):
         for i, (ir, iw) in enumerate(zip(integ_r, integ_w)):
             self.sync += [
                 iw.eq(ir + z),
-                If(rst[order + 1 + i],
+                If(rst[order + i],
                     iw.eq(0),
                 ),
             ]
             z = iw
-        self.comb += self.y.eq(z >> self.gain_shift)
+        self.comb += self.y.eq(z >> self.gain_shift)
\ No newline at end of file
diff --git a/fastino_phy.py b/fastino_phy.py
index 4cbf80c..7c8cdf7 100644
--- a/fastino_phy.py
+++ b/fastino_phy.py
@@ -324,6 +324,56 @@ def __init__(self, n_frame=14):
         ]
 
 
+class Interpolator(Module):
+    """Interpolate `n_channels` sample streams with two parallel CIC banks.
+
+    Each `CIC` instance handles `n_channels//2` channels: `cic0` the lower
+    half of `x`/`y` and `cic1` the upper half.
+
+    Attributes:
+        rate: log2 of the rate change ratio; the CIC `rate` is set to
+            `2**rate - 1`. The lookup table covers 0..16.
+            NOTE(review): confirm larger values are never written.
+        x: list of `n_channels` signed `n_bits` input samples.
+        y: list of `n_channels` signed `n_bits` output samples.
+    """
+    def __init__(self, n_channels=32, n_bits=16):
+        self.rate = Signal(5, reset=16)
+        half = n_channels//2
+        # explicit parentheses: `-` binds tighter than `<<`
+        rate_width = 1 << (len(self.rate) - 1)
+        self.submodules.cic0 = CIC(width=n_bits, order=3,
+                rate_width=rate_width, channels=half)
+        self.submodules.cic1 = CIC(width=n_bits, order=3,
+                rate_width=rate_width, channels=half)
+        # previous-cycle copy of `rate`, used to detect rate changes
+        rate0 = Signal.like(self.rate)
+        self.sync += [
+            rate0.eq(self.rate),
+            self.cic0.rate.eq(
+                Array([(1 << i) - 1 for i in range(17)])[self.rate]),
+            # order*log2(cic rate + 1) with cic rate + 1 == 2**rate
+            self.cic0.gain_shift.eq(3*self.rate),
+            # single-cycle strobe whenever `rate` changed so the CICs
+            # clear their state and are released again afterwards
+            self.cic0.rate_stb.eq(self.rate != rate0),
+        ]
+        self.comb += [
+            self.cic1.rate_stb.eq(self.cic0.rate_stb),
+            self.cic1.rate.eq(self.cic0.rate),
+            self.cic1.gain_shift.eq(self.cic0.gain_shift),
+        ]
+        self.x = [Signal((n_bits, True)) for _ in range(n_channels)]
+        self.y = [Signal((n_bits, True)) for _ in range(n_channels)]
+        # registered per-channel multiplexing of inputs and outputs
+        self.sync += [
+            self.cic0.x.eq(Array(self.x[:half])[self.cic0.xi]),
+            self.cic1.x.eq(Array(self.x[half:])[self.cic1.xi]),
+            Array(self.y[:half])[self.cic0.yi].eq(self.cic0.y),
+            Array(self.y[half:])[self.cic1.yi].eq(self.cic1.y),
+        ]
+
+
 class MultiSPI(Module):
     """Multi-bus SPI streamer"""
     def __init__(self, platform, n_channels=32, n_bits=16):
@@ -332,18 +362,6 @@ def __init__(self, platform, n_channels=32, n_bits=16):
         self.data = Signal(n)
         self.stb = Signal()
 
-        spi_cic = ClockDomainsRenamer("spi")(CIC)
-        self.submodules.cic0 = spi_cic(order=3, rate_width=16, channels=16)
-        self.submodules.cic1 = spi_cic(order=3, rate_width=16, channels=16)
-        self.comb += [
-            self.cic0.rate.eq((1 << 16) - 1),
-            self.cic0.gain_shift.eq(3*16),
-            self.cic1.rate.eq(self.cic0.rate),
-            self.cic1.gain_shift.eq(self.cic0.gain_shift),
-        ]
-        data_a = [Signal((n_bits, True)) for _ in range(32)]
-        data_b = [Signal((n_bits, True)) for _ in range(32)]
-
         spi = [platform.request("dac", i) for i in range(32)]
 
         self.busy = Signal()
@@ -362,16 +380,11 @@ def __init__(self, platform, n_channels=32, n_bits=16):
                 enable.eq(0),
                 self.busy.eq(0),
             ),
-            self.cic0.x.eq(Array(data_a[:16])[self.cic0.xi]),
-            self.cic1.x.eq(Array(data_a[16:])[self.cic1.xi]),
-            Array(data_b[:16])[self.cic0.yi].eq(self.cic0.y),
-            Array(data_b[16:])[self.cic1.yi].eq(self.cic1.y),
             If(self.busy,
                 i.eq(i + 1),
             ).Elif(self.stb,
                 self.busy.eq(1),
-                Cat(enable, data_a).eq(self.data),
-                Cat(sr).eq(Cat(data_b)),
+                Cat(enable, sr).eq(self.data),
             ),
             enable0.eq(enable),
         ]
@@ -550,6 +563,8 @@ def __init__(self, platform):
             AsyncResetSynchronizer(cd_spi, ~locked),
         ]
 
+        self.submodules.int = ClockDomainsRenamer("spi")(Interpolator)()
+
         self.submodules.spi = MultiSPI(platform)
 
         assert len(cfg) + len(adr) + len(self.spi.data) == len(self.frame.body)
@@ -558,7 +573,11 @@ def __init__(self, platform):
         # max data delay sys-spi < min sys-spi clock delay over all alignments
         self.comb += [
             self.spi.stb.eq(self.frame.stb),
-            self.spi.data.eq(self.frame.body[-len(self.spi.data):])
+            #self.spi.data.eq(self.frame.body[-len(self.spi.data):])
+            Cat(self.int.x).eq(self.frame.body[-len(Cat(self.int.x)):]),
+            Cat(self.spi.data).eq(Cat(
+                self.frame.body[-len(self.spi.data):][:32], self.int.y)),
+            self.int.rate.eq(cfg.reserved),
         ]
 
         self.comb += [