Skip to content

Commit ed392c8

Browse files
committed
cache第二阶段所需的文件
1 parent 1a2c261 commit ed392c8

File tree

76 files changed

+1771
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+1771
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# -*- coding:utf-8 -*-
# Python2 or Python3
# Author : WangXuan
#
# Purpose: generate mem.sv for the pseudo matrix multiplication (matmul)
#          program. The generated memory is pre-loaded with the two source
#          matrices that are going to be multiplied.
#

import sys
from random import randint

# Verilog text emitted before the per-word initial assignments. The single
# %d placeholder is filled with the number of RAM words (3 * N * N).
verilog_head = '''
module mem #( //
parameter ADDR_LEN = 11 //
) (
input clk, rst,
input [ADDR_LEN-1:0] addr, // memory address
output reg [31:0] rd_data, // data read out
input wr_req,
input [31:0] wr_data // data write in
);
localparam MEM_SIZE = 1<<ADDR_LEN;
reg [0:%d-1] [31:0] ram_cell;

always @ (posedge clk or posedge rst)
if(rst)
rd_data <= 0;
else
rd_data <= ram_cell[addr];

always @ (posedge clk)
if(wr_req)
ram_cell[addr] <= wr_data;

initial begin'''

# Verilog text emitted after the initial assignments.
verilog_tail = '''end

endmodule
'''


def random_matrix(N):
    """Return an N x N list of lists filled with random 32-bit unsigned words."""
    return [[randint(0, 0xffffffff) for _ in range(N)] for _ in range(N)]


def pseudo_matmul(A, B):
    """Return the pseudo product of two square matrices of equal size.

    Each element is C[i][j] = sum over k of (A[i][k] & B[k][j]): a bitwise
    AND takes the place of the multiplication. The sums are NOT truncated
    to 32 bits here; the caller masks them when printing.
    """
    N = len(A)
    return [
        [sum(A[i][k] & B[k][j] for k in range(N)) for j in range(N)]
        for i in range(N)
    ]


def build_mem_source(N, A=None, B=None):
    """Return the text of mem.sv for N x N matrices.

    N -- matrix order.
    A, B -- optional source matrices (lists of N lists of N ints); random
            32-bit matrices are generated when they are omitted.

    Memory layout in words: destination C at 0, source A at N*N, source B
    at 2*N*N. C is initialised to zero; the expected result (masked to 32
    bits) is written next to each cell as a Verilog comment.
    """
    if A is None:
        A = random_matrix(N)
    if B is None:
        B = random_matrix(N)
    C = pseudo_matmul(A, B)

    lines = [verilog_head % (N*N*3)]
    lines.append(' // dst matrix C')
    for i in range(N):
        for j in range(N):
            lines.append(" ram_cell[%8d] = 32'h0; // 32'h%08x;" % (N*i+j, C[i][j] & 0xffffffff))
    lines.append(' // src matrix A')
    for i in range(N):
        for j in range(N):
            lines.append(" ram_cell[%8d] = 32'h%08x;" % (N*N+N*i+j, A[i][j]))
    lines.append(' // src matrix B')
    for i in range(N):
        for j in range(N):
            lines.append(" ram_cell[%8d] = 32'h%08x;" % (2*N*N+N*i+j, B[i][j]))
    lines.append(verilog_tail)
    return '\n'.join(lines)


def main(argv):
    """Command-line entry point; returns the process exit status.

    argv -- full argument vector (argv[0] is the program name).

    The generated Verilog goes to stdout so it can be redirected to a file.
    Error messages go to stderr so they never end up inside that file.
    """
    if len(argv) != 2:
        print(' Usage:\n python generate_mem_for_matmul.py [matrix size]')
        print(' Example:\n python generate_mem_for_matmul.py 16')
        print(' Tip: use this command to write to file:\n python generate_mem_for_matmul.py 16 > mem.sv')
        return 0

    try:
        N = int(argv[1])
    except ValueError:
        # Only a malformed number is expected here; anything else should propagate.
        sys.stderr.write(' *** Error: parameter must be integer, not %s\n' % (argv[1], ))
        return -1
    if N <= 1:
        sys.stderr.write(' *** Error: parameter must be larger than 1, not %d\n' % (N, ))
        return -1

    print(build_mem_source(N))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
87+
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# -*- coding:utf-8 -*-
# Python2 or Python3
# Author : WangXuan
#
# Purpose: generate mem.sv for the quicksort program. The generated memory
#          is pre-loaded with the data that is going to be sorted.
#

import sys
from random import shuffle

# Verilog text emitted before the per-word initial assignments.
verilog_head = '''
module mem #( //
parameter ADDR_LEN = 11 //
) (
input clk, rst,
input [ADDR_LEN-1:0] addr, // memory address
output reg [31:0] rd_data, // data read out
input wr_req,
input [31:0] wr_data // data write in
);
localparam MEM_SIZE = 1<<ADDR_LEN;
reg [0:MEM_SIZE-1] [31:0] ram_cell;

always @ (posedge clk or posedge rst)
if(rst)
rd_data <= 0;
else
rd_data <= ram_cell[addr];

always @ (posedge clk)
if(wr_req)
ram_cell[addr] <= wr_data;

initial begin'''

# Verilog text emitted after the initial assignments.
verilog_tail = '''end

endmodule
'''


def build_mem_source(N, lst=None):
    """Return the text of mem.sv holding N words to be sorted.

    N -- number of words to initialise, starting at word address 0.
    lst -- optional sequence of at least N ints to store; when omitted, a
           random permutation of 0..N-1 is used.
    """
    if lst is None:
        lst = list(range(N))
        shuffle(lst)

    lines = [verilog_head]
    for i in range(N):
        lines.append(" ram_cell[%8d] = 32'h%08x;" % (i, lst[i]))
    lines.append(verilog_tail)
    return '\n'.join(lines)


def main(argv):
    """Command-line entry point; returns the process exit status.

    argv -- full argument vector (argv[0] is the program name).

    The generated Verilog goes to stdout so it can be redirected to a file.
    Error messages go to stderr so they never end up inside that file.
    """
    if len(argv) != 2:
        # The argument is the length of the array to sort, not a matrix size.
        print(' Usage:\n python generate_mem_for_quicksort.py [array size]')
        print(' Example:\n python generate_mem_for_quicksort.py 16')
        print(' Tip: use this command to write to file:\n python generate_mem_for_quicksort.py 16 > mem.sv')
        return 0

    try:
        N = int(argv[1])
    except ValueError:
        # Only a malformed number is expected here; anything else should propagate.
        sys.stderr.write(' *** Error: parameter must be integer, not %s\n' % (argv[1], ))
        return -1
    if N <= 2:
        sys.stderr.write(' *** Error: parameter must be larger than 2, not %d\n' % (N, ))
        return -1

    print(build_mem_source(N))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
63+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Pseudo matrix multiplication -- assembly code
# Our RV32I CPU does not implement a multiply instruction, so this "pseudo" matrix multiplication
# combines each pair of elements with a bitwise AND (in place of the multiplication) and sums the
# results with an ordinary add.
# It is not a real matrix multiplication, but it mimics the RAM access pattern of one, which is
# what matters when studying cache performance.
#

.org 0x0
.global _start
_start:
    # a4 sets the problem size: the matrices are N*N with N = 2^a4 (a4 = 4 gives 16x16 square
    # matrices). The value may be changed, but the memory layout of the data RAM has to change
    # with it, otherwise the program will not produce the correct result.
    xori a4, zero, 4

    # The following instructions compute the start addresses of the three matrices
    # (destination, source 1, source 2).
    # The matrices are stored back to back in memory. For example, with a4 = 4, N = 16 and each
    # matrix takes N*N = 256 words, i.e. 1024 bytes,
    # so the destination matrix starts at 0, source matrix 1 at 1024 and source matrix 2 at 2048.
    # a2 holds the destination base, a0 the source 1 base and a1 the source 2 base.
    xori a3, zero, 4
    sll a3, a3 , a4                 # a3 = 4 << a4 = 4*N: bytes per matrix row, also used as the loop bound
    xor a2, zero, zero              # a2 = 0
    sll a0, a3 , a4                 # a0 = a3 << a4 = 4*N*N
    add a1, a0 , a0                 # a1 = 2 * a0

    # Start of the pseudo multiplication: c_{ij} = \sum_k (a_{ik} & b_{kj}).
    # Loop nesting from outermost to innermost is i, j, k, held in t0, t1, t2 respectively.
    # Each index is kept as a byte offset: it advances by 4 and is compared against a3.
    xor t0, zero, zero
MatMulLoopI:
    xor t1, zero, zero
MatMulLoopJ:
    xor t3, zero, zero              # t3 accumulates the innermost sum; clear it first
    xor t2, zero, zero
MatMulLoopK:
    sll t4, t0, a4
    add t4, t4, t2
    add t4, t4, a0                  # t4 = a0 + (t0 << a4) + t2: address of a[i][k]
    lw t4, (t4)
    sll t5, t2, a4
    add t5, t5, t1
    add t5, t5, a1                  # t5 = a1 + (t2 << a4) + t1: address of b[k][j]
    lw t5, (t5)
    and t4, t4, t5                  # bitwise AND stands in for the multiplication
    add t3, t3, t4                  # accumulate into the running sum
    addi t2, t2, 4
    blt t2, a3, MatMulLoopK
    sll t4, t0, a4
    add t4, t4, t1
    add t4, t4, a2                  # t4 = a2 + (t0 << a4) + t1: address of c[i][j]
    sw t3, (t4)
    addi t1, t1, 4
    blt t1, a3, MatMulLoopJ
    addi t0, t0, 4
    blt t0, a3, MatMulLoopI

    # Computation finished: spin forever.
InfLoop:
    jal zero, InfLoop
53+
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# Overview: in-place quick sort of an array
# Author: WangXuan

.org 0x0
.global _start
_start:

main:
    # Problem size: 0x100 means 0x100 = 256 numbers are sorted.
    # xori (immediate form) is spelled out instead of relying on an assembler alias for
    # "xor rd, rs, imm".
    xori a3, zero, 0x100

    lui sp, 0x00001                 # set the stack pointer: sp = 0x1000

    xor a0, zero, zero              # argument a0 = 0: the array to sort starts at RAM address 0
    xor a1, zero, zero              # argument a1 = 0: sorting starts from word 0
    addi a2, a3 , -1
    slli a2, a2 , 2                 # argument a2 = byte offset of the last element. Sorting 0x100 = 256 numbers, the last one is at 0x3fc

    jal ra , QuickSort              # start sorting
infinity_loop:
    jal zero, infinity_loop         # sorting finished: spin forever

QuickSort:
    # Function QuickSort: in-place ascending quick sort of the array based at a0; a1 is start
    #   (first offset) and a2 is end (last offset). Both are byte offsets from a0.
    # Example: a0 = 0x00000100, a1 = 0, a2 = 31*4 sorts the 32 words starting at 0x00000100.
    # Note: values are compared as signed numbers. E.g. 0xffffffff sorts before 0x00000001,
    #   because 0xffffffff represents -1, which is smaller than 1.
    #   The low 13 bits are used because a 13-bit binary number ranges over 0..8191, which never
    #   exceeds 4 decimal digits.
    #   NOTE(review): nothing in this routine masks 13 bits; the remark above looks like a
    #   leftover from another program -- confirm.
    # Data RAM modified: besides the array being sorted, a stack addressed through sp is used.
    #   Its depth depends on the length being sorted; set sp sensibly before the call to avoid
    #   overflowing the stack.
    # Registers modified: t0, t1, t2, t3, t4

    bge a1, a2, QuickSortReturn     # if a1>=a2, end<=start, jump to return
    or t1, a1, zero                 # t1=i=a1=start
    or t2, a2, zero                 # t2=j=a2=end
    add t0, a0, t1                  #
    lw t0, (t0)                     # t0=key=lst[start]

PartationStart:
PartationFirstStart:                # start of for loop
    bge t1, t2, PartationEnd        # if i>=j, branch to next step
    add t3, a0, t2                  #
    lw t3, (t3)                     # t3=lst[j]
    blt t3, t0, PartationFirstEnd   # if lst[j]<key, branch to next step
    addi t2, t2, -4                 # t2-=4 j--
    jal zero, PartationFirstStart   # for loop
PartationFirstEnd:                  # end of for loop
    add t4 , a0, t1                 # t4=lst+i
    sw t3 , (t4)                    # lst[i] = t3 = lst[j]

PartationSecondStart:               # start of for loop
    bge t1, t2, PartationEnd        # if i>=j, branch to next step
    add t3, a0, t1                  #
    lw t3, (t3)                     # t3=lst[i]
    blt t0, t3, PartationSecondEnd  # if key<lst[i], branch to next step
    addi t1, t1, 4                  # t1+=4 i++
    jal zero, PartationSecondStart  # for loop
PartationSecondEnd:                 # end of for loop
    add t4 , a0, t2                 # t4=lst+j
    sw t3 , (t4)                    # lst[j] = t3 = lst[i]

    blt t1, t2, PartationStart      # if t1<t2, branch to while start
PartationEnd:

    add t4 , a0, t1                 # t4=lst+i
    sw t0 , (t4)                    # lst[i] = t0 = key

    addi sp, sp, -4                 # sp-=4
    sw ra, (sp)                     # mem[sp] = ra # push ra to stack
    addi sp, sp, -4                 # sp-=4
    sw a1, (sp)                     # mem[sp] = a1 # push a1 to stack, save start
    addi sp, sp, -4                 # sp-=4
    sw a2, (sp)                     # mem[sp] = a2 # push a2 to stack, save end
    addi sp, sp, -4                 # sp-=4
    sw t1, (sp)                     # mem[sp] = t1 # push t1 to stack, save i
    addi a2, t1, -4                 # a2 = i-4, a parameter for recursive call
    jal ra , QuickSort
    lw t1, (sp)                     # pop i from stack
    addi sp, sp, 4                  # sp+=4
    lw a2, (sp)                     # pop end from stack
    addi sp, sp, 4                  # sp+=4
    lw a1, (sp)                     # read start back; sp is not advanced, so start stays on the stack

    addi sp, sp, -4                 # sp-=4
    sw a2, (sp)                     # mem[sp] = a2 # push a2 to stack, save end
    addi sp, sp, -4                 # sp-=4
    sw t1, (sp)                     # mem[sp] = t1 # push t1 to stack, save i
    addi a1, t1, 4                  # a1 = i+4, a parameter for recursive call
    jal ra , QuickSort
    lw t1, (sp)                     # pop i from stack
    addi sp, sp, 4                  # sp+=4
    lw a2, (sp)                     # pop end from stack
    addi sp, sp, 4                  # sp+=4
    lw a1, (sp)                     # pop start from stack
    addi sp, sp, 4                  # sp+=4
    lw ra, (sp)                     # pop ra from stack
    addi sp, sp, 4                  # sp+=4

QuickSortReturn:                    # end of the function
    jalr zero, ra, 0                # return





#
# Equivalent C code of the QuickSort function:
# void QuickSort(int *lst, int start, int end){
#     if(end>start){
#         int i = start,j = end,key = lst[start];
#         while(i < j){
#             for (;i < j && key <= lst[j];j--);
#             lst[i] = lst[j];
#             for (;i < j && key >= lst[i];i++);
#             lst[j] = lst[i];
#         }
#         lst[i] = key;
#         QuickSort(lst, start, i - 1);
#         QuickSort(lst, i + 1, end);
#     }
# }
#
#
120+

0 commit comments

Comments
 (0)