diff --git a/.gitignore b/.gitignore
index 1b4209a..5a75f31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,7 +23,8 @@ wheelhouse
 *.egg-info
 *.whl
 
-# Gemm Bench
+# Bench Artifacts
 gemm/vmfb/
+attention/vmfb/
+conv/vmfb/
 results/
-
diff --git a/attention/vmfb/attention_128x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_128x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index 7c3490f..0000000
Binary files a/attention/vmfb/attention_128x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_128x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 8ece34c..0000000
Binary files a/attention/vmfb/attention_128x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_128x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 89402c0..0000000
Binary files a/attention/vmfb/attention_128x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_128x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index fea5b70..0000000
Binary files a/attention/vmfb/attention_128x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_128x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index ec50bd8..0000000
Binary files a/attention/vmfb/attention_128x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_128x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index c37aa19..0000000
Binary files a/attention/vmfb/attention_128x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_128x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 7d0136a..0000000
Binary files a/attention/vmfb/attention_128x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_128x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index 05bc66f..0000000
Binary files a/attention/vmfb/attention_128x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_128x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index d667542..0000000
Binary files a/attention/vmfb/attention_128x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_128x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_128x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index 70e2681..0000000
Binary files a/attention/vmfb/attention_128x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_12x384x64x64x384xf16.vmfb b/attention/vmfb/attention_12x384x64x64x384xf16.vmfb
deleted file mode 100644
index b4c2b2c..0000000
Binary files a/attention/vmfb/attention_12x384x64x64x384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_16x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index bec08a4..0000000
Binary files a/attention/vmfb/attention_16x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_16x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index dc0fe57..0000000
Binary files a/attention/vmfb/attention_16x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_16x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 2e4a4ab..0000000
Binary files a/attention/vmfb/attention_16x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_16x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index b089f0c..0000000
Binary files a/attention/vmfb/attention_16x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_16x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index 09d0412..0000000
Binary files a/attention/vmfb/attention_16x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_16x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index 8b0bacb..0000000
Binary files a/attention/vmfb/attention_16x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_16x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 6a34acf..0000000
Binary files a/attention/vmfb/attention_16x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_16x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index afdbfa7..0000000
Binary files a/attention/vmfb/attention_16x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_16x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index cf5bd14..0000000
Binary files a/attention/vmfb/attention_16x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_16x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_16x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index 38d303b..0000000
Binary files a/attention/vmfb/attention_16x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_192x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index 4ae9d19..0000000
Binary files a/attention/vmfb/attention_192x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_192x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index fc46dcf..0000000
Binary files a/attention/vmfb/attention_192x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_192x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 715b290..0000000
Binary files a/attention/vmfb/attention_192x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_192x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index 5b01a5b..0000000
Binary files a/attention/vmfb/attention_192x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_192x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index bce1552..0000000
Binary files a/attention/vmfb/attention_192x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_192x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index cfb23f1..0000000
Binary files a/attention/vmfb/attention_192x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_192x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 6e98b97..0000000
Binary files a/attention/vmfb/attention_192x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_192x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index a948ed4..0000000
Binary files a/attention/vmfb/attention_192x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_192x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index 2d5b932..0000000
Binary files a/attention/vmfb/attention_192x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_192x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_192x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index ce154a8..0000000
Binary files a/attention/vmfb/attention_192x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_1x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index 3cf1e55..0000000
Binary files a/attention/vmfb/attention_1x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_1x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 3f39388..0000000
Binary files a/attention/vmfb/attention_1x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_1x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index de13b1f..0000000
Binary files a/attention/vmfb/attention_1x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_1x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index 938d2cd..0000000
Binary files a/attention/vmfb/attention_1x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_1x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index c09f218..0000000
Binary files a/attention/vmfb/attention_1x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_1x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index a2e36e1..0000000
Binary files a/attention/vmfb/attention_1x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_1x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 6a16230..0000000
Binary files a/attention/vmfb/attention_1x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_1x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index 6057b7f..0000000
Binary files a/attention/vmfb/attention_1x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x4096x64x64x64xf16.vmfb b/attention/vmfb/attention_1x4096x64x64x64xf16.vmfb
deleted file mode 100644
index a9246fe..0000000
Binary files a/attention/vmfb/attention_1x4096x64x64x64xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_1x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index 080fd73..0000000
Binary files a/attention/vmfb/attention_1x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_1x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_1x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index f46aa51..0000000
Binary files a/attention/vmfb/attention_1x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_20x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_20x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index 424702f..0000000
Binary files a/attention/vmfb/attention_20x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_20x4096x64x64x64xf16.vmfb b/attention/vmfb/attention_20x4096x64x64x64xf16.vmfb
deleted file mode 100644
index 5f3725e..0000000
Binary files a/attention/vmfb/attention_20x4096x64x64x64xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_2x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index eb0be8b..0000000
Binary files a/attention/vmfb/attention_2x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_2x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 3e66ff3..0000000
Binary files a/attention/vmfb/attention_2x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x1024x64x64x64xf16.vmfb b/attention/vmfb/attention_2x1024x64x64x64xf16.vmfb
deleted file mode 100644
index 1f98e57..0000000
Binary files a/attention/vmfb/attention_2x1024x64x64x64xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_2x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 1465c10..0000000
Binary files a/attention/vmfb/attention_2x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_2x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index 2e8b0fc..0000000
Binary files a/attention/vmfb/attention_2x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_2x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index 6ad2d49..0000000
Binary files a/attention/vmfb/attention_2x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_2x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index 8a5feae..0000000
Binary files a/attention/vmfb/attention_2x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_2x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 80e7b4d..0000000
Binary files a/attention/vmfb/attention_2x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_2x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index 28b0733..0000000
Binary files a/attention/vmfb/attention_2x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_2x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index eb49642..0000000
Binary files a/attention/vmfb/attention_2x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_2x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_2x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index 10bd77f..0000000
Binary files a/attention/vmfb/attention_2x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_32x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index a5a2226..0000000
Binary files a/attention/vmfb/attention_32x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_32x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 47febe6..0000000
Binary files a/attention/vmfb/attention_32x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_32x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 7197ee8..0000000
Binary files a/attention/vmfb/attention_32x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_32x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index 9c66eab..0000000
Binary files a/attention/vmfb/attention_32x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_32x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index a8a11f0..0000000
Binary files a/attention/vmfb/attention_32x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_32x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index 23cad72..0000000
Binary files a/attention/vmfb/attention_32x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_32x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 7ed4feb..0000000
Binary files a/attention/vmfb/attention_32x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_32x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index f67f0a9..0000000
Binary files a/attention/vmfb/attention_32x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_32x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index 48d4c63..0000000
Binary files a/attention/vmfb/attention_32x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_32x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_32x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index b4a9ba1..0000000
Binary files a/attention/vmfb/attention_32x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_40x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_40x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 405bdaf..0000000
Binary files a/attention/vmfb/attention_40x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_40x1024x64x64x64xf16.vmfb b/attention/vmfb/attention_40x1024x64x64x64xf16.vmfb
deleted file mode 100644
index aecdf23..0000000
Binary files a/attention/vmfb/attention_40x1024x64x64x64xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_48x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index 84bc961..0000000
Binary files a/attention/vmfb/attention_48x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_48x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index dcb8053..0000000
Binary files a/attention/vmfb/attention_48x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_48x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 6030730..0000000
Binary files a/attention/vmfb/attention_48x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_48x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index 5121c8a..0000000
Binary files a/attention/vmfb/attention_48x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_48x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index 5b4a736..0000000
Binary files a/attention/vmfb/attention_48x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_48x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index 31ac981..0000000
Binary files a/attention/vmfb/attention_48x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_48x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 66ae333..0000000
Binary files a/attention/vmfb/attention_48x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_48x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index 3222407..0000000
Binary files a/attention/vmfb/attention_48x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_48x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index c8ac465..0000000
Binary files a/attention/vmfb/attention_48x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_48x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_48x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index 12e17aa..0000000
Binary files a/attention/vmfb/attention_48x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_4x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index 61759ec..0000000
Binary files a/attention/vmfb/attention_4x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_4x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index e97b7ae..0000000
Binary files a/attention/vmfb/attention_4x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_4x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 5a36bf0..0000000
Binary files a/attention/vmfb/attention_4x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_4x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index ffbbcb3..0000000
Binary files a/attention/vmfb/attention_4x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_4x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index a4ea6c9..0000000
Binary files a/attention/vmfb/attention_4x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_4x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index d76fb8f..0000000
Binary files a/attention/vmfb/attention_4x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_4x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index f862895..0000000
Binary files a/attention/vmfb/attention_4x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_4x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index 767d3eb..0000000
Binary files a/attention/vmfb/attention_4x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x4096x64x64x64xf16.vmfb b/attention/vmfb/attention_4x4096x64x64x64xf16.vmfb
deleted file mode 100644
index a4e8b21..0000000
Binary files a/attention/vmfb/attention_4x4096x64x64x64xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_4x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index a087bf3..0000000
Binary files a/attention/vmfb/attention_4x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_4x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_4x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index 059df1a..0000000
Binary files a/attention/vmfb/attention_4x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_64x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index a0295a2..0000000
Binary files a/attention/vmfb/attention_64x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_64x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 8e68b50..0000000
Binary files a/attention/vmfb/attention_64x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_64x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 67d8911..0000000
Binary files a/attention/vmfb/attention_64x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_64x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index d548b8a..0000000
Binary files a/attention/vmfb/attention_64x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_64x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index 797fcff..0000000
Binary files a/attention/vmfb/attention_64x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_64x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index 41ef8f2..0000000
Binary files a/attention/vmfb/attention_64x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_64x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 62b63e2..0000000
Binary files a/attention/vmfb/attention_64x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_64x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index eda7758..0000000
Binary files a/attention/vmfb/attention_64x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_64x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index 68aae1c..0000000
Binary files a/attention/vmfb/attention_64x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_64x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_64x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index bdc231e..0000000
Binary files a/attention/vmfb/attention_64x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_768x4096x64x64x64xf16.vmfb b/attention/vmfb/attention_768x4096x64x64x64xf16.vmfb
deleted file mode 100644
index cd9d9ad..0000000
Binary files a/attention/vmfb/attention_768x4096x64x64x64xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_8x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index 4c8fb53..0000000
Binary files a/attention/vmfb/attention_8x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_8x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 44033ea..0000000
Binary files a/attention/vmfb/attention_8x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x1024x64x64x64xf16.vmfb b/attention/vmfb/attention_8x1024x64x64x64xf16.vmfb
deleted file mode 100644
index 5b8212f..0000000
Binary files a/attention/vmfb/attention_8x1024x64x64x64xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_8x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index f0ddb38..0000000
Binary files a/attention/vmfb/attention_8x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_8x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index 900f552..0000000
Binary files a/attention/vmfb/attention_8x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_8x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index 4813ea9..0000000
Binary files a/attention/vmfb/attention_8x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_8x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index 0b40915..0000000
Binary files a/attention/vmfb/attention_8x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_8x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 2c9b6af..0000000
Binary files a/attention/vmfb/attention_8x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_8x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index f208a68..0000000
Binary files a/attention/vmfb/attention_8x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_8x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index e7cb257..0000000
Binary files a/attention/vmfb/attention_8x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_8x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_8x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index 3e38735..0000000
Binary files a/attention/vmfb/attention_8x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x1024x128x128x1024xf16.vmfb b/attention/vmfb/attention_96x1024x128x128x1024xf16.vmfb
deleted file mode 100644
index e6f5a77..0000000
Binary files a/attention/vmfb/attention_96x1024x128x128x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x1024x64x64x1024xf16.vmfb b/attention/vmfb/attention_96x1024x64x64x1024xf16.vmfb
deleted file mode 100644
index 3c98db9..0000000
Binary files a/attention/vmfb/attention_96x1024x64x64x1024xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x16384x128x128x16384xf16.vmfb b/attention/vmfb/attention_96x16384x128x128x16384xf16.vmfb
deleted file mode 100644
index 80347c4..0000000
Binary files a/attention/vmfb/attention_96x16384x128x128x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x16384x64x64x16384xf16.vmfb b/attention/vmfb/attention_96x16384x64x64x16384xf16.vmfb
deleted file mode 100644
index f421314..0000000
Binary files a/attention/vmfb/attention_96x16384x64x64x16384xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x2048x128x128x2048xf16.vmfb b/attention/vmfb/attention_96x2048x128x128x2048xf16.vmfb
deleted file mode 100644
index f5c959b..0000000
Binary files a/attention/vmfb/attention_96x2048x128x128x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x2048x64x64x2048xf16.vmfb b/attention/vmfb/attention_96x2048x64x64x2048xf16.vmfb
deleted file mode 100644
index 4c482f6..0000000
Binary files a/attention/vmfb/attention_96x2048x64x64x2048xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x4096x128x128x4096xf16.vmfb b/attention/vmfb/attention_96x4096x128x128x4096xf16.vmfb
deleted file mode 100644
index 3312178..0000000
Binary files a/attention/vmfb/attention_96x4096x128x128x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x4096x64x64x4096xf16.vmfb b/attention/vmfb/attention_96x4096x64x64x4096xf16.vmfb
deleted file mode 100644
index ee83a16..0000000
Binary files a/attention/vmfb/attention_96x4096x64x64x4096xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x8192x128x128x8192xf16.vmfb b/attention/vmfb/attention_96x8192x128x128x8192xf16.vmfb
deleted file mode 100644
index cd119be..0000000
Binary files a/attention/vmfb/attention_96x8192x128x128x8192xf16.vmfb and /dev/null differ
diff --git a/attention/vmfb/attention_96x8192x64x64x8192xf16.vmfb b/attention/vmfb/attention_96x8192x64x64x8192xf16.vmfb
deleted file mode 100644
index 60fc0ca..0000000
Binary files a/attention/vmfb/attention_96x8192x64x64x8192xf16.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 90ba607..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index c5c4834..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 6a848a0..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 5f31e34..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 31fbdc3..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index d713448..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index cb314fc..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index a873616..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 485ce60..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index a29d15f..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index c13030d..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 45bdada..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 851f16f..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 34c5996..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index ec04687..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 69355da..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index f804da6..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index bf5f2d4..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 24b2d30..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 18fafce..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index a035c49..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 85886ea..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 07c6054..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 8034479..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 7dbface..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 0c86b7c..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 3b27184..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 70d2b31..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 4a17a4d..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index a180673..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 6b646a4..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 6138c98..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 30fe861..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index b14dcdd..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index f7a494a..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 400c9b7..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 58c0ce5..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index a1b88d2..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index b123ad7..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 3a7439e..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 43ffeb9..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index d8efef1..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 9aa0b09..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 02b26af..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 735c1c7..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 83c2dfe..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 887dd4a..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index d6826f8..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 92ed8ab..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 9dc35c7..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 1def039..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index e22986b..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 19b2d16..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 6be3307..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index ba6e7d1..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index c12f222..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 34e251f..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 503eb32..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 3837457..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 8eb8287..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 5fa495b..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 4d84e19..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 757cb62..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index c43acfa..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index f7999f8..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 4e3f847..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 4ad5015..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x112x112x64x7x7x3_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 0bb8c42..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x14x14x1024x1x1x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 2f5380e..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 800fff5..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 20a91fa..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 569db13..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index 519a1e3..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x28x28x512x1x1x256_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 34cd2fa..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x56x56x64x3x3x64_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index f7c98dd..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x7x7x2048x1x1x1024_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb
deleted file mode 100644
index 542df15..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb b/conv/vmfb/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb
deleted file mode 100644
index af7836e..0000000
Binary files a/conv/vmfb/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 94902a0..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 4c6b315..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index fc6d0f0..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 9ed6b72..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 8b6c7c5..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 105fa6a..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 13930f0..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 3859b23..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 9cda9bd..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 686f2ce..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 95e8ac8..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 6c6de3a..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index b1f1839..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 5cd5b56..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 10477f9..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index e2688ae..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index a5ad4cd..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index ee9a095..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 9f0d831..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index f7ed27d..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index ab65c78..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 7c667b6..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 08f5257..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 69bad46..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 1bffc8c..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index c8d1650..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 71c6649..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 804c6f3..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 127d334..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index cb91ea6..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 2a16e17..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 02fbba2..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index b3e94c5..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 86f0c25..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb
b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb deleted file mode 100644 index a3d903f..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb deleted file mode 100644 index bb9b90d..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 228ec1c..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb deleted file mode 100644 index 3864a71..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 94a8a71..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 6d88c07..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb deleted file mode 100644 index 8ce8a4a..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb deleted file mode 100644 index e3ceb41..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb deleted file mode 100644 index b203b43..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 30dce78..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 7de2f43..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb 
b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb deleted file mode 100644 index b9c2920..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb deleted file mode 100644 index 2a7db58..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb deleted file mode 100644 index c054fac..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb deleted file mode 100644 index f992692..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb deleted file mode 100644 index a20c26f..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 8153614..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb deleted file mode 100644 index 0790864..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb deleted file mode 100644 index a086b8d..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb deleted file mode 100644 index 2f5c4d8..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 985f486..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 434e5bc..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb 
b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 65c472c..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb deleted file mode 100644 index ccc863f..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb deleted file mode 100644 index fbbcf57..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb deleted file mode 100644 index 6c71955..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 51892d2..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb deleted file mode 100644 index a1ca1f7..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb deleted file mode 100644 index b43d422..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 75c6026..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb deleted file mode 100644 index 7cd222a..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 9747602..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb deleted file mode 100644 index 3f85f6d..0000000 Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x112x112x64x7x7x3_i8xi8xi32_stride2.vmfb and /dev/null differ diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb 
deleted file mode 100644
index 0dca3e7..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x1024x1x1x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 9758f0c..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index b4aac89..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 9198024..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index f1f34d6..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 7e2d852..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x28x28x512x1x1x256_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index a4fd2ce..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x56x56x64x3x3x64_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index f9353cb..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x2048x1x1x1024_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb
deleted file mode 100644
index 6d49435..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride1.vmfb and /dev/null differ
diff --git a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb b/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb
deleted file mode 100644
index 1c85020..0000000
Binary files a/conv/vmfb/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride2.vmfb and /dev/null differ
diff --git a/gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir
index 6518245..2f56e73 100644
--- a/gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<10240x16xbf16> {
+  func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<10240x16xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<10240x16xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x16xbf16>) -> tensor<10240x16xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x16xbf16>)
       outs(%1 : tensor<10240x16xbf16>) -> tensor<10240x16xbf16>
     return %2 : tensor<10240x16xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_16_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_16_8192_f16_tA.mlir
index b205b6b..78c8d49 100644
--- a/gemm/mlir/gemm_10240_16_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_10240_16_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x16xf16>) -> tensor<10240x16xf16> {
+  func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x16xf16>) -> tensor<10240x16xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<10240x16xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x16xf16>) -> tensor<10240x16xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x16xf16>)
       outs(%1 : tensor<10240x16xf16>) -> tensor<10240x16xf16>
     return %2 : tensor<10240x16xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir
index c8f11c7..ad452e2 100644
--- a/gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<10240x1xbf16> {
+  func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<10240x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<10240x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x1xbf16>) -> tensor<10240x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x1xbf16>)
       outs(%1 : tensor<10240x1xbf16>) -> tensor<10240x1xbf16>
     return %2 : tensor<10240x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_1_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_1_8192_f16_tA.mlir
index 6e209c5..71b8145 100644
--- a/gemm/mlir/gemm_10240_1_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_10240_1_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x1xf16>) -> tensor<10240x1xf16> {
+  func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x1xf16>) -> tensor<10240x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<10240x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x1xf16>) -> tensor<10240x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x1xf16>)
       outs(%1 : tensor<10240x1xf16>) -> tensor<10240x1xf16>
     return %2 : tensor<10240x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir
index 81b7648..b0f1298 100644
--- a/gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<10240x2xbf16> {
+  func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<10240x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<10240x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x2xbf16>) -> tensor<10240x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x2xbf16>)
       outs(%1 : tensor<10240x2xbf16>) -> tensor<10240x2xbf16>
     return %2 : tensor<10240x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_2_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_2_8192_f16_tA.mlir
index 356bbee..273354c 100644
--- a/gemm/mlir/gemm_10240_2_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_10240_2_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x2xf16>) -> tensor<10240x2xf16> {
+  func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x2xf16>) -> tensor<10240x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<10240x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x2xf16>) -> tensor<10240x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x2xf16>)
       outs(%1 : tensor<10240x2xf16>) -> tensor<10240x2xf16>
     return %2 : tensor<10240x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir
index cf172cd..f1ec0ed 100644
--- a/gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<10240x32xbf16> {
+  func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<10240x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<10240x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x32xbf16>) -> tensor<10240x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x32xbf16>)
       outs(%1 : tensor<10240x32xbf16>) -> tensor<10240x32xbf16>
     return %2 : tensor<10240x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_32_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_32_8192_f16_tA.mlir
index 2b86e9d..3a3e10a 100644
--- a/gemm/mlir/gemm_10240_32_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_10240_32_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x32xf16>) -> tensor<10240x32xf16> {
+  func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x32xf16>) -> tensor<10240x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<10240x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x32xf16>) -> tensor<10240x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x32xf16>)
       outs(%1 : tensor<10240x32xf16>) -> tensor<10240x32xf16>
     return %2 : tensor<10240x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir
index 5190491..2b73883 100644
--- a/gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<10240x4xbf16> {
+  func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<10240x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<10240x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x4xbf16>) -> tensor<10240x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x4xbf16>)
       outs(%1 : tensor<10240x4xbf16>) -> tensor<10240x4xbf16>
     return %2 : tensor<10240x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_4_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_4_8192_f16_tA.mlir
index e220fe4..2a97ec8 100644
--- a/gemm/mlir/gemm_10240_4_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_10240_4_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x4xf16>) -> tensor<10240x4xf16> {
+  func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x4xf16>) -> tensor<10240x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<10240x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x4xf16>) -> tensor<10240x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x4xf16>)
       outs(%1 : tensor<10240x4xf16>) -> tensor<10240x4xf16>
     return %2 : tensor<10240x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir
index 7d7d21f..a5c4f70 100644
--- a/gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<10240x8xbf16> {
+  func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<10240x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<10240x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x8xbf16>) -> tensor<10240x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x8xbf16>)
       outs(%1 : tensor<10240x8xbf16>) -> tensor<10240x8xbf16>
     return %2 : tensor<10240x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_10240_8_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_8_8192_f16_tA.mlir
index 774eb17..96ca8f3 100644
--- a/gemm/mlir/gemm_10240_8_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_10240_8_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x8xf16>) -> tensor<10240x8xf16> {
+  func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x8xf16>) -> tensor<10240x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<10240x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x8xf16>) -> tensor<10240x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x8xf16>)
       outs(%1 : tensor<10240x8xf16>) -> tensor<10240x8xf16>
     return %2 : tensor<10240x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir
index cce0498..3baa555 100644
--- a/gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<1280x16xbf16> {
+  func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<1280x16xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<1280x16xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x16xbf16>) -> tensor<1280x16xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x16xbf16>)
       outs(%1 : tensor<1280x16xbf16>) -> tensor<1280x16xbf16>
     return %2 : tensor<1280x16xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_16_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_16_8192_f16_tA.mlir
index a9bcd82..3fe4759 100644
--- a/gemm/mlir/gemm_1280_16_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_1280_16_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x16xf16>) -> tensor<1280x16xf16> {
+  func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x16xf16>) -> tensor<1280x16xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<1280x16xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x16xf16>) -> tensor<1280x16xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x16xf16>)
       outs(%1 : tensor<1280x16xf16>) -> tensor<1280x16xf16>
     return %2 : tensor<1280x16xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir
index 13eb35d..3d2ccc5 100644
--- a/gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<1280x1xbf16> {
+  func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<1280x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<1280x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x1xbf16>) -> tensor<1280x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x1xbf16>)
       outs(%1 : tensor<1280x1xbf16>) -> tensor<1280x1xbf16>
     return %2 : tensor<1280x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_1_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_1_8192_f16_tA.mlir
index 3c68737..b723290 100644
--- a/gemm/mlir/gemm_1280_1_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_1280_1_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x1xf16>) -> tensor<1280x1xf16> {
+  func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x1xf16>) -> tensor<1280x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<1280x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1xf16>) -> tensor<1280x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x1xf16>)
       outs(%1 : tensor<1280x1xf16>) -> tensor<1280x1xf16>
     return %2 : tensor<1280x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir
index 081118a..3f23515 100644
--- a/gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<1280x2xbf16> {
+  func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<1280x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<1280x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x2xbf16>) -> tensor<1280x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x2xbf16>)
       outs(%1 : tensor<1280x2xbf16>) -> tensor<1280x2xbf16>
     return %2 : tensor<1280x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_2_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_2_8192_f16_tA.mlir
index f111865..32fdd34 100644
--- a/gemm/mlir/gemm_1280_2_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_1280_2_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x2xf16>) -> tensor<1280x2xf16> {
+  func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x2xf16>) -> tensor<1280x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<1280x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x2xf16>) -> tensor<1280x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x2xf16>)
       outs(%1 : tensor<1280x2xf16>) -> tensor<1280x2xf16>
     return %2 : tensor<1280x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir
index e0cd5f1..e9bf063 100644
--- a/gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<1280x32xbf16> {
+  func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<1280x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<1280x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x32xbf16>) -> tensor<1280x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x32xbf16>)
       outs(%1 : tensor<1280x32xbf16>) -> tensor<1280x32xbf16>
     return %2 : tensor<1280x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_32_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_32_8192_f16_tA.mlir
index 014bc6e..faf8f1a 100644
--- a/gemm/mlir/gemm_1280_32_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_1280_32_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x32xf16>) -> tensor<1280x32xf16> {
+  func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x32xf16>) -> tensor<1280x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<1280x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x32xf16>) -> tensor<1280x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x32xf16>)
       outs(%1 : tensor<1280x32xf16>) -> tensor<1280x32xf16>
     return %2 : tensor<1280x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir
index 6f92786..d844019 100644
--- a/gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<1280x4xbf16> {
+  func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<1280x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<1280x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x4xbf16>) -> tensor<1280x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x4xbf16>)
       outs(%1 : tensor<1280x4xbf16>) -> tensor<1280x4xbf16>
     return %2 : tensor<1280x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_4_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_4_8192_f16_tA.mlir
index 9f50653..f7ead50 100644
--- a/gemm/mlir/gemm_1280_4_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_1280_4_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x4xf16>) -> tensor<1280x4xf16> {
+  func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x4xf16>) -> tensor<1280x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<1280x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x4xf16>) -> tensor<1280x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x4xf16>)
       outs(%1 : tensor<1280x4xf16>) -> tensor<1280x4xf16>
     return %2 : tensor<1280x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir
index 8654770..8f2da95 100644
--- a/gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<1280x8xbf16> {
+  func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<1280x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<1280x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x8xbf16>) -> tensor<1280x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x8xbf16>)
       outs(%1 : tensor<1280x8xbf16>) -> tensor<1280x8xbf16>
     return %2 : tensor<1280x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_1280_8_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_8_8192_f16_tA.mlir
index 4a87cab..4c96f74 100644
--- a/gemm/mlir/gemm_1280_8_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_1280_8_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x8xf16>) -> tensor<1280x8xf16> {
+  func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x8xf16>) -> tensor<1280x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<1280x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x8xf16>) -> tensor<1280x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x8xf16>)
       outs(%1 : tensor<1280x8xf16>) -> tensor<1280x8xf16>
     return %2 : tensor<1280x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_128_1280_2048_bf16.mlir b/gemm/mlir/gemm_128_1280_2048_bf16.mlir
index f113fc4..c758c9d 100644
--- a/gemm/mlir/gemm_128_1280_2048_bf16.mlir
+++ b/gemm/mlir/gemm_128_1280_2048_bf16.mlir
@@ -1,9 +1,9 @@
 module {
-  func.func @main_0(%arg0: tensor<128x2048xbf16>, %arg1: tensor<2048x1280xbf16>) -> tensor<128x1280xbf16> {
+  func.func @main(%arg0: tensor<128x2048xbf16>, %arg1: tensor<2048x1280xbf16>) -> tensor<128x1280xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<128x1280xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16>
     %2 = linalg.matmul ins(%arg0, %arg1 : tensor<128x2048xbf16>, tensor<2048x1280xbf16>) outs(%1 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16>
     return %2 : tensor<128x1280xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir b/gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir
index 5880178..0cb012c 100644
--- a/gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir
+++ b/gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2048x128xbf16>, %arg1: tensor<2048x1280xbf16>) -> tensor<128x1280xbf16> {
+  func.func @main(%arg0: tensor<2048x128xbf16>, %arg1: tensor<2048x1280xbf16>) -> tensor<128x1280xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<128x1280xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x128xbf16>, tensor<2048x1280xbf16>)
       outs(%1 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16>
     return %2 : tensor<128x1280xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir b/gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir
index 4163d4c..32f5e6f 100644
--- a/gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir
+++ b/gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<128x2048xbf16>, %arg1: tensor<1280x2048xbf16>) -> tensor<128x1280xbf16> {
+  func.func @main(%arg0: tensor<128x2048xbf16>, %arg1: tensor<1280x2048xbf16>) -> tensor<128x1280xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<128x1280xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16>
     %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<128x2048xbf16>, tensor<1280x2048xbf16>)
       outs(%1 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16>
     return %2 : tensor<128x1280xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_128_1280_2048_f16.mlir b/gemm/mlir/gemm_128_1280_2048_f16.mlir
index 39e0785..84ea04a 100644
--- a/gemm/mlir/gemm_128_1280_2048_f16.mlir
+++ b/gemm/mlir/gemm_128_1280_2048_f16.mlir
@@ -1,9 +1,9 @@
 module {
-  func.func @main_0(%arg0: tensor<128x2048xf16>, %arg1: tensor<2048x1280xf16>) -> tensor<128x1280xf16> {
+  func.func @main(%arg0: tensor<128x2048xf16>, %arg1: tensor<2048x1280xf16>) -> tensor<128x1280xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<128x1280xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<128x1280xf16>) -> tensor<128x1280xf16>
     %2 = linalg.matmul ins(%arg0, %arg1 : tensor<128x2048xf16>, tensor<2048x1280xf16>) outs(%1 : tensor<128x1280xf16>) -> tensor<128x1280xf16>
     return %2 : tensor<128x1280xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_128_1280_2048_f16_tA.mlir b/gemm/mlir/gemm_128_1280_2048_f16_tA.mlir
index f6328f9..45cda80 100644
--- a/gemm/mlir/gemm_128_1280_2048_f16_tA.mlir
+++ b/gemm/mlir/gemm_128_1280_2048_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2048x128xf16>, %arg1: tensor<2048x1280xf16>) -> tensor<128x1280xf16> {
+  func.func @main(%arg0: tensor<2048x128xf16>, %arg1: tensor<2048x1280xf16>) -> tensor<128x1280xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<128x1280xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<128x1280xf16>) -> tensor<128x1280xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x128xf16>, tensor<2048x1280xf16>)
       outs(%1 : tensor<128x1280xf16>) -> tensor<128x1280xf16>
     return %2 : tensor<128x1280xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_128_1280_2048_f16_tB.mlir b/gemm/mlir/gemm_128_1280_2048_f16_tB.mlir
index de333b7..408620b 100644
--- a/gemm/mlir/gemm_128_1280_2048_f16_tB.mlir
+++ b/gemm/mlir/gemm_128_1280_2048_f16_tB.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<128x2048xf16>, %arg1: tensor<1280x2048xf16>) -> tensor<128x1280xf16> {
+  func.func @main(%arg0: tensor<128x2048xf16>, %arg1: tensor<1280x2048xf16>) -> tensor<128x1280xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<128x1280xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<128x1280xf16>) -> tensor<128x1280xf16>
     %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<128x2048xf16>, tensor<1280x2048xf16>)
       outs(%1 : tensor<128x1280xf16>) -> tensor<128x1280xf16>
     return %2 : tensor<128x1280xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir
index 5587c3e..bf06e53 100644
--- a/gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<13824x16xbf16> {
+  func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<13824x16xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<13824x16xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x16xbf16>) -> tensor<13824x16xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x16xbf16>)
       outs(%1 : tensor<13824x16xbf16>) -> tensor<13824x16xbf16>
     return %2 : tensor<13824x16xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_16_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_16_5120_f16_tA.mlir
index 6904c26..6820445 100644
--- a/gemm/mlir/gemm_13824_16_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_13824_16_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x16xf16>) -> tensor<13824x16xf16> {
+  func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x16xf16>) -> tensor<13824x16xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<13824x16xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x16xf16>) -> tensor<13824x16xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x16xf16>)
       outs(%1 : tensor<13824x16xf16>) -> tensor<13824x16xf16>
     return %2 : tensor<13824x16xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir
index 67242a6..bddc513 100644
--- a/gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<13824x1xbf16> {
+  func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<13824x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<13824x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x1xbf16>) -> tensor<13824x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x1xbf16>)
       outs(%1 : tensor<13824x1xbf16>) -> tensor<13824x1xbf16>
     return %2 : tensor<13824x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_1_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_1_5120_f16_tA.mlir
index 6c8d6f6..de51690 100644
--- a/gemm/mlir/gemm_13824_1_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_13824_1_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x1xf16>) -> tensor<13824x1xf16> {
+  func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x1xf16>) -> tensor<13824x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<13824x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x1xf16>) -> tensor<13824x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x1xf16>)
       outs(%1 : tensor<13824x1xf16>) -> tensor<13824x1xf16>
     return %2 : tensor<13824x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir
index 4887c3c..b73977b 100644
--- a/gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<13824x2xbf16> {
+  func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<13824x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<13824x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x2xbf16>) -> tensor<13824x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x2xbf16>)
       outs(%1 : tensor<13824x2xbf16>) -> tensor<13824x2xbf16>
     return %2 : tensor<13824x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_2_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_2_5120_f16_tA.mlir
index ca0a0c3..b763847 100644
--- a/gemm/mlir/gemm_13824_2_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_13824_2_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x2xf16>) -> tensor<13824x2xf16> {
+  func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x2xf16>) -> tensor<13824x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<13824x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x2xf16>) -> tensor<13824x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x2xf16>)
       outs(%1 : tensor<13824x2xf16>) -> tensor<13824x2xf16>
     return %2 : tensor<13824x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir
index 0df7805..3be8ecf 100644
--- a/gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<13824x32xbf16> {
+  func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<13824x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<13824x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x32xbf16>) -> tensor<13824x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x32xbf16>)
       outs(%1 : tensor<13824x32xbf16>) -> tensor<13824x32xbf16>
     return %2 : tensor<13824x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_32_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_32_5120_f16_tA.mlir
index 4b59dd3..2069eef 100644
--- a/gemm/mlir/gemm_13824_32_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_13824_32_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x32xf16>) -> tensor<13824x32xf16> {
+  func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x32xf16>) -> tensor<13824x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<13824x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x32xf16>) -> tensor<13824x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x32xf16>)
       outs(%1 : tensor<13824x32xf16>) -> tensor<13824x32xf16>
     return %2 : tensor<13824x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir
index 552d8df..3ac974f 100644
--- a/gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<13824x4xbf16> {
+  func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<13824x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<13824x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x4xbf16>) -> tensor<13824x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x4xbf16>)
       outs(%1 : tensor<13824x4xbf16>) -> tensor<13824x4xbf16>
     return %2 : tensor<13824x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_4_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_4_5120_f16_tA.mlir
index 7b8d299..2d2dbaf 100644
--- a/gemm/mlir/gemm_13824_4_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_13824_4_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x4xf16>) -> tensor<13824x4xf16> {
+  func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x4xf16>) -> tensor<13824x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<13824x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x4xf16>) -> tensor<13824x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x4xf16>)
       outs(%1 : tensor<13824x4xf16>) -> tensor<13824x4xf16>
     return %2 : tensor<13824x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir
index 4c0d745..30c7d55 100644
--- a/gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<13824x8xbf16> {
+  func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<13824x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<13824x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x8xbf16>) -> tensor<13824x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x8xbf16>)
       outs(%1 : tensor<13824x8xbf16>) -> tensor<13824x8xbf16>
     return %2 : tensor<13824x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_13824_8_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_8_5120_f16_tA.mlir
index 2109f18..96d5e3c 100644
--- a/gemm/mlir/gemm_13824_8_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_13824_8_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x8xf16>) -> tensor<13824x8xf16> {
+  func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x8xf16>) -> tensor<13824x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<13824x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x8xf16>) -> tensor<13824x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x8xf16>)
       outs(%1 : tensor<13824x8xf16>) -> tensor<13824x8xf16>
     return %2 : tensor<13824x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir
index 9fdd075..ebb53bd 100644
--- a/gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<14336x16xbf16> {
+  func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<14336x16xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<14336x16xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x16xbf16>) -> tensor<14336x16xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x16xbf16>)
       outs(%1 : tensor<14336x16xbf16>) -> tensor<14336x16xbf16>
     return %2 : tensor<14336x16xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_16_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_16_8192_f16_tA.mlir
index a5627cb..1c62bae 100644
--- a/gemm/mlir/gemm_14336_16_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_14336_16_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x16xf16>) -> tensor<14336x16xf16> {
+  func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x16xf16>) -> tensor<14336x16xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<14336x16xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x16xf16>) -> tensor<14336x16xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x16xf16>)
       outs(%1 : tensor<14336x16xf16>) -> tensor<14336x16xf16>
     return %2 : tensor<14336x16xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir
index c4eddd3..12e1750 100644
--- a/gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<14336x1xbf16> {
+  func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<14336x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<14336x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x1xbf16>) -> tensor<14336x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x1xbf16>)
       outs(%1 : tensor<14336x1xbf16>) -> tensor<14336x1xbf16>
     return %2 : tensor<14336x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_1_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_1_8192_f16_tA.mlir
index f6623f1..b3cee07 100644
--- a/gemm/mlir/gemm_14336_1_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_14336_1_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x1xf16>) -> tensor<14336x1xf16> {
+  func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x1xf16>) -> tensor<14336x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<14336x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x1xf16>) -> tensor<14336x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x1xf16>)
       outs(%1 : tensor<14336x1xf16>) -> tensor<14336x1xf16>
     return %2 : tensor<14336x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir
index 31ff061..ce3f701 100644
--- a/gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<14336x2xbf16> {
+  func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<14336x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<14336x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x2xbf16>) -> tensor<14336x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x2xbf16>)
       outs(%1 : tensor<14336x2xbf16>) -> tensor<14336x2xbf16>
     return %2 : tensor<14336x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_2_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_2_8192_f16_tA.mlir
index 19b60e7..100d62f 100644
--- a/gemm/mlir/gemm_14336_2_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_14336_2_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x2xf16>) -> tensor<14336x2xf16> {
+  func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x2xf16>) -> tensor<14336x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<14336x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x2xf16>) -> tensor<14336x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x2xf16>)
       outs(%1 : tensor<14336x2xf16>) -> tensor<14336x2xf16>
     return %2 : tensor<14336x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir
index 5233c40..39a012e 100644
--- a/gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<14336x32xbf16> {
+  func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<14336x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<14336x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x32xbf16>) -> tensor<14336x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x32xbf16>)
       outs(%1 : tensor<14336x32xbf16>) -> tensor<14336x32xbf16>
     return %2 : tensor<14336x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_32_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_32_8192_f16_tA.mlir
index 79dc048..6457a07 100644
--- a/gemm/mlir/gemm_14336_32_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_14336_32_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x32xf16>) -> tensor<14336x32xf16> {
+  func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x32xf16>) -> tensor<14336x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<14336x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x32xf16>) -> tensor<14336x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x32xf16>)
       outs(%1 : tensor<14336x32xf16>) -> tensor<14336x32xf16>
     return %2 : tensor<14336x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir
index da10c2b..99bcffb 100644
--- a/gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<14336x4xbf16> {
+  func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<14336x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<14336x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x4xbf16>) -> tensor<14336x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x4xbf16>)
       outs(%1 : tensor<14336x4xbf16>) -> tensor<14336x4xbf16>
     return %2 : tensor<14336x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_4_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_4_8192_f16_tA.mlir
index f1b0d2e..6c93d68 100644
--- a/gemm/mlir/gemm_14336_4_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_14336_4_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x4xf16>) -> tensor<14336x4xf16> {
+  func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x4xf16>) -> tensor<14336x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<14336x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x4xf16>) -> tensor<14336x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x4xf16>)
       outs(%1 : tensor<14336x4xf16>) -> tensor<14336x4xf16>
     return %2 : tensor<14336x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir
index b2ee9d2..22146cb 100644
--- a/gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<14336x8xbf16> {
+  func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<14336x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<14336x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x8xbf16>) -> tensor<14336x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x8xbf16>)
       outs(%1 : tensor<14336x8xbf16>) -> tensor<14336x8xbf16>
     return %2 : tensor<14336x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_14336_8_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_8_8192_f16_tA.mlir
index 8bcf588..452edf9 100644
--- a/gemm/mlir/gemm_14336_8_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_14336_8_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x8xf16>) -> tensor<14336x8xf16> {
+  func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x8xf16>) -> tensor<14336x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<14336x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x8xf16>) -> tensor<14336x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x8xf16>)
       outs(%1 : tensor<14336x8xf16>) -> tensor<14336x8xf16>
     return %2 : tensor<14336x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir
index 60c13e0..da57d0c 100644
--- a/gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<15360x16xbf16> {
+  func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<15360x16xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<15360x16xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x16xbf16>) -> tensor<15360x16xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x16xbf16>)
       outs(%1 : tensor<15360x16xbf16>) -> tensor<15360x16xbf16>
     return %2 : tensor<15360x16xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_15360_16_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_16_5120_f16_tA.mlir
index 8748c90..b15d265 100644
--- a/gemm/mlir/gemm_15360_16_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_15360_16_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x16xf16>) -> tensor<15360x16xf16> {
+  func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x16xf16>) -> tensor<15360x16xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<15360x16xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x16xf16>) -> tensor<15360x16xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x16xf16>)
       outs(%1 : tensor<15360x16xf16>) -> tensor<15360x16xf16>
     return %2 : tensor<15360x16xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir
index 0b1567d..b0d9c92 100644
--- a/gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<15360x1xbf16> {
+  func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<15360x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<15360x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x1xbf16>) -> tensor<15360x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x1xbf16>)
       outs(%1 :
tensor<15360x1xbf16>) -> tensor<15360x1xbf16> return %2 : tensor<15360x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_15360_1_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_1_5120_f16_tA.mlir index e985d8a..d458ee9 100644 --- a/gemm/mlir/gemm_15360_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_15360_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x1xf16>) -> tensor<15360x1xf16> { + func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x1xf16>) -> tensor<15360x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<15360x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x1xf16>) -> tensor<15360x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x1xf16>) outs(%1 : tensor<15360x1xf16>) -> tensor<15360x1xf16> return %2 : tensor<15360x1xf16> } -} +} diff --git a/gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir index 5f3266e..032eae5 100644 --- a/gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<15360x2xbf16> { + func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<15360x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<15360x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x2xbf16>) -> tensor<15360x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<15360x2xbf16>) -> tensor<15360x2xbf16> return %2 : tensor<15360x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_15360_2_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_2_5120_f16_tA.mlir index d4dbe8b..18a0d50 100644 --- a/gemm/mlir/gemm_15360_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_15360_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x2xf16>) -> tensor<15360x2xf16> { + func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x2xf16>) -> tensor<15360x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<15360x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x2xf16>) -> tensor<15360x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x2xf16>) outs(%1 : tensor<15360x2xf16>) -> tensor<15360x2xf16> return %2 : tensor<15360x2xf16> } -} +} diff --git a/gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir index a3bd858..8f7fa25 100644 --- a/gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<15360x32xbf16> { + func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<15360x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<15360x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x32xbf16>) -> tensor<15360x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<15360x32xbf16>) -> tensor<15360x32xbf16> return %2 : tensor<15360x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_15360_32_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_32_5120_f16_tA.mlir index 1b7f012..fc86593 100644 --- a/gemm/mlir/gemm_15360_32_5120_f16_tA.mlir +++ 
b/gemm/mlir/gemm_15360_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x32xf16>) -> tensor<15360x32xf16> { + func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x32xf16>) -> tensor<15360x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<15360x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x32xf16>) -> tensor<15360x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x32xf16>) outs(%1 : tensor<15360x32xf16>) -> tensor<15360x32xf16> return %2 : tensor<15360x32xf16> } -} +} diff --git a/gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir index f95cedd..f388bfc 100644 --- a/gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<15360x4xbf16> { + func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<15360x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<15360x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x4xbf16>) -> tensor<15360x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<15360x4xbf16>) -> tensor<15360x4xbf16> return %2 : tensor<15360x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_15360_4_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_4_5120_f16_tA.mlir index 5ce3aa7..c8666aa 100644 --- a/gemm/mlir/gemm_15360_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_15360_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x4xf16>) -> tensor<15360x4xf16> { + func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x4xf16>) -> tensor<15360x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<15360x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x4xf16>) -> tensor<15360x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x4xf16>) outs(%1 : tensor<15360x4xf16>) -> tensor<15360x4xf16> return %2 : tensor<15360x4xf16> } -} +} diff --git a/gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir index bfe420d..813f5a1 100644 --- a/gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<15360x8xbf16> { + func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<15360x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<15360x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x8xbf16>) -> tensor<15360x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<15360x8xbf16>) -> tensor<15360x8xbf16> return %2 : tensor<15360x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_15360_8_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_8_5120_f16_tA.mlir index 7f0dc72..5df7526 100644 --- a/gemm/mlir/gemm_15360_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_15360_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x8xf16>) -> tensor<15360x8xf16> { + func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x8xf16>) -> tensor<15360x8xf16> { %cst 
= arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<15360x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x8xf16>) -> tensor<15360x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x8xf16>) outs(%1 : tensor<15360x8xf16>) -> tensor<15360x8xf16> return %2 : tensor<15360x8xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir index 9a886db..50136f8 100644 --- a/gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<16000x16xbf16> { + func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<16000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x16xbf16>) -> tensor<16000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<16000x16xbf16>) -> tensor<16000x16xbf16> return %2 : tensor<16000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_16_5120_f16_tA.mlir index b013989..e0ebb71 100644 --- a/gemm/mlir/gemm_16000_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<16000x16xf16> { + func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<16000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x16xf16>) -> tensor<16000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<16000x16xf16>) -> tensor<16000x16xf16> return %2 : tensor<16000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir index 10ddd1d..95ae5e6 100644 --- a/gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<16000x16xbf16> { + func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<16000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x16xbf16>) -> tensor<16000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<16000x16xbf16>) -> tensor<16000x16xbf16> return %2 : tensor<16000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_16_8192_f16_tA.mlir index 35b7b27..c1107cc 100644 --- a/gemm/mlir/gemm_16000_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<16000x16xf16> { + func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<16000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x16xf16>) -> tensor<16000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : 
tensor<8192x16000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<16000x16xf16>) -> tensor<16000x16xf16> return %2 : tensor<16000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir index 9c53fc4..d0fc2f2 100644 --- a/gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<16000x1xbf16> { + func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<16000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> return %2 : tensor<16000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_1_5120_f16_tA.mlir index 70c5c26..7182791 100644 --- a/gemm/mlir/gemm_16000_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<16000x1xf16> { + func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<16000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x1xf16>) -> tensor<16000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<16000x1xf16>) -> tensor<16000x1xf16> return %2 : tensor<16000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir index 8930976..8258663 100644 --- a/gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<16000x1xbf16> { + func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<16000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> return %2 : tensor<16000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_1_8192_f16_tA.mlir index e91b93e..8186ad5 100644 --- a/gemm/mlir/gemm_16000_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<16000x1xf16> { + func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<16000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x1xf16>) -> tensor<16000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<16000x1xf16>) -> tensor<16000x1xf16> return %2 : tensor<16000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir index 57d5461..11c07f2 100644 --- 
a/gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<16000x2xbf16> { + func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<16000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> return %2 : tensor<16000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_2_5120_f16_tA.mlir index 39d6ed2..3efeb6a 100644 --- a/gemm/mlir/gemm_16000_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<16000x2xf16> { + func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<16000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x2xf16>) -> tensor<16000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<16000x2xf16>) -> tensor<16000x2xf16> return %2 : tensor<16000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir index ca32231..28e4d63 100644 --- a/gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<16000x2xbf16> { + func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<16000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> return %2 : tensor<16000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_2_8192_f16_tA.mlir index c2a7e2c..8c125de 100644 --- a/gemm/mlir/gemm_16000_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<16000x2xf16> { + func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<16000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x2xf16>) -> tensor<16000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<16000x2xf16>) -> tensor<16000x2xf16> return %2 : tensor<16000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir index c631bc7..a47ce25 100644 --- a/gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<16000x32xbf16> { + func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: 
tensor<5120x32xbf16>) -> tensor<16000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x32xbf16>) -> tensor<16000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<16000x32xbf16>) -> tensor<16000x32xbf16> return %2 : tensor<16000x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_32_5120_f16_tA.mlir index 983b2bf..5ea27d7 100644 --- a/gemm/mlir/gemm_16000_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<16000x32xf16> { + func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<16000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x32xf16>) -> tensor<16000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<16000x32xf16>) -> tensor<16000x32xf16> return %2 : tensor<16000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir index 04085df..72308e0 100644 --- a/gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<16000x32xbf16> { + func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<16000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x32xbf16>) -> tensor<16000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<16000x32xbf16>) -> tensor<16000x32xbf16> return %2 : tensor<16000x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_32_8192_f16_tA.mlir index 274f470..e5f6d3b 100644 --- a/gemm/mlir/gemm_16000_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<16000x32xf16> { + func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<16000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x32xf16>) -> tensor<16000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<16000x32xf16>) -> tensor<16000x32xf16> return %2 : tensor<16000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir index 077e87c..a514a47 100644 --- a/gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<16000x4xbf16> { + func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<16000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> %2 = 
linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> return %2 : tensor<16000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_4_5120_f16_tA.mlir index e30e392..1b73c07 100644 --- a/gemm/mlir/gemm_16000_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<16000x4xf16> { + func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<16000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x4xf16>) -> tensor<16000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x4xf16>) outs(%1 : tensor<16000x4xf16>) -> tensor<16000x4xf16> return %2 : tensor<16000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir index b345a5f..1de70e2 100644 --- a/gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<16000x4xbf16> { + func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<16000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> return %2 : tensor<16000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_4_8192_f16_tA.mlir index 8d4d4e7..a035de1 100644 --- a/gemm/mlir/gemm_16000_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<16000x4xf16> { + func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<16000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x4xf16>) -> tensor<16000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<16000x4xf16>) -> tensor<16000x4xf16> return %2 : tensor<16000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir index 43a46c9..23c98e5 100644 --- a/gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<16000x8xbf16> { + func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<16000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x8xbf16>) -> tensor<16000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<16000x8xbf16>) -> tensor<16000x8xbf16> return %2 : tensor<16000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_8_5120_f16_tA.mlir index 
1eab178..25ea2f2 100644 --- a/gemm/mlir/gemm_16000_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<16000x8xf16> { + func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<16000x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x8xf16>) -> tensor<16000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<16000x8xf16>) -> tensor<16000x8xf16> return %2 : tensor<16000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir index 463e418..8b5ce5a 100644 --- a/gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<16000x8xbf16> { + func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<16000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<16000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x8xbf16>) -> tensor<16000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<16000x8xbf16>) -> tensor<16000x8xbf16> return %2 : tensor<16000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_16000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_8_8192_f16_tA.mlir index 2ecfebf..b53f1c0 100644 --- a/gemm/mlir/gemm_16000_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_16000_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<16000x8xf16> { + func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<16000x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<16000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x8xf16>) -> tensor<16000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<16000x8xf16>) -> tensor<16000x8xf16> return %2 : tensor<16000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir index 0ad1566..0498cb4 100644 --- a/gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<1920x16xbf16> { + func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<1920x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<1920x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x16xbf16>) -> tensor<1920x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<1920x16xbf16>) -> tensor<1920x16xbf16> return %2 : tensor<1920x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_1920_16_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_16_5120_f16_tA.mlir index 253e857..7a26a60 100644 --- a/gemm/mlir/gemm_1920_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_1920_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x16xf16>) -> tensor<1920x16xf16> { + func.func @main(%arg0: 
tensor<5120x1920xf16>, %arg1: tensor<5120x16xf16>) -> tensor<1920x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1920x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x16xf16>) -> tensor<1920x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x16xf16>) outs(%1 : tensor<1920x16xf16>) -> tensor<1920x16xf16> return %2 : tensor<1920x16xf16> } -} +} diff --git a/gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir index c96d9a7..69a8142 100644 --- a/gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<1920x1xbf16> { + func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<1920x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<1920x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x1xbf16>) -> tensor<1920x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<1920x1xbf16>) -> tensor<1920x1xbf16> return %2 : tensor<1920x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_1920_1_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_1_5120_f16_tA.mlir index 0444dd8..7f56072 100644 --- a/gemm/mlir/gemm_1920_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_1920_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x1xf16>) -> tensor<1920x1xf16> { + func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x1xf16>) -> tensor<1920x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1920x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x1xf16>) -> tensor<1920x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x1xf16>) outs(%1 : tensor<1920x1xf16>) -> tensor<1920x1xf16> return %2 : tensor<1920x1xf16> } -} +} diff --git a/gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir index 508a6e5..8241b87 100644 --- a/gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<1920x2xbf16> { + func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<1920x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<1920x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x2xbf16>) -> tensor<1920x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<1920x2xbf16>) -> tensor<1920x2xbf16> return %2 : tensor<1920x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_1920_2_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_2_5120_f16_tA.mlir index cc7ec7c..8410b70 100644 --- a/gemm/mlir/gemm_1920_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_1920_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x2xf16>) -> tensor<1920x2xf16> { + func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x2xf16>) -> tensor<1920x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1920x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x2xf16>) -> tensor<1920x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x2xf16>) outs(%1 
: tensor<1920x2xf16>) -> tensor<1920x2xf16> return %2 : tensor<1920x2xf16> } -} +} diff --git a/gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir index 0a74e00..fb33ba0 100644 --- a/gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<1920x32xbf16> { + func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<1920x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<1920x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x32xbf16>) -> tensor<1920x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<1920x32xbf16>) -> tensor<1920x32xbf16> return %2 : tensor<1920x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_1920_32_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_32_5120_f16_tA.mlir index f12339f..17e9ebc 100644 --- a/gemm/mlir/gemm_1920_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_1920_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x32xf16>) -> tensor<1920x32xf16> { + func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x32xf16>) -> tensor<1920x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1920x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x32xf16>) -> tensor<1920x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x32xf16>) outs(%1 : tensor<1920x32xf16>) -> tensor<1920x32xf16> return %2 : tensor<1920x32xf16> } -} +} diff --git a/gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir index 8d9a5f0..5f1c806 100644 --- a/gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<1920x4xbf16> { + func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<1920x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<1920x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x4xbf16>) -> tensor<1920x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<1920x4xbf16>) -> tensor<1920x4xbf16> return %2 : tensor<1920x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_1920_4_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_4_5120_f16_tA.mlir index abb25bf..cd45416 100644 --- a/gemm/mlir/gemm_1920_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_1920_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x4xf16>) -> tensor<1920x4xf16> { + func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x4xf16>) -> tensor<1920x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1920x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x4xf16>) -> tensor<1920x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x4xf16>) outs(%1 : tensor<1920x4xf16>) -> tensor<1920x4xf16> return %2 : tensor<1920x4xf16> } -} +} diff --git a/gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir index ee1a352..bb5ee3c 100644 --- a/gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ 
module { - func.func @main_0(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<1920x8xbf16> { + func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<1920x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<1920x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x8xbf16>) -> tensor<1920x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<1920x8xbf16>) -> tensor<1920x8xbf16> return %2 : tensor<1920x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_1920_8_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_8_5120_f16_tA.mlir index 2c1faa3..7f94a48 100644 --- a/gemm/mlir/gemm_1920_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_1920_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x8xf16>) -> tensor<1920x8xf16> { + func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x8xf16>) -> tensor<1920x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1920x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x8xf16>) -> tensor<1920x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x8xf16>) outs(%1 : tensor<1920x8xf16>) -> tensor<1920x8xf16> return %2 : tensor<1920x8xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_10240_1280_bf16.mlir b/gemm/mlir/gemm_2048_10240_1280_bf16.mlir index 74b20a4..d65d3a7 100644 --- a/gemm/mlir/gemm_2048_10240_1280_bf16.mlir +++ b/gemm/mlir/gemm_2048_10240_1280_bf16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x10240xbf16>) -> tensor<2048x10240xbf16> { + func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x10240xbf16>) -> tensor<2048x10240xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x10240xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<1280x10240xbf16>) outs(%1 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> return %2 : tensor<2048x10240xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir b/gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir index 5e8be84..84241c7 100644 --- a/gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir +++ b/gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1280x2048xbf16>, %arg1: tensor<1280x10240xbf16>) -> tensor<2048x10240xbf16> { + func.func @main(%arg0: tensor<1280x2048xbf16>, %arg1: tensor<1280x10240xbf16>) -> tensor<2048x10240xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x10240xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xbf16>, tensor<1280x10240xbf16>) outs(%1 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> return %2 : tensor<2048x10240xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir b/gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir index e103cc1..28e61ff 100644 --- a/gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir +++ b/gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<10240x1280xbf16>) -> tensor<2048x10240xbf16> { + func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<10240x1280xbf16>) -> 
tensor<2048x10240xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x10240xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<10240x1280xbf16>) outs(%1 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> return %2 : tensor<2048x10240xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_10240_1280_f16.mlir b/gemm/mlir/gemm_2048_10240_1280_f16.mlir index 76ff7bb..e3bbec5 100644 --- a/gemm/mlir/gemm_2048_10240_1280_f16.mlir +++ b/gemm/mlir/gemm_2048_10240_1280_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x10240xf16>) -> tensor<2048x10240xf16> { + func.func @main(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x10240xf16>) -> tensor<2048x10240xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x10240xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xf16>, tensor<1280x10240xf16>) outs(%1 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> return %2 : tensor<2048x10240xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir b/gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir index cea341b..6a0033c 100644 --- a/gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir +++ b/gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1280x2048xf16>, %arg1: tensor<1280x10240xf16>) -> tensor<2048x10240xf16> { + func.func @main(%arg0: tensor<1280x2048xf16>, %arg1: tensor<1280x10240xf16>) -> tensor<2048x10240xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x10240xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xf16>, tensor<1280x10240xf16>) outs(%1 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> return %2 : tensor<2048x10240xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir b/gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir index 69f813b..065e02f 100644 --- a/gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir +++ b/gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xf16>, %arg1: tensor<10240x1280xf16>) -> tensor<2048x10240xf16> { + func.func @main(%arg0: tensor<2048x1280xf16>, %arg1: tensor<10240x1280xf16>) -> tensor<2048x10240xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x10240xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x1280xf16>, tensor<10240x1280xf16>) outs(%1 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> return %2 : tensor<2048x10240xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_1280_bf16.mlir b/gemm/mlir/gemm_2048_1280_1280_bf16.mlir index c5b2018..91c8ae2 100644 --- a/gemm/mlir/gemm_2048_1280_1280_bf16.mlir +++ b/gemm/mlir/gemm_2048_1280_1280_bf16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { + func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x1280xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : 
tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<1280x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> return %2 : tensor<2048x1280xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir b/gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir index 9c68853..a155776 100644 --- a/gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir +++ b/gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1280x2048xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { + func.func @main(%arg0: tensor<1280x2048xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x1280xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xbf16>, tensor<1280x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> return %2 : tensor<2048x1280xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir b/gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir index 6879238..2087786 100644 --- a/gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir +++ b/gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { + func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x1280xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<1280x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> return %2 : tensor<2048x1280xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_1280_f16.mlir b/gemm/mlir/gemm_2048_1280_1280_f16.mlir index c505e3f..ceb58a2 100644 --- a/gemm/mlir/gemm_2048_1280_1280_f16.mlir +++ b/gemm/mlir/gemm_2048_1280_1280_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { + func.func @main(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> return %2 : tensor<2048x1280xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir b/gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir index df9a150..13b6466 100644 --- a/gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir +++ b/gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1280x2048xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { + func.func @main(%arg0: tensor<1280x2048xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xf16>, tensor<1280x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> 
return %2 : tensor<2048x1280xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir b/gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir index 436c879..92384aa 100644 --- a/gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir +++ b/gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { + func.func @main(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> return %2 : tensor<2048x1280xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_5120_bf16.mlir b/gemm/mlir/gemm_2048_1280_5120_bf16.mlir index 97e7e4a..6739dcc 100644 --- a/gemm/mlir/gemm_2048_1280_5120_bf16.mlir +++ b/gemm/mlir/gemm_2048_1280_5120_bf16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x5120xbf16>, %arg1: tensor<5120x1280xbf16>) -> tensor<2048x1280xbf16> { + func.func @main(%arg0: tensor<2048x5120xbf16>, %arg1: tensor<5120x1280xbf16>) -> tensor<2048x1280xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x1280xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x5120xbf16>, tensor<5120x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> return %2 : tensor<2048x1280xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir b/gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir index bb19ae7..d99f327 100644 --- a/gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x2048xbf16>, %arg1: tensor<5120x1280xbf16>) -> tensor<2048x1280xbf16> { + func.func @main(%arg0: tensor<5120x2048xbf16>, %arg1: tensor<5120x1280xbf16>) -> tensor<2048x1280xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x1280xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x2048xbf16>, tensor<5120x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> return %2 : tensor<2048x1280xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir b/gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir index bd9d8c7..ef0bd8e 100644 --- a/gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir +++ b/gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x5120xbf16>, %arg1: tensor<1280x5120xbf16>) -> tensor<2048x1280xbf16> { + func.func @main(%arg0: tensor<2048x5120xbf16>, %arg1: tensor<1280x5120xbf16>) -> tensor<2048x1280xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2048x1280xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x5120xbf16>, tensor<1280x5120xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> return %2 : tensor<2048x1280xbf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_5120_f16.mlir b/gemm/mlir/gemm_2048_1280_5120_f16.mlir index bd29efa..c75885c 100644 --- 
a/gemm/mlir/gemm_2048_1280_5120_f16.mlir +++ b/gemm/mlir/gemm_2048_1280_5120_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x5120xf16>, %arg1: tensor<5120x1280xf16>) -> tensor<2048x1280xf16> { + func.func @main(%arg0: tensor<2048x5120xf16>, %arg1: tensor<5120x1280xf16>) -> tensor<2048x1280xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x5120xf16>, tensor<5120x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> return %2 : tensor<2048x1280xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir b/gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir index 4b63108..3d6fa99 100644 --- a/gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x2048xf16>, %arg1: tensor<5120x1280xf16>) -> tensor<2048x1280xf16> { + func.func @main(%arg0: tensor<5120x2048xf16>, %arg1: tensor<5120x1280xf16>) -> tensor<2048x1280xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x2048xf16>, tensor<5120x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> return %2 : tensor<2048x1280xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir b/gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir index aa4b90e..9ccaabb 100644 --- a/gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir +++ b/gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x5120xf16>, %arg1: tensor<1280x5120xf16>) -> tensor<2048x1280xf16> { + func.func @main(%arg0: tensor<2048x5120xf16>, %arg1: tensor<1280x5120xf16>) -> tensor<2048x1280xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x5120xf16>, tensor<1280x5120xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> return %2 : tensor<2048x1280xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_2048_1024_f16.mlir b/gemm/mlir/gemm_2048_2048_1024_f16.mlir index a88bfce..cc77455 100644 --- a/gemm/mlir/gemm_2048_2048_1024_f16.mlir +++ b/gemm/mlir/gemm_2048_2048_1024_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x1024xf16>, %arg1: tensor<1024x2048xf16>) -> tensor<2048x2048xf16> { + func.func @main(%arg0: tensor<2048x1024xf16>, %arg1: tensor<1024x2048xf16>) -> tensor<2048x2048xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x2048xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1024xf16>, tensor<1024x2048xf16>) outs(%1 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> return %2 : tensor<2048x2048xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_2048_65536_f16.mlir b/gemm/mlir/gemm_2048_2048_65536_f16.mlir index dec419c..34b9849 100644 --- a/gemm/mlir/gemm_2048_2048_65536_f16.mlir +++ b/gemm/mlir/gemm_2048_2048_65536_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x65536xf16>, %arg1: tensor<65536x2048xf16>) -> tensor<2048x2048xf16> { + func.func @main(%arg0: 
tensor<2048x65536xf16>, %arg1: tensor<65536x2048xf16>) -> tensor<2048x2048xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x2048xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x65536xf16>, tensor<65536x2048xf16>) outs(%1 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> return %2 : tensor<2048x2048xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_2048_8192_f16.mlir b/gemm/mlir/gemm_2048_2048_8192_f16.mlir index ca66186..e9f3dd8 100644 --- a/gemm/mlir/gemm_2048_2048_8192_f16.mlir +++ b/gemm/mlir/gemm_2048_2048_8192_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<8192x2048xf16>) -> tensor<2048x2048xf16> { + func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<8192x2048xf16>) -> tensor<2048x2048xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x2048xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<8192x2048xf16>) outs(%1 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> return %2 : tensor<2048x2048xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_8192_1024_f16.mlir b/gemm/mlir/gemm_2048_8192_1024_f16.mlir index 71c0ec6..edfa213 100644 --- a/gemm/mlir/gemm_2048_8192_1024_f16.mlir +++ b/gemm/mlir/gemm_2048_8192_1024_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x1024xf16>, %arg1: tensor<1024x8192xf16>) -> tensor<2048x8192xf16> { + func.func @main(%arg0: tensor<2048x1024xf16>, %arg1: tensor<1024x8192xf16>) -> tensor<2048x8192xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x8192xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1024xf16>, tensor<1024x8192xf16>) outs(%1 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> return %2 : tensor<2048x8192xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_8192_65536_f16.mlir b/gemm/mlir/gemm_2048_8192_65536_f16.mlir index 7f0e2f0..e419b78 100644 --- a/gemm/mlir/gemm_2048_8192_65536_f16.mlir +++ b/gemm/mlir/gemm_2048_8192_65536_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x65536xf16>, %arg1: tensor<65536x8192xf16>) -> tensor<2048x8192xf16> { + func.func @main(%arg0: tensor<2048x65536xf16>, %arg1: tensor<65536x8192xf16>) -> tensor<2048x8192xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x8192xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x65536xf16>, tensor<65536x8192xf16>) outs(%1 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> return %2 : tensor<2048x8192xf16> } -} +} diff --git a/gemm/mlir/gemm_2048_8192_8192_f16.mlir b/gemm/mlir/gemm_2048_8192_8192_f16.mlir index 09a023a..cc93de1 100644 --- a/gemm/mlir/gemm_2048_8192_8192_f16.mlir +++ b/gemm/mlir/gemm_2048_8192_8192_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<8192x8192xf16>) -> tensor<2048x8192xf16> { + func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<8192x8192xf16>) -> tensor<2048x8192xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2048x8192xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> %2 = linalg.matmul ins(%arg0, %arg1 : 
tensor<2048x8192xf16>, tensor<8192x8192xf16>) outs(%1 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> return %2 : tensor<2048x8192xf16> } -} +} diff --git a/gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir index f7aace2..1ab9cc0 100644 --- a/gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<2560x16xbf16> { + func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<2560x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2560x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x16xbf16>) -> tensor<2560x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<2560x16xbf16>) -> tensor<2560x16xbf16> return %2 : tensor<2560x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_2560_16_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_16_8192_f16_tA.mlir index ec78c38..fd4d377 100644 --- a/gemm/mlir/gemm_2560_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_2560_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x16xf16>) -> tensor<2560x16xf16> { + func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x16xf16>) -> tensor<2560x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2560x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x16xf16>) -> tensor<2560x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x16xf16>) outs(%1 : tensor<2560x16xf16>) -> tensor<2560x16xf16> return %2 : tensor<2560x16xf16> } -} +} diff --git a/gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir index c45082b..bf23aca 100644 --- a/gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<2560x1xbf16> { + func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<2560x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2560x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x1xbf16>) -> tensor<2560x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<2560x1xbf16>) -> tensor<2560x1xbf16> return %2 : tensor<2560x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_2560_1_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_1_8192_f16_tA.mlir index 13ef082..e6b86b4 100644 --- a/gemm/mlir/gemm_2560_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_2560_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x1xf16>) -> tensor<2560x1xf16> { + func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x1xf16>) -> tensor<2560x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2560x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x1xf16>) -> tensor<2560x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x1xf16>) outs(%1 : tensor<2560x1xf16>) -> tensor<2560x1xf16> return %2 : tensor<2560x1xf16> } -} +} diff --git a/gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir index 4f3eb6b..de185be 100644 --- a/gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir +++ 
b/gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<2560x2xbf16> { + func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<2560x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2560x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x2xbf16>) -> tensor<2560x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<2560x2xbf16>) -> tensor<2560x2xbf16> return %2 : tensor<2560x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_2560_2_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_2_8192_f16_tA.mlir index c82ae7a..3e6664e 100644 --- a/gemm/mlir/gemm_2560_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_2560_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x2xf16>) -> tensor<2560x2xf16> { + func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x2xf16>) -> tensor<2560x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2560x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x2xf16>) -> tensor<2560x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x2xf16>) outs(%1 : tensor<2560x2xf16>) -> tensor<2560x2xf16> return %2 : tensor<2560x2xf16> } -} +} diff --git a/gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir index a25b3b1..45d0840 100644 --- a/gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<2560x32xbf16> { + func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<2560x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2560x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x32xbf16>) -> tensor<2560x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<2560x32xbf16>) -> tensor<2560x32xbf16> return %2 : tensor<2560x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_2560_32_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_32_8192_f16_tA.mlir index 961e0f4..456b6c6 100644 --- a/gemm/mlir/gemm_2560_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_2560_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x32xf16>) -> tensor<2560x32xf16> { + func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x32xf16>) -> tensor<2560x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2560x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x32xf16>) -> tensor<2560x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x32xf16>) outs(%1 : tensor<2560x32xf16>) -> tensor<2560x32xf16> return %2 : tensor<2560x32xf16> } -} +} diff --git a/gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir index ac70b7d..d377ec1 100644 --- a/gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<2560x4xbf16> { + func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<2560x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 
= tensor.empty() : tensor<2560x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x4xbf16>) -> tensor<2560x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<2560x4xbf16>) -> tensor<2560x4xbf16> return %2 : tensor<2560x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_2560_4_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_4_8192_f16_tA.mlir index 0832a98..a152ec3 100644 --- a/gemm/mlir/gemm_2560_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_2560_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x4xf16>) -> tensor<2560x4xf16> { + func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x4xf16>) -> tensor<2560x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2560x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x4xf16>) -> tensor<2560x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x4xf16>) outs(%1 : tensor<2560x4xf16>) -> tensor<2560x4xf16> return %2 : tensor<2560x4xf16> } -} +} diff --git a/gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir index e46598a..76c1250 100644 --- a/gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<2560x8xbf16> { + func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<2560x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2560x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x8xbf16>) -> tensor<2560x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<2560x8xbf16>) -> tensor<2560x8xbf16> return %2 : tensor<2560x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_2560_8_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_8_8192_f16_tA.mlir index 5ce7a8f..fff4a68 100644 --- a/gemm/mlir/gemm_2560_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_2560_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x8xf16>) -> tensor<2560x8xf16> { + func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x8xf16>) -> tensor<2560x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<2560x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x8xf16>) -> tensor<2560x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x8xf16>) outs(%1 : tensor<2560x8xf16>) -> tensor<2560x8xf16> return %2 : tensor<2560x8xf16> } -} +} diff --git a/gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir index 58da7c2..e06171a 100644 --- a/gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<27648x16xbf16> { + func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<27648x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<27648x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x16xbf16>) -> tensor<27648x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<27648x16xbf16>) -> tensor<27648x16xbf16> return %2 : tensor<27648x16xbf16> } -} +} diff 
--git a/gemm/mlir/gemm_27648_16_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_16_5120_f16_tA.mlir index e7431ec..9af970d 100644 --- a/gemm/mlir/gemm_27648_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_27648_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x16xf16>) -> tensor<27648x16xf16> { + func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x16xf16>) -> tensor<27648x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<27648x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x16xf16>) -> tensor<27648x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x16xf16>) outs(%1 : tensor<27648x16xf16>) -> tensor<27648x16xf16> return %2 : tensor<27648x16xf16> } -} +} diff --git a/gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir index c2f34a7..dda9b15 100644 --- a/gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<27648x1xbf16> { + func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<27648x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<27648x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x1xbf16>) -> tensor<27648x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<27648x1xbf16>) -> tensor<27648x1xbf16> return %2 : tensor<27648x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_27648_1_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_1_5120_f16_tA.mlir index 5e83ea2..f2d5c42 100644 --- a/gemm/mlir/gemm_27648_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_27648_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x1xf16>) -> tensor<27648x1xf16> { + func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x1xf16>) -> tensor<27648x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<27648x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x1xf16>) -> tensor<27648x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x1xf16>) outs(%1 : tensor<27648x1xf16>) -> tensor<27648x1xf16> return %2 : tensor<27648x1xf16> } -} +} diff --git a/gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir index b0dd205..e16cd24 100644 --- a/gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<27648x2xbf16> { + func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<27648x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<27648x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x2xbf16>) -> tensor<27648x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<27648x2xbf16>) -> tensor<27648x2xbf16> return %2 : tensor<27648x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_27648_2_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_2_5120_f16_tA.mlir index 82058eb..dcf4508 100644 --- a/gemm/mlir/gemm_27648_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_27648_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: 
tensor<5120x27648xf16>, %arg1: tensor<5120x2xf16>) -> tensor<27648x2xf16> { + func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x2xf16>) -> tensor<27648x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<27648x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x2xf16>) -> tensor<27648x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x2xf16>) outs(%1 : tensor<27648x2xf16>) -> tensor<27648x2xf16> return %2 : tensor<27648x2xf16> } -} +} diff --git a/gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir index 32277d3..0a408fd 100644 --- a/gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<27648x32xbf16> { + func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<27648x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<27648x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x32xbf16>) -> tensor<27648x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<27648x32xbf16>) -> tensor<27648x32xbf16> return %2 : tensor<27648x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_27648_32_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_32_5120_f16_tA.mlir index 71aacf2..90927a3 100644 --- a/gemm/mlir/gemm_27648_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_27648_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x32xf16>) -> tensor<27648x32xf16> { + func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x32xf16>) -> tensor<27648x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<27648x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x32xf16>) -> tensor<27648x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x32xf16>) outs(%1 : tensor<27648x32xf16>) -> tensor<27648x32xf16> return %2 : tensor<27648x32xf16> } -} +} diff --git a/gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir index b0357df..20f2150 100644 --- a/gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<27648x4xbf16> { + func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<27648x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<27648x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x4xbf16>) -> tensor<27648x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<27648x4xbf16>) -> tensor<27648x4xbf16> return %2 : tensor<27648x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_27648_4_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_4_5120_f16_tA.mlir index 1801fc5..af948b5 100644 --- a/gemm/mlir/gemm_27648_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_27648_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x4xf16>) -> tensor<27648x4xf16> { + func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x4xf16>) -> tensor<27648x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<27648x4xf16> 
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x4xf16>) -> tensor<27648x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x4xf16>) outs(%1 : tensor<27648x4xf16>) -> tensor<27648x4xf16> return %2 : tensor<27648x4xf16> } -} +} diff --git a/gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir index 0a79fc5..fd43a3e 100644 --- a/gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<27648x8xbf16> { + func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<27648x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<27648x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x8xbf16>) -> tensor<27648x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<27648x8xbf16>) -> tensor<27648x8xbf16> return %2 : tensor<27648x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_27648_8_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_8_5120_f16_tA.mlir index fd5f73a..6d0ec2e 100644 --- a/gemm/mlir/gemm_27648_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_27648_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x8xf16>) -> tensor<27648x8xf16> { + func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x8xf16>) -> tensor<27648x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<27648x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x8xf16>) -> tensor<27648x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x8xf16>) outs(%1 : tensor<27648x8xf16>) -> tensor<27648x8xf16> return %2 : tensor<27648x8xf16> } -} +} diff --git a/gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir index 8099b7e..10c20ee 100644 --- a/gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<28672x16xbf16> { + func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<28672x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<28672x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x16xbf16>) -> tensor<28672x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<28672x16xbf16>) -> tensor<28672x16xbf16> return %2 : tensor<28672x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_28672_16_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_16_8192_f16_tA.mlir index 83e15ae..f923157 100644 --- a/gemm/mlir/gemm_28672_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_28672_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x16xf16>) -> tensor<28672x16xf16> { + func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x16xf16>) -> tensor<28672x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<28672x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x16xf16>) -> tensor<28672x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x16xf16>) outs(%1 : tensor<28672x16xf16>) -> tensor<28672x16xf16> return %2 : 
tensor<28672x16xf16> } -} +} diff --git a/gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir index e5bad51..6a24568 100644 --- a/gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<28672x1xbf16> { + func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<28672x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<28672x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x1xbf16>) -> tensor<28672x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<28672x1xbf16>) -> tensor<28672x1xbf16> return %2 : tensor<28672x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_28672_1_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_1_8192_f16_tA.mlir index 76b2743..a4bb37c 100644 --- a/gemm/mlir/gemm_28672_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_28672_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x1xf16>) -> tensor<28672x1xf16> { + func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x1xf16>) -> tensor<28672x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<28672x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x1xf16>) -> tensor<28672x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x1xf16>) outs(%1 : tensor<28672x1xf16>) -> tensor<28672x1xf16> return %2 : tensor<28672x1xf16> } -} +} diff --git a/gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir index 280a4e1..24fd156 100644 --- a/gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<28672x2xbf16> { + func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<28672x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<28672x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x2xbf16>) -> tensor<28672x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<28672x2xbf16>) -> tensor<28672x2xbf16> return %2 : tensor<28672x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_28672_2_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_2_8192_f16_tA.mlir index 715fb2f..85df0ac 100644 --- a/gemm/mlir/gemm_28672_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_28672_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x2xf16>) -> tensor<28672x2xf16> { + func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x2xf16>) -> tensor<28672x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<28672x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x2xf16>) -> tensor<28672x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x2xf16>) outs(%1 : tensor<28672x2xf16>) -> tensor<28672x2xf16> return %2 : tensor<28672x2xf16> } -} +} diff --git a/gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir index 37bcc48..e920955 100644 --- a/gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - 
func.func @main_0(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<28672x32xbf16> { + func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<28672x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<28672x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x32xbf16>) -> tensor<28672x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<28672x32xbf16>) -> tensor<28672x32xbf16> return %2 : tensor<28672x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_28672_32_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_32_8192_f16_tA.mlir index 1537d79..44a1361 100644 --- a/gemm/mlir/gemm_28672_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_28672_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x32xf16>) -> tensor<28672x32xf16> { + func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x32xf16>) -> tensor<28672x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<28672x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x32xf16>) -> tensor<28672x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x32xf16>) outs(%1 : tensor<28672x32xf16>) -> tensor<28672x32xf16> return %2 : tensor<28672x32xf16> } -} +} diff --git a/gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir index 7470e52..7ce0353 100644 --- a/gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<28672x4xbf16> { + func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<28672x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<28672x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x4xbf16>) -> tensor<28672x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<28672x4xbf16>) -> tensor<28672x4xbf16> return %2 : tensor<28672x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_28672_4_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_4_8192_f16_tA.mlir index ed276d3..a773111 100644 --- a/gemm/mlir/gemm_28672_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_28672_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x4xf16>) -> tensor<28672x4xf16> { + func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x4xf16>) -> tensor<28672x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<28672x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x4xf16>) -> tensor<28672x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x4xf16>) outs(%1 : tensor<28672x4xf16>) -> tensor<28672x4xf16> return %2 : tensor<28672x4xf16> } -} +} diff --git a/gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir index 2db383b..5a2541f 100644 --- a/gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<28672x8xbf16> { + func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<28672x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = 
tensor.empty() : tensor<28672x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x8xbf16>) -> tensor<28672x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<28672x8xbf16>) -> tensor<28672x8xbf16> return %2 : tensor<28672x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_28672_8_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_8_8192_f16_tA.mlir index 94a2919..9226cfe 100644 --- a/gemm/mlir/gemm_28672_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_28672_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x8xf16>) -> tensor<28672x8xf16> { + func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x8xf16>) -> tensor<28672x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<28672x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x8xf16>) -> tensor<28672x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x8xf16>) outs(%1 : tensor<28672x8xf16>) -> tensor<28672x8xf16> return %2 : tensor<28672x8xf16> } -} +} diff --git a/gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir b/gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir index ad41cc8..1040350 100644 --- a/gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir +++ b/gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2x8192xbf16>, %arg1: tensor<1280x8192xbf16>) -> tensor<2x1280xbf16> { + func.func @main(%arg0: tensor<2x8192xbf16>, %arg1: tensor<1280x8192xbf16>) -> tensor<2x1280xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2x1280xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2x1280xbf16>) -> tensor<2x1280xbf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2x8192xbf16>, tensor<1280x8192xbf16>) outs(%1 : tensor<2x1280xbf16>) -> tensor<2x1280xbf16> return %2 : tensor<2x1280xbf16> } -} +} diff --git a/gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir b/gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir index c25a2e1..7f6b6ea 100644 --- a/gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir +++ b/gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2x8192xbf16>, %arg1: tensor<3584x8192xbf16>) -> tensor<2x3584xbf16> { + func.func @main(%arg0: tensor<2x8192xbf16>, %arg1: tensor<3584x8192xbf16>) -> tensor<2x3584xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2x3584xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2x3584xbf16>) -> tensor<2x3584xbf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2x8192xbf16>, tensor<3584x8192xbf16>) outs(%1 : tensor<2x3584xbf16>) -> tensor<2x3584xbf16> return %2 : tensor<2x3584xbf16> } -} +} diff --git a/gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir b/gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir index 25ca618..6ac8002 100644 --- a/gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir +++ b/gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2x8192xbf16>, %arg1: tensor<7168x8192xbf16>) -> tensor<2x7168xbf16> { + func.func @main(%arg0: tensor<2x8192xbf16>, %arg1: tensor<7168x8192xbf16>) -> tensor<2x7168xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<2x7168xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2x7168xbf16>) -> tensor<2x7168xbf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2x8192xbf16>, tensor<7168x8192xbf16>) outs(%1 : tensor<2x7168xbf16>) -> tensor<2x7168xbf16> return %2 : tensor<2x7168xbf16> } 
-} +} diff --git a/gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir index 6b11ff1..986fbe3 100644 --- a/gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<32000x16xbf16> { + func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<32000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16> return %2 : tensor<32000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_16_5120_f16_tA.mlir index b6d25de..bb83872 100644 --- a/gemm/mlir/gemm_32000_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<32000x16xf16> { + func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<32000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x16xf16>) -> tensor<32000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<32000x16xf16>) -> tensor<32000x16xf16> return %2 : tensor<32000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir index d2ad418..af63a99 100644 --- a/gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<32000x16xbf16> { + func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<32000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16> return %2 : tensor<32000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_16_8192_f16_tA.mlir index 5802b26..9881c6e 100644 --- a/gemm/mlir/gemm_32000_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<32000x16xf16> { + func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<32000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x16xf16>) -> tensor<32000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<32000x16xf16>) -> tensor<32000x16xf16> return %2 : tensor<32000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir index 75a9deb..4d33257 100644 --- a/gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir @@ 
-1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<32000x1xbf16> { + func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<32000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16> return %2 : tensor<32000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_1_5120_f16_tA.mlir index 8805125..9849f9c 100644 --- a/gemm/mlir/gemm_32000_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<32000x1xf16> { + func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<32000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x1xf16>) -> tensor<32000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<32000x1xf16>) -> tensor<32000x1xf16> return %2 : tensor<32000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir index dd5abe0..cdf30e8 100644 --- a/gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<32000x1xbf16> { + func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<32000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16> return %2 : tensor<32000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_1_8192_f16_tA.mlir index 1d25619..fb063c9 100644 --- a/gemm/mlir/gemm_32000_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<32000x1xf16> { + func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<32000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x1xf16>) -> tensor<32000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<32000x1xf16>) -> tensor<32000x1xf16> return %2 : tensor<32000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir index 0cbecc4..ffcff1f 100644 --- a/gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<32000x2xbf16> { + func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<32000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = 
tensor.empty() : tensor<32000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16> return %2 : tensor<32000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_2_5120_f16_tA.mlir index b7112b7..74b1e6a 100644 --- a/gemm/mlir/gemm_32000_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<32000x2xf16> { + func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<32000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x2xf16>) -> tensor<32000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<32000x2xf16>) -> tensor<32000x2xf16> return %2 : tensor<32000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir index f614264..5c6b46d 100644 --- a/gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<32000x2xbf16> { + func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<32000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16> return %2 : tensor<32000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_2_8192_f16_tA.mlir index f6d736a..5623d69 100644 --- a/gemm/mlir/gemm_32000_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<32000x2xf16> { + func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<32000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x2xf16>) -> tensor<32000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<32000x2xf16>) -> tensor<32000x2xf16> return %2 : tensor<32000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir index 8dce8b5..6585842 100644 --- a/gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<32000x32xbf16> { + func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<32000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x32xbf16>) -> tensor<32000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<32000x32xbf16>) -> 
tensor<32000x32xbf16> return %2 : tensor<32000x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_32_5120_f16_tA.mlir index 0447404..dfc38c7 100644 --- a/gemm/mlir/gemm_32000_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<32000x32xf16> { + func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<32000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x32xf16>) -> tensor<32000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<32000x32xf16>) -> tensor<32000x32xf16> return %2 : tensor<32000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir index e3247ed..efaefd2 100644 --- a/gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<32000x32xbf16> { + func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<32000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x32xbf16>) -> tensor<32000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<32000x32xbf16>) -> tensor<32000x32xbf16> return %2 : tensor<32000x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_32_8192_f16_tA.mlir index 50202db..d82b086 100644 --- a/gemm/mlir/gemm_32000_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<32000x32xf16> { + func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<32000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x32xf16>) -> tensor<32000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<32000x32xf16>) -> tensor<32000x32xf16> return %2 : tensor<32000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir index 792f2bf..f52612c 100644 --- a/gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<32000x4xbf16> { + func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<32000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16> return %2 : tensor<32000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_4_5120_f16_tA.mlir index 4a22243..43e179b 100644 --- a/gemm/mlir/gemm_32000_4_5120_f16_tA.mlir +++ 
b/gemm/mlir/gemm_32000_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<32000x4xf16> { + func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<32000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x4xf16>) -> tensor<32000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x4xf16>) outs(%1 : tensor<32000x4xf16>) -> tensor<32000x4xf16> return %2 : tensor<32000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir index fe9b7b4..e3a7fcc 100644 --- a/gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<32000x4xbf16> { + func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<32000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16> return %2 : tensor<32000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_4_8192_f16_tA.mlir index 881ece7..c430b43 100644 --- a/gemm/mlir/gemm_32000_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<32000x4xf16> { + func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<32000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x4xf16>) -> tensor<32000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<32000x4xf16>) -> tensor<32000x4xf16> return %2 : tensor<32000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir index 7a8eda9..c3082b6 100644 --- a/gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<32000x8xbf16> { + func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<32000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16> return %2 : tensor<32000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_8_5120_f16_tA.mlir index 5258a32..84959d3 100644 --- a/gemm/mlir/gemm_32000_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<32000x8xf16> { + func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<32000x8xf16> { %cst = 
arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x8xf16>) -> tensor<32000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<32000x8xf16>) -> tensor<32000x8xf16> return %2 : tensor<32000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir index b23902c..7cbee49 100644 --- a/gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<32000x8xbf16> { + func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<32000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<32000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16> return %2 : tensor<32000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_32000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_8_8192_f16_tA.mlir index 675af52..67d245e 100644 --- a/gemm/mlir/gemm_32000_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_32000_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<32000x8xf16> { + func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<32000x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<32000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x8xf16>) -> tensor<32000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<32000x8xf16>) -> tensor<32000x8xf16> return %2 : tensor<32000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir index d15ae4e..ab4fa46 100644 --- a/gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<3456x16xbf16> { + func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<3456x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3456x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x16xbf16>) -> tensor<3456x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<3456x16xbf16>) -> tensor<3456x16xbf16> return %2 : tensor<3456x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_3456_16_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_16_5120_f16_tA.mlir index 196b277..0c15001 100644 --- a/gemm/mlir/gemm_3456_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3456_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x16xf16>) -> tensor<3456x16xf16> { + func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x16xf16>) -> tensor<3456x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3456x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x16xf16>) -> tensor<3456x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x16xf16>) outs(%1 : tensor<3456x16xf16>) -> 
tensor<3456x16xf16> return %2 : tensor<3456x16xf16> } -} +} diff --git a/gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir index 1f33608..754923e 100644 --- a/gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<3456x1xbf16> { + func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<3456x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3456x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x1xbf16>) -> tensor<3456x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<3456x1xbf16>) -> tensor<3456x1xbf16> return %2 : tensor<3456x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_3456_1_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_1_5120_f16_tA.mlir index e80c247..a179e69 100644 --- a/gemm/mlir/gemm_3456_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3456_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x1xf16>) -> tensor<3456x1xf16> { + func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x1xf16>) -> tensor<3456x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3456x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x1xf16>) -> tensor<3456x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x1xf16>) outs(%1 : tensor<3456x1xf16>) -> tensor<3456x1xf16> return %2 : tensor<3456x1xf16> } -} +} diff --git a/gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir index bcc53a2..68afe12 100644 --- a/gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<3456x2xbf16> { + func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<3456x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3456x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x2xbf16>) -> tensor<3456x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<3456x2xbf16>) -> tensor<3456x2xbf16> return %2 : tensor<3456x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_3456_2_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_2_5120_f16_tA.mlir index 6ce2677..c0fe5f9 100644 --- a/gemm/mlir/gemm_3456_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3456_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x2xf16>) -> tensor<3456x2xf16> { + func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x2xf16>) -> tensor<3456x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3456x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x2xf16>) -> tensor<3456x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x2xf16>) outs(%1 : tensor<3456x2xf16>) -> tensor<3456x2xf16> return %2 : tensor<3456x2xf16> } -} +} diff --git a/gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir index f203e2d..9b8159a 100644 --- a/gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: 
tensor<5120x3456xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<3456x32xbf16> { + func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<3456x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3456x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x32xbf16>) -> tensor<3456x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<3456x32xbf16>) -> tensor<3456x32xbf16> return %2 : tensor<3456x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_3456_32_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_32_5120_f16_tA.mlir index ee6f1aa..fe43487 100644 --- a/gemm/mlir/gemm_3456_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3456_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x32xf16>) -> tensor<3456x32xf16> { + func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x32xf16>) -> tensor<3456x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3456x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x32xf16>) -> tensor<3456x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x32xf16>) outs(%1 : tensor<3456x32xf16>) -> tensor<3456x32xf16> return %2 : tensor<3456x32xf16> } -} +} diff --git a/gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir index 540c119..d6bbdaa 100644 --- a/gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<3456x4xbf16> { + func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<3456x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3456x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x4xbf16>) -> tensor<3456x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<3456x4xbf16>) -> tensor<3456x4xbf16> return %2 : tensor<3456x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_3456_4_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_4_5120_f16_tA.mlir index 46490bf..d1ba93e 100644 --- a/gemm/mlir/gemm_3456_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3456_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x4xf16>) -> tensor<3456x4xf16> { + func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x4xf16>) -> tensor<3456x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3456x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x4xf16>) -> tensor<3456x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x4xf16>) outs(%1 : tensor<3456x4xf16>) -> tensor<3456x4xf16> return %2 : tensor<3456x4xf16> } -} +} diff --git a/gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir index f616913..b7b3a1e 100644 --- a/gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<3456x8xbf16> { + func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<3456x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3456x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : 
tensor<3456x8xbf16>) -> tensor<3456x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<3456x8xbf16>) -> tensor<3456x8xbf16> return %2 : tensor<3456x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_3456_8_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_8_5120_f16_tA.mlir index e661d74..60f9e0c 100644 --- a/gemm/mlir/gemm_3456_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3456_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x8xf16>) -> tensor<3456x8xf16> { + func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x8xf16>) -> tensor<3456x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3456x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x8xf16>) -> tensor<3456x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x8xf16>) outs(%1 : tensor<3456x8xf16>) -> tensor<3456x8xf16> return %2 : tensor<3456x8xf16> } -} +} diff --git a/gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir index c970daf..63c122d 100644 --- a/gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<3840x16xbf16> { + func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<3840x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3840x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x16xbf16>) -> tensor<3840x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<3840x16xbf16>) -> tensor<3840x16xbf16> return %2 : tensor<3840x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_3840_16_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_16_5120_f16_tA.mlir index 3af2ad7..5ed7814 100644 --- a/gemm/mlir/gemm_3840_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3840_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x16xf16>) -> tensor<3840x16xf16> { + func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x16xf16>) -> tensor<3840x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3840x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x16xf16>) -> tensor<3840x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x16xf16>) outs(%1 : tensor<3840x16xf16>) -> tensor<3840x16xf16> return %2 : tensor<3840x16xf16> } -} +} diff --git a/gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir index bd9295a..30fce43 100644 --- a/gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<3840x1xbf16> { + func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<3840x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3840x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x1xbf16>) -> tensor<3840x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<3840x1xbf16>) -> tensor<3840x1xbf16> return %2 : tensor<3840x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_3840_1_5120_f16_tA.mlir 
b/gemm/mlir/gemm_3840_1_5120_f16_tA.mlir index 2e5ad52..c83b20c 100644 --- a/gemm/mlir/gemm_3840_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3840_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x1xf16>) -> tensor<3840x1xf16> { + func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x1xf16>) -> tensor<3840x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3840x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x1xf16>) -> tensor<3840x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x1xf16>) outs(%1 : tensor<3840x1xf16>) -> tensor<3840x1xf16> return %2 : tensor<3840x1xf16> } -} +} diff --git a/gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir index e851a6b..fde61e4 100644 --- a/gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<3840x2xbf16> { + func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<3840x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3840x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x2xbf16>) -> tensor<3840x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<3840x2xbf16>) -> tensor<3840x2xbf16> return %2 : tensor<3840x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_3840_2_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_2_5120_f16_tA.mlir index 2e1f931..3526c21 100644 --- a/gemm/mlir/gemm_3840_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3840_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x2xf16>) -> tensor<3840x2xf16> { + func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x2xf16>) -> tensor<3840x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3840x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x2xf16>) -> tensor<3840x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x2xf16>) outs(%1 : tensor<3840x2xf16>) -> tensor<3840x2xf16> return %2 : tensor<3840x2xf16> } -} +} diff --git a/gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir index 75c89de..aae821a 100644 --- a/gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<3840x32xbf16> { + func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<3840x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3840x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x32xbf16>) -> tensor<3840x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<3840x32xbf16>) -> tensor<3840x32xbf16> return %2 : tensor<3840x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_3840_32_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_32_5120_f16_tA.mlir index cba4e56..1491630 100644 --- a/gemm/mlir/gemm_3840_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3840_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x32xf16>) -> tensor<3840x32xf16> { + func.func @main(%arg0: 
tensor<5120x3840xf16>, %arg1: tensor<5120x32xf16>) -> tensor<3840x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3840x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x32xf16>) -> tensor<3840x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x32xf16>) outs(%1 : tensor<3840x32xf16>) -> tensor<3840x32xf16> return %2 : tensor<3840x32xf16> } -} +} diff --git a/gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir index 19b8835..fe34d3f 100644 --- a/gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<3840x4xbf16> { + func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<3840x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3840x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x4xbf16>) -> tensor<3840x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<3840x4xbf16>) -> tensor<3840x4xbf16> return %2 : tensor<3840x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_3840_4_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_4_5120_f16_tA.mlir index 02a0213..eab6a7c 100644 --- a/gemm/mlir/gemm_3840_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3840_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x4xf16>) -> tensor<3840x4xf16> { + func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x4xf16>) -> tensor<3840x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3840x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x4xf16>) -> tensor<3840x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x4xf16>) outs(%1 : tensor<3840x4xf16>) -> tensor<3840x4xf16> return %2 : tensor<3840x4xf16> } -} +} diff --git a/gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir index 04f3bc3..84bb52a 100644 --- a/gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<3840x8xbf16> { + func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<3840x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<3840x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x8xbf16>) -> tensor<3840x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<3840x8xbf16>) -> tensor<3840x8xbf16> return %2 : tensor<3840x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_3840_8_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_8_5120_f16_tA.mlir index 3565fd0..8c91198 100644 --- a/gemm/mlir/gemm_3840_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_3840_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x8xf16>) -> tensor<3840x8xf16> { + func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x8xf16>) -> tensor<3840x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<3840x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x8xf16>) -> tensor<3840x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x8xf16>) outs(%1 
: tensor<3840x8xf16>) -> tensor<3840x8xf16> return %2 : tensor<3840x8xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir index abab876..01c0a78 100644 --- a/gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<4000x16xbf16> { + func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<4000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16> return %2 : tensor<4000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_16_5120_f16_tA.mlir index a8be651..3eb9fe7 100644 --- a/gemm/mlir/gemm_4000_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<4000x16xf16> { + func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<4000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x16xf16>) -> tensor<4000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<4000x16xf16>) -> tensor<4000x16xf16> return %2 : tensor<4000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir index 4a9f61a..a64464a 100644 --- a/gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<4000x16xbf16> { + func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<4000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16> return %2 : tensor<4000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_16_8192_f16_tA.mlir index 0782415..68f9cda 100644 --- a/gemm/mlir/gemm_4000_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<4000x16xf16> { + func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<4000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x16xf16>) -> tensor<4000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<4000x16xf16>) -> tensor<4000x16xf16> return %2 : tensor<4000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir index 308e330..857de41 100644 --- a/gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir +++ 
b/gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<4000x1xbf16> { + func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<4000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16> return %2 : tensor<4000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_1_5120_f16_tA.mlir index 32ba2b6..f64c226 100644 --- a/gemm/mlir/gemm_4000_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<4000x1xf16> { + func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<4000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x1xf16>) -> tensor<4000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<4000x1xf16>) -> tensor<4000x1xf16> return %2 : tensor<4000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir index 7d55ed4..c98f58c 100644 --- a/gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<4000x1xbf16> { + func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<4000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16> return %2 : tensor<4000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_1_8192_f16_tA.mlir index 4058c6b..5aaef53 100644 --- a/gemm/mlir/gemm_4000_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<4000x1xf16> { + func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<4000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x1xf16>) -> tensor<4000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<4000x1xf16>) -> tensor<4000x1xf16> return %2 : tensor<4000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir index e6acf12..cf6d890 100644 --- a/gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<4000x2xbf16> { + func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<4000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : 
tensor<4000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> return %2 : tensor<4000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_2_5120_f16_tA.mlir index 1b4232f..1d7ef35 100644 --- a/gemm/mlir/gemm_4000_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<4000x2xf16> { + func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<4000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x2xf16>) -> tensor<4000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<4000x2xf16>) -> tensor<4000x2xf16> return %2 : tensor<4000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir index ab005d9..1081115 100644 --- a/gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<4000x2xbf16> { + func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<4000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> return %2 : tensor<4000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_2_8192_f16_tA.mlir index f760b77..5d645df 100644 --- a/gemm/mlir/gemm_4000_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<4000x2xf16> { + func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<4000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x2xf16>) -> tensor<4000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<4000x2xf16>) -> tensor<4000x2xf16> return %2 : tensor<4000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir index 84b5c7e..faa22ff 100644 --- a/gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<4000x32xbf16> { + func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<4000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> return %2 : tensor<4000x32xbf16> } -} +} diff --git 
a/gemm/mlir/gemm_4000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_32_5120_f16_tA.mlir index 6647868..eb8e87e 100644 --- a/gemm/mlir/gemm_4000_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<4000x32xf16> { + func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<4000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x32xf16>) -> tensor<4000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<4000x32xf16>) -> tensor<4000x32xf16> return %2 : tensor<4000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir index ea59621..0688fe2 100644 --- a/gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<4000x32xbf16> { + func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<4000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> return %2 : tensor<4000x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_32_8192_f16_tA.mlir index 0eb7bad..d261394 100644 --- a/gemm/mlir/gemm_4000_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<4000x32xf16> { + func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<4000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x32xf16>) -> tensor<4000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<4000x32xf16>) -> tensor<4000x32xf16> return %2 : tensor<4000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir index 8930578..ee32dc1 100644 --- a/gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<4000x4xbf16> { + func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<4000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> return %2 : tensor<4000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_4_5120_f16_tA.mlir index 6189801..61b5e3d 100644 --- a/gemm/mlir/gemm_4000_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x4xf16>) 
-> tensor<4000x4xf16> { + func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<4000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x4xf16>) -> tensor<4000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x4xf16>) outs(%1 : tensor<4000x4xf16>) -> tensor<4000x4xf16> return %2 : tensor<4000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir index a26e946..1f73b7e 100644 --- a/gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<4000x4xbf16> { + func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<4000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> return %2 : tensor<4000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_4_8192_f16_tA.mlir index 03b39b8..f85ff47 100644 --- a/gemm/mlir/gemm_4000_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<4000x4xf16> { + func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<4000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x4xf16>) -> tensor<4000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<4000x4xf16>) -> tensor<4000x4xf16> return %2 : tensor<4000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir index 89a98f6..a59e9b6 100644 --- a/gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<4000x8xbf16> { + func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<4000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16> return %2 : tensor<4000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_8_5120_f16_tA.mlir index 1801f29..2821933 100644 --- a/gemm/mlir/gemm_4000_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<4000x8xf16> { + func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<4000x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x8xf16>) -> tensor<4000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : 
tensor<5120x4000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<4000x8xf16>) -> tensor<4000x8xf16> return %2 : tensor<4000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir index 8ad4e74..bbaeb69 100644 --- a/gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<4000x8xbf16> { + func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<4000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16> return %2 : tensor<4000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_4000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_8_8192_f16_tA.mlir index 316083d..3bd900f 100644 --- a/gemm/mlir/gemm_4000_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_4000_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<4000x8xf16> { + func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<4000x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x8xf16>) -> tensor<4000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<4000x8xf16>) -> tensor<4000x8xf16> return %2 : tensor<4000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_4096_4096_8192_bf16.mlir b/gemm/mlir/gemm_4096_4096_8192_bf16.mlir index eaf1be6..da783d2 100644 --- a/gemm/mlir/gemm_4096_4096_8192_bf16.mlir +++ b/gemm/mlir/gemm_4096_4096_8192_bf16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<8192x4096xbf16>) -> tensor<4096x4096xbf16> { + func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<8192x4096xbf16>) -> tensor<4096x4096xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4096x4096xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<8192x4096xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> return %2 : tensor<4096x4096xbf16> } -} +} diff --git a/gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir b/gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir index dfacc5e..f9c0df8 100644 --- a/gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4096xbf16>, %arg1: tensor<8192x4096xbf16>) -> tensor<4096x4096xbf16> { + func.func @main(%arg0: tensor<8192x4096xbf16>, %arg1: tensor<8192x4096xbf16>) -> tensor<4096x4096xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4096x4096xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4096xbf16>, tensor<8192x4096xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> return %2 : tensor<4096x4096xbf16> } -} +} diff --git a/gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir b/gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir index 651bbcc..ff2a1ac 100644 
--- a/gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir +++ b/gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x8192xbf16>) -> tensor<4096x4096xbf16> { + func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x8192xbf16>) -> tensor<4096x4096xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<4096x4096xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x8192xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> return %2 : tensor<4096x4096xbf16> } -} +} diff --git a/gemm/mlir/gemm_4096_4096_8192_f16.mlir b/gemm/mlir/gemm_4096_4096_8192_f16.mlir index fa2f268..d21690a 100644 --- a/gemm/mlir/gemm_4096_4096_8192_f16.mlir +++ b/gemm/mlir/gemm_4096_4096_8192_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<8192x4096xf16>) -> tensor<4096x4096xf16> { + func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<8192x4096xf16>) -> tensor<4096x4096xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4096x4096xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<8192x4096xf16>) outs(%1 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> return %2 : tensor<4096x4096xf16> } -} +} diff --git a/gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir b/gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir index 86c37bf..f4ba892 100644 --- a/gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x4096xf16>, %arg1: tensor<8192x4096xf16>) -> tensor<4096x4096xf16> { + func.func @main(%arg0: tensor<8192x4096xf16>, %arg1: tensor<8192x4096xf16>) -> tensor<4096x4096xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4096x4096xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4096xf16>, tensor<8192x4096xf16>) outs(%1 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> return %2 : tensor<4096x4096xf16> } -} +} diff --git a/gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir b/gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir index da4a938..d96e00f 100644 --- a/gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir +++ b/gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x8192xf16>) -> tensor<4096x4096xf16> { + func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x8192xf16>) -> tensor<4096x4096xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4096x4096xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x8192xf16>) outs(%1 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> return %2 : tensor<4096x4096xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir index 7d55b6a..7e21b10 100644 --- a/gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x16xbf16>) -> 
tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_16_1280_f16_tA.mlir index 38ec94b..e777fe8 100644 --- a/gemm/mlir/gemm_5120_16_1280_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_1280_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir index a05a431..712a5a3 100644 --- a/gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_16_13824_f16_tA.mlir index db46cbf..e95a174 100644 --- a/gemm/mlir/gemm_5120_16_13824_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_13824_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir index 1a5833f..1f0b6cf 100644 --- a/gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : 
tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_16_1728_f16_tA.mlir index 0b4e6aa..c0efaf2 100644 --- a/gemm/mlir/gemm_5120_16_1728_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_1728_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir index 4e70387..d850d73 100644 --- a/gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_16_2560_f16_tA.mlir index 38f4d8e..e4183f4 100644 --- a/gemm/mlir/gemm_5120_16_2560_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_2560_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir index 3322b49..dab5177 100644 --- a/gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git 
a/gemm/mlir/gemm_5120_16_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_16_3456_f16_tA.mlir index 2f73f40..e4d9277 100644 --- a/gemm/mlir/gemm_5120_16_3456_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_3456_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir index d11355f..f5dfe26 100644 --- a/gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_16_5120_f16_tA.mlir index 509426e..71c7f1f 100644 --- a/gemm/mlir/gemm_5120_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_640_bf16_tA.mlir index 2aff311..20d9a68 100644 --- a/gemm/mlir/gemm_5120_16_640_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_640_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_640_f16_tA.mlir b/gemm/mlir/gemm_5120_16_640_f16_tA.mlir index 9f5483e..bf06141 100644 --- a/gemm/mlir/gemm_5120_16_640_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_640_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<640x5120xf16>, %arg1: 
tensor<640x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir index 0ecdb14..4ab4378 100644 --- a/gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_16_6912_f16_tA.mlir index fd21611..476253e 100644 --- a/gemm/mlir/gemm_5120_16_6912_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_6912_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> return %2 : tensor<5120x16xf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir index b031807..af65c87 100644 --- a/gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<5120x16xbf16> { + func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<5120x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<5120x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> return %2 : tensor<5120x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_5120_16_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_16_8192_f16_tA.mlir index 7f8df36..9acb611 100644 --- a/gemm/mlir/gemm_5120_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_5120_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x16xf16>) -> tensor<5120x16xf16> { + func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x16xf16>) -> tensor<5120x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<5120x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) 
-> tensor<5120x16xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16>
     return %2 : tensor<5120x16xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir
index 1499573..fbad7cb 100644
--- a/gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_1_1280_f16_tA.mlir
index 67867ae..a7e29cd 100644
--- a/gemm/mlir/gemm_5120_1_1280_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_1280_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir
index 76e7072..d006ff7 100644
--- a/gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_1_13824_f16_tA.mlir
index 0bf2d47..a9fcf15 100644
--- a/gemm/mlir/gemm_5120_1_13824_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_13824_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir
index 453e9e2..9417831 100644
--- a/gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_1_1728_f16_tA.mlir
index 69e0946..124f5a6 100644
--- a/gemm/mlir/gemm_5120_1_1728_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_1728_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir
index 762e752..3779817 100644
--- a/gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_1_2560_f16_tA.mlir
index dc68ca7..6258f4f 100644
--- a/gemm/mlir/gemm_5120_1_2560_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_2560_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir
index ecd1418..c2c0363 100644
--- a/gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_1_3456_f16_tA.mlir
index a635c3a..27728e7 100644
--- a/gemm/mlir/gemm_5120_1_3456_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_3456_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir
index 5dca089..e8652a1 100644
--- a/gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_1_5120_f16_tA.mlir
index 9ec5717..d36e54c 100644
--- a/gemm/mlir/gemm_5120_1_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_640_bf16_tA.mlir
index 731361c..3b414a8 100644
--- a/gemm/mlir/gemm_5120_1_640_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_640_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_640_f16_tA.mlir b/gemm/mlir/gemm_5120_1_640_f16_tA.mlir
index 6447cb1..f8bbbe2 100644
--- a/gemm/mlir/gemm_5120_1_640_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_640_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir
index ee4ac72..fdc2298 100644
--- a/gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_1_6912_f16_tA.mlir
index 9b18aad..be5c109 100644
--- a/gemm/mlir/gemm_5120_1_6912_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_6912_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir
index 4026c65..13e6f69 100644
--- a/gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<5120x1xbf16> {
+  func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<5120x1xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x1xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16>
     return %2 : tensor<5120x1xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_1_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_1_8192_f16_tA.mlir
index 3d64426..572ff85 100644
--- a/gemm/mlir/gemm_5120_1_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_1_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x1xf16>) -> tensor<5120x1xf16> {
+  func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x1xf16>) -> tensor<5120x1xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x1xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16>
     return %2 : tensor<5120x1xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir
index b6f4c9b..07b6e62 100644
--- a/gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_2_1280_f16_tA.mlir
index b557fb9..70ad768 100644
--- a/gemm/mlir/gemm_5120_2_1280_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_1280_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir
index eb48187..e83f65d 100644
--- a/gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_2_13824_f16_tA.mlir
index bd31359..e30738c 100644
--- a/gemm/mlir/gemm_5120_2_13824_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_13824_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir
index 6f4c566..8a04fb2 100644
--- a/gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_2_1728_f16_tA.mlir
index 021d2b5..2c77846 100644
--- a/gemm/mlir/gemm_5120_2_1728_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_1728_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir
index 7426ffb..25d142a 100644
--- a/gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_2_2560_f16_tA.mlir
index 4b1c18d..414bd86 100644
--- a/gemm/mlir/gemm_5120_2_2560_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_2560_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir
index 1f5cb73..3b81d86 100644
--- a/gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_2_3456_f16_tA.mlir
index 23dcf3c..fe954d2 100644
--- a/gemm/mlir/gemm_5120_2_3456_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_3456_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir
index 8e45849..6599984 100644
--- a/gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_2_5120_f16_tA.mlir
index 3498510..f88163e 100644
--- a/gemm/mlir/gemm_5120_2_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_640_bf16_tA.mlir
index df66ea8..8ade0ca 100644
--- a/gemm/mlir/gemm_5120_2_640_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_640_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_640_f16_tA.mlir b/gemm/mlir/gemm_5120_2_640_f16_tA.mlir
index 7e92ece..3c50f2f 100644
--- a/gemm/mlir/gemm_5120_2_640_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_640_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir
index 2135217..5f8b20a 100644
--- a/gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_2_6912_f16_tA.mlir
index f90ec73..7fe73cd 100644
--- a/gemm/mlir/gemm_5120_2_6912_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_6912_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir
index 794914a..4460592 100644
--- a/gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<5120x2xbf16> {
+  func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<5120x2xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x2xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16>
     return %2 : tensor<5120x2xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_2_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_2_8192_f16_tA.mlir
index 8c6ecbb..6e9ac82 100644
--- a/gemm/mlir/gemm_5120_2_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_2_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x2xf16>) -> tensor<5120x2xf16> {
+  func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x2xf16>) -> tensor<5120x2xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x2xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16>
     return %2 : tensor<5120x2xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir
index 0c54b39..256678e 100644
--- a/gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_32_1280_f16_tA.mlir
index 6123e81..e7f5580 100644
--- a/gemm/mlir/gemm_5120_32_1280_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_1280_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir
index 265fdc1..d84ed24 100644
--- a/gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_32_13824_f16_tA.mlir
index 0ab9e18..f50d0d0 100644
--- a/gemm/mlir/gemm_5120_32_13824_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_13824_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir
index 6687a7e..a4af4b4 100644
--- a/gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_32_1728_f16_tA.mlir
index e5e927e..16e7179 100644
--- a/gemm/mlir/gemm_5120_32_1728_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_1728_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir
index a5e716f..bea8cb5 100644
--- a/gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_32_2560_f16_tA.mlir
index e0b47f0..d4d7491 100644
--- a/gemm/mlir/gemm_5120_32_2560_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_2560_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir
index f48e631..a1ec40e 100644
--- a/gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_32_3456_f16_tA.mlir
index 4e674d1..8f6301c 100644
--- a/gemm/mlir/gemm_5120_32_3456_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_3456_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir
index 832c3c2..4c72158 100644
--- a/gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_32_5120_f16_tA.mlir
index fc13ee2..027a09f 100644
--- a/gemm/mlir/gemm_5120_32_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_640_bf16_tA.mlir
index 6684884..fec70cb 100644
--- a/gemm/mlir/gemm_5120_32_640_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_640_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_640_f16_tA.mlir b/gemm/mlir/gemm_5120_32_640_f16_tA.mlir
index 8d2c153..d2e3949 100644
--- a/gemm/mlir/gemm_5120_32_640_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_640_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir
index 9d389f1..7e22180 100644
--- a/gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_32_6912_f16_tA.mlir
index 5f3a76c..1d9947a 100644
--- a/gemm/mlir/gemm_5120_32_6912_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_6912_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir
index fa304ce..323437a 100644
--- a/gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<5120x32xbf16> {
+  func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<5120x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16>
     return %2 : tensor<5120x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_32_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_32_8192_f16_tA.mlir
index 13e4c8d..91e0026 100644
--- a/gemm/mlir/gemm_5120_32_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_32_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x32xf16>) -> tensor<5120x32xf16> {
+  func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x32xf16>) -> tensor<5120x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16>
     return %2 : tensor<5120x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir
index fb0b017..b02b975 100644
--- a/gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_4_1280_f16_tA.mlir
index c2129bb..cdbe240 100644
--- a/gemm/mlir/gemm_5120_4_1280_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_1280_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir
index 2f44985..c024c59 100644
--- a/gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_4_13824_f16_tA.mlir
index 8ba061f..1b355e9 100644
--- a/gemm/mlir/gemm_5120_4_13824_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_13824_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir
index 7db150c..77d316d 100644
--- a/gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_4_1728_f16_tA.mlir
index 5b697a3..b77fd46 100644
--- a/gemm/mlir/gemm_5120_4_1728_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_1728_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir
index e1775bd..b441065 100644
--- a/gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_4_2560_f16_tA.mlir
index 0fa3e27..78af1ae 100644
--- a/gemm/mlir/gemm_5120_4_2560_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_2560_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir
index 9a7c145..65e3813 100644
--- a/gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_4_3456_f16_tA.mlir
index cb83bba..055a56a 100644
--- a/gemm/mlir/gemm_5120_4_3456_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_3456_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir
index 051d5e1..133c6e2 100644
--- a/gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_4_5120_f16_tA.mlir
index 24df30b..3b6cabf 100644
--- a/gemm/mlir/gemm_5120_4_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_640_bf16_tA.mlir
index cb11302..1e22dd9 100644
--- a/gemm/mlir/gemm_5120_4_640_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_640_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_640_f16_tA.mlir b/gemm/mlir/gemm_5120_4_640_f16_tA.mlir
index 1cbfd8b..f7459f4 100644
--- a/gemm/mlir/gemm_5120_4_640_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_640_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir
index 579b577..9244683 100644
--- a/gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_4_6912_f16_tA.mlir
index d16b66d..f3c0b6a 100644
--- a/gemm/mlir/gemm_5120_4_6912_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_6912_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir
index 678fff8..1e39bcc 100644
--- a/gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<5120x4xbf16> {
+  func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<5120x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16>
     return %2 : tensor<5120x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_4_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_4_8192_f16_tA.mlir
index 708ed77..59ff5c1 100644
--- a/gemm/mlir/gemm_5120_4_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_4_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x4xf16>) -> tensor<5120x4xf16> {
+  func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x4xf16>) -> tensor<5120x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16>
     return %2 : tensor<5120x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir
index c9ec3dd..090d0a3 100644
--- a/gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_8_1280_f16_tA.mlir
index 332820c..68c2973 100644
--- a/gemm/mlir/gemm_5120_8_1280_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_1280_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir
index 72d5e9e..b80c0d8 100644
--- a/gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_8_13824_f16_tA.mlir
index 15e392b..77658a9 100644
--- a/gemm/mlir/gemm_5120_8_13824_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_13824_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir
index ee5abd5..3d405b3 100644
--- a/gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_8_1728_f16_tA.mlir
index 1be3b9b..9717a1c 100644
--- a/gemm/mlir/gemm_5120_8_1728_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_1728_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir
index e4f6f55..e20b534 100644
--- a/gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_8_2560_f16_tA.mlir
index 8eb2094..fcb3692 100644
--- a/gemm/mlir/gemm_5120_8_2560_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_2560_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir
index c8cecb5..e86a941 100644
--- a/gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_8_3456_f16_tA.mlir
index abd3026..b81b946 100644
--- a/gemm/mlir/gemm_5120_8_3456_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_3456_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir
index 06c159d..b66fabd 100644
--- a/gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_8_5120_f16_tA.mlir
index 45ee2f7..b42ef4d 100644
--- a/gemm/mlir/gemm_5120_8_5120_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_5120_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_640_bf16_tA.mlir
index a171f4a..919f4aa 100644
--- a/gemm/mlir/gemm_5120_8_640_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_640_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_640_f16_tA.mlir b/gemm/mlir/gemm_5120_8_640_f16_tA.mlir
index e64ddeb..2667615 100644
--- a/gemm/mlir/gemm_5120_8_640_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_640_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir
index 025a5f7..68be7d6 100644
--- a/gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_8_6912_f16_tA.mlir
index 828bcb1..b423ad4 100644
--- a/gemm/mlir/gemm_5120_8_6912_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_6912_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir
index 5a7f3ab..70c44a9 100644
--- a/gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<5120x8xbf16> {
+  func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<5120x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<5120x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16>
     return %2 : tensor<5120x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_5120_8_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_8_8192_f16_tA.mlir
index 8245617..79a3420 100644
--- a/gemm/mlir/gemm_5120_8_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_5120_8_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x8xf16>) -> tensor<5120x8xf16> {
+  func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x8xf16>) -> tensor<5120x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<5120x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16>
     return %2 : tensor<5120x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir
index eed5995..c05fd42 100644
--- a/gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<57344x16xbf16> {
+  func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<57344x16xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<57344x16xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x16xbf16>) -> tensor<57344x16xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<57344x16xbf16>) -> tensor<57344x16xbf16>
     return %2 : tensor<57344x16xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_57344_16_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_16_8192_f16_tA.mlir
index 88699d6..a27c3ca 100644
--- a/gemm/mlir/gemm_57344_16_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_57344_16_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x16xf16>) -> tensor<57344x16xf16> {
+  func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x16xf16>) -> tensor<57344x16xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<57344x16xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x16xf16>) -> tensor<57344x16xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x16xf16>) outs(%1 : tensor<57344x16xf16>) -> tensor<57344x16xf16>
     return %2 : tensor<57344x16xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir
index 09807d7..6b50b47 100644
--- a/gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir
+++
b/gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<57344x1xbf16> { + func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<57344x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<57344x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x1xbf16>) -> tensor<57344x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<57344x1xbf16>) -> tensor<57344x1xbf16> return %2 : tensor<57344x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_57344_1_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_1_8192_f16_tA.mlir index 49be87f..a391e24 100644 --- a/gemm/mlir/gemm_57344_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_57344_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x1xf16>) -> tensor<57344x1xf16> { + func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x1xf16>) -> tensor<57344x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<57344x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x1xf16>) -> tensor<57344x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x1xf16>) outs(%1 : tensor<57344x1xf16>) -> tensor<57344x1xf16> return %2 : tensor<57344x1xf16> } -} +} diff --git a/gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir index db022a1..b176f2c 100644 --- a/gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<57344x2xbf16> { + func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<57344x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<57344x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x2xbf16>) -> tensor<57344x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<57344x2xbf16>) -> tensor<57344x2xbf16> return %2 : tensor<57344x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_57344_2_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_2_8192_f16_tA.mlir index b0b3085..ffac68f 100644 --- a/gemm/mlir/gemm_57344_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_57344_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x2xf16>) -> tensor<57344x2xf16> { + func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x2xf16>) -> tensor<57344x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<57344x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x2xf16>) -> tensor<57344x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x2xf16>) outs(%1 : tensor<57344x2xf16>) -> tensor<57344x2xf16> return %2 : tensor<57344x2xf16> } -} +} diff --git a/gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir index 963021f..bbe0c75 100644 --- a/gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<57344x32xbf16> { + func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<57344x32xbf16> { 
%cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<57344x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x32xbf16>) -> tensor<57344x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<57344x32xbf16>) -> tensor<57344x32xbf16> return %2 : tensor<57344x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_57344_32_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_32_8192_f16_tA.mlir index 828eedb..34675d0 100644 --- a/gemm/mlir/gemm_57344_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_57344_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x32xf16>) -> tensor<57344x32xf16> { + func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x32xf16>) -> tensor<57344x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<57344x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x32xf16>) -> tensor<57344x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x32xf16>) outs(%1 : tensor<57344x32xf16>) -> tensor<57344x32xf16> return %2 : tensor<57344x32xf16> } -} +} diff --git a/gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir index 29d4cbf..2189c7e 100644 --- a/gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<57344x4xbf16> { + func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<57344x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<57344x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x4xbf16>) -> tensor<57344x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<57344x4xbf16>) -> tensor<57344x4xbf16> return %2 : tensor<57344x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_57344_4_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_4_8192_f16_tA.mlir index ba6f0f2..5419137 100644 --- a/gemm/mlir/gemm_57344_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_57344_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x4xf16>) -> tensor<57344x4xf16> { + func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x4xf16>) -> tensor<57344x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<57344x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x4xf16>) -> tensor<57344x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x4xf16>) outs(%1 : tensor<57344x4xf16>) -> tensor<57344x4xf16> return %2 : tensor<57344x4xf16> } -} +} diff --git a/gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir index 9334fb2..84785f7 100644 --- a/gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<57344x8xbf16> { + func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<57344x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<57344x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x8xbf16>) -> tensor<57344x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, 
tensor<8192x8xbf16>) outs(%1 : tensor<57344x8xbf16>) -> tensor<57344x8xbf16> return %2 : tensor<57344x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_57344_8_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_8_8192_f16_tA.mlir index e633414..58d7ded 100644 --- a/gemm/mlir/gemm_57344_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_57344_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x8xf16>) -> tensor<57344x8xf16> { + func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x8xf16>) -> tensor<57344x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<57344x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x8xf16>) -> tensor<57344x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x8xf16>) outs(%1 : tensor<57344x8xf16>) -> tensor<57344x8xf16> return %2 : tensor<57344x8xf16> } -} +} diff --git a/gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir index f624390..a9180ad 100644 --- a/gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<6912x16xbf16> { + func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<6912x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<6912x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x16xbf16>) -> tensor<6912x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<6912x16xbf16>) -> tensor<6912x16xbf16> return %2 : tensor<6912x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_6912_16_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_16_5120_f16_tA.mlir index 771c6da..47aaf92 100644 --- a/gemm/mlir/gemm_6912_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_6912_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x16xf16>) -> tensor<6912x16xf16> { + func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x16xf16>) -> tensor<6912x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<6912x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x16xf16>) -> tensor<6912x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x16xf16>) outs(%1 : tensor<6912x16xf16>) -> tensor<6912x16xf16> return %2 : tensor<6912x16xf16> } -} +} diff --git a/gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir index cc30e53..f087893 100644 --- a/gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<6912x1xbf16> { + func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<6912x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<6912x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x1xbf16>) -> tensor<6912x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<6912x1xbf16>) -> tensor<6912x1xbf16> return %2 : tensor<6912x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_6912_1_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_1_5120_f16_tA.mlir index 51cd9e1..beee00d 100644 --- a/gemm/mlir/gemm_6912_1_5120_f16_tA.mlir +++ 
b/gemm/mlir/gemm_6912_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x1xf16>) -> tensor<6912x1xf16> { + func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x1xf16>) -> tensor<6912x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<6912x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x1xf16>) -> tensor<6912x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x1xf16>) outs(%1 : tensor<6912x1xf16>) -> tensor<6912x1xf16> return %2 : tensor<6912x1xf16> } -} +} diff --git a/gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir index 94f0fa1..441ec83 100644 --- a/gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<6912x2xbf16> { + func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<6912x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<6912x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x2xbf16>) -> tensor<6912x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<6912x2xbf16>) -> tensor<6912x2xbf16> return %2 : tensor<6912x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_6912_2_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_2_5120_f16_tA.mlir index b658eae..397c7b2 100644 --- a/gemm/mlir/gemm_6912_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_6912_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x2xf16>) -> tensor<6912x2xf16> { + func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x2xf16>) -> tensor<6912x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<6912x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x2xf16>) -> tensor<6912x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x2xf16>) outs(%1 : tensor<6912x2xf16>) -> tensor<6912x2xf16> return %2 : tensor<6912x2xf16> } -} +} diff --git a/gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir index 73ee4e6..926a24a 100644 --- a/gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<6912x32xbf16> { + func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<6912x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<6912x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x32xbf16>) -> tensor<6912x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<6912x32xbf16>) -> tensor<6912x32xbf16> return %2 : tensor<6912x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_6912_32_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_32_5120_f16_tA.mlir index 90ec1b6..75888ec 100644 --- a/gemm/mlir/gemm_6912_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_6912_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x32xf16>) -> tensor<6912x32xf16> { + func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x32xf16>) -> tensor<6912x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = 
tensor.empty() : tensor<6912x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x32xf16>) -> tensor<6912x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x32xf16>) outs(%1 : tensor<6912x32xf16>) -> tensor<6912x32xf16> return %2 : tensor<6912x32xf16> } -} +} diff --git a/gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir index b68ea97..105402a 100644 --- a/gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<6912x4xbf16> { + func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<6912x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<6912x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x4xbf16>) -> tensor<6912x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<6912x4xbf16>) -> tensor<6912x4xbf16> return %2 : tensor<6912x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_6912_4_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_4_5120_f16_tA.mlir index c1dfedf..2938490 100644 --- a/gemm/mlir/gemm_6912_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_6912_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x4xf16>) -> tensor<6912x4xf16> { + func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x4xf16>) -> tensor<6912x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<6912x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x4xf16>) -> tensor<6912x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x4xf16>) outs(%1 : tensor<6912x4xf16>) -> tensor<6912x4xf16> return %2 : tensor<6912x4xf16> } -} +} diff --git a/gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir index 437677a..c62dc28 100644 --- a/gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<6912x8xbf16> { + func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<6912x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<6912x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x8xbf16>) -> tensor<6912x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<6912x8xbf16>) -> tensor<6912x8xbf16> return %2 : tensor<6912x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_6912_8_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_8_5120_f16_tA.mlir index 1eee406..0fc7b88 100644 --- a/gemm/mlir/gemm_6912_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_6912_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x8xf16>) -> tensor<6912x8xf16> { + func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x8xf16>) -> tensor<6912x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<6912x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x8xf16>) -> tensor<6912x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x8xf16>) outs(%1 : tensor<6912x8xf16>) -> tensor<6912x8xf16> return %2 : tensor<6912x8xf16> } -} +} diff --git 
a/gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir index 8f8f690..c7660f1 100644 --- a/gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<7168x16xbf16> { + func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<7168x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7168x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x16xbf16>) -> tensor<7168x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<7168x16xbf16>) -> tensor<7168x16xbf16> return %2 : tensor<7168x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_7168_16_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_16_8192_f16_tA.mlir index ac8a58c..3b4e48c 100644 --- a/gemm/mlir/gemm_7168_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_7168_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x16xf16>) -> tensor<7168x16xf16> { + func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x16xf16>) -> tensor<7168x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7168x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x16xf16>) -> tensor<7168x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x16xf16>) outs(%1 : tensor<7168x16xf16>) -> tensor<7168x16xf16> return %2 : tensor<7168x16xf16> } -} +} diff --git a/gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir index a159ede..41d8ee8 100644 --- a/gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<7168x1xbf16> { + func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<7168x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7168x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x1xbf16>) -> tensor<7168x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<7168x1xbf16>) -> tensor<7168x1xbf16> return %2 : tensor<7168x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_7168_1_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_1_8192_f16_tA.mlir index 42bc406..93b1d5e 100644 --- a/gemm/mlir/gemm_7168_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_7168_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x1xf16>) -> tensor<7168x1xf16> { + func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x1xf16>) -> tensor<7168x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7168x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x1xf16>) -> tensor<7168x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x1xf16>) outs(%1 : tensor<7168x1xf16>) -> tensor<7168x1xf16> return %2 : tensor<7168x1xf16> } -} +} diff --git a/gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir index 0850d0e..555cca9 100644 --- a/gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x2xbf16>) -> 
tensor<7168x2xbf16> { + func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<7168x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7168x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x2xbf16>) -> tensor<7168x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<7168x2xbf16>) -> tensor<7168x2xbf16> return %2 : tensor<7168x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_7168_2_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_2_8192_f16_tA.mlir index 038def0..4ab13c2 100644 --- a/gemm/mlir/gemm_7168_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_7168_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x2xf16>) -> tensor<7168x2xf16> { + func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x2xf16>) -> tensor<7168x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7168x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x2xf16>) -> tensor<7168x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x2xf16>) outs(%1 : tensor<7168x2xf16>) -> tensor<7168x2xf16> return %2 : tensor<7168x2xf16> } -} +} diff --git a/gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir index 63c7701..e6b536c 100644 --- a/gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<7168x32xbf16> { + func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<7168x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7168x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x32xbf16>) -> tensor<7168x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<7168x32xbf16>) -> tensor<7168x32xbf16> return %2 : tensor<7168x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_7168_32_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_32_8192_f16_tA.mlir index 80bb61a..2d2744f 100644 --- a/gemm/mlir/gemm_7168_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_7168_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x32xf16>) -> tensor<7168x32xf16> { + func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x32xf16>) -> tensor<7168x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7168x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x32xf16>) -> tensor<7168x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x32xf16>) outs(%1 : tensor<7168x32xf16>) -> tensor<7168x32xf16> return %2 : tensor<7168x32xf16> } -} +} diff --git a/gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir index 2d51b63..98c5839 100644 --- a/gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<7168x4xbf16> { + func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<7168x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7168x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x4xbf16>) -> tensor<7168x4xbf16> %2 = 
linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<7168x4xbf16>) -> tensor<7168x4xbf16> return %2 : tensor<7168x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_7168_4_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_4_8192_f16_tA.mlir index bf655f6..1bf5e1c 100644 --- a/gemm/mlir/gemm_7168_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_7168_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x4xf16>) -> tensor<7168x4xf16> { + func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x4xf16>) -> tensor<7168x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7168x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x4xf16>) -> tensor<7168x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x4xf16>) outs(%1 : tensor<7168x4xf16>) -> tensor<7168x4xf16> return %2 : tensor<7168x4xf16> } -} +} diff --git a/gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir index 20e0805..c7dbcb9 100644 --- a/gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<7168x8xbf16> { + func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<7168x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7168x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x8xbf16>) -> tensor<7168x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<7168x8xbf16>) -> tensor<7168x8xbf16> return %2 : tensor<7168x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_7168_8_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_8_8192_f16_tA.mlir index ad72f3f..f36208c 100644 --- a/gemm/mlir/gemm_7168_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_7168_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x8xf16>) -> tensor<7168x8xf16> { + func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x8xf16>) -> tensor<7168x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7168x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x8xf16>) -> tensor<7168x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x8xf16>) outs(%1 : tensor<7168x8xf16>) -> tensor<7168x8xf16> return %2 : tensor<7168x8xf16> } -} +} diff --git a/gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir index 5413ea7..db4ed5e 100644 --- a/gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<7680x16xbf16> { + func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<7680x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7680x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x16xbf16>) -> tensor<7680x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<7680x16xbf16>) -> tensor<7680x16xbf16> return %2 : tensor<7680x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_7680_16_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_16_5120_f16_tA.mlir index 68cbc88..884fae5 100644 --- 
a/gemm/mlir/gemm_7680_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_7680_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x16xf16>) -> tensor<7680x16xf16> { + func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x16xf16>) -> tensor<7680x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7680x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x16xf16>) -> tensor<7680x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x16xf16>) outs(%1 : tensor<7680x16xf16>) -> tensor<7680x16xf16> return %2 : tensor<7680x16xf16> } -} +} diff --git a/gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir index d22bc41..3e9229a 100644 --- a/gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<7680x1xbf16> { + func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<7680x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7680x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x1xbf16>) -> tensor<7680x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<7680x1xbf16>) -> tensor<7680x1xbf16> return %2 : tensor<7680x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_7680_1_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_1_5120_f16_tA.mlir index b4b7ea4..8852272 100644 --- a/gemm/mlir/gemm_7680_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_7680_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x1xf16>) -> tensor<7680x1xf16> { + func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x1xf16>) -> tensor<7680x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7680x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x1xf16>) -> tensor<7680x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x1xf16>) outs(%1 : tensor<7680x1xf16>) -> tensor<7680x1xf16> return %2 : tensor<7680x1xf16> } -} +} diff --git a/gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir index 1272238..91b162d 100644 --- a/gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<7680x2xbf16> { + func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<7680x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7680x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x2xbf16>) -> tensor<7680x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<7680x2xbf16>) -> tensor<7680x2xbf16> return %2 : tensor<7680x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_7680_2_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_2_5120_f16_tA.mlir index 061d2cd..0b11af3 100644 --- a/gemm/mlir/gemm_7680_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_7680_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x2xf16>) -> tensor<7680x2xf16> { + func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x2xf16>) -> tensor<7680x2xf16> { %cst = arith.constant 
0.000000e+00 : f16 %0 = tensor.empty() : tensor<7680x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x2xf16>) -> tensor<7680x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x2xf16>) outs(%1 : tensor<7680x2xf16>) -> tensor<7680x2xf16> return %2 : tensor<7680x2xf16> } -} +} diff --git a/gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir index e65a756..a89c462 100644 --- a/gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<7680x32xbf16> { + func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<7680x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7680x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x32xbf16>) -> tensor<7680x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<7680x32xbf16>) -> tensor<7680x32xbf16> return %2 : tensor<7680x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_7680_32_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_32_5120_f16_tA.mlir index 11c6226..6dd24ce 100644 --- a/gemm/mlir/gemm_7680_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_7680_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x32xf16>) -> tensor<7680x32xf16> { + func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x32xf16>) -> tensor<7680x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7680x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x32xf16>) -> tensor<7680x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x32xf16>) outs(%1 : tensor<7680x32xf16>) -> tensor<7680x32xf16> return %2 : tensor<7680x32xf16> } -} +} diff --git a/gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir index 9ab8446..b0334e4 100644 --- a/gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<7680x4xbf16> { + func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<7680x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7680x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x4xbf16>) -> tensor<7680x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<7680x4xbf16>) -> tensor<7680x4xbf16> return %2 : tensor<7680x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_7680_4_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_4_5120_f16_tA.mlir index 18d769c..c927588 100644 --- a/gemm/mlir/gemm_7680_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_7680_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x4xf16>) -> tensor<7680x4xf16> { + func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x4xf16>) -> tensor<7680x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7680x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x4xf16>) -> tensor<7680x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x4xf16>) outs(%1 : tensor<7680x4xf16>) -> tensor<7680x4xf16> return %2 : tensor<7680x4xf16> 
} -} +} diff --git a/gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir index c70637e..4d799fa 100644 --- a/gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<7680x8xbf16> { + func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<7680x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<7680x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x8xbf16>) -> tensor<7680x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<7680x8xbf16>) -> tensor<7680x8xbf16> return %2 : tensor<7680x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_7680_8_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_8_5120_f16_tA.mlir index 87eaf92..f817f9d 100644 --- a/gemm/mlir/gemm_7680_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_7680_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x8xf16>) -> tensor<7680x8xf16> { + func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x8xf16>) -> tensor<7680x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<7680x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x8xf16>) -> tensor<7680x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x8xf16>) outs(%1 : tensor<7680x8xf16>) -> tensor<7680x8xf16> return %2 : tensor<7680x8xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir index a5ec137..50cb640 100644 --- a/gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<8000x16xbf16> { + func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<8000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16> return %2 : tensor<8000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_16_5120_f16_tA.mlir index e5a8643..53b5315 100644 --- a/gemm/mlir/gemm_8000_16_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_16_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<8000x16xf16> { + func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<8000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x16xf16>) -> tensor<8000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<8000x16xf16>) -> tensor<8000x16xf16> return %2 : tensor<8000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir index e9d1c6c..d61ae44 100644 --- a/gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xbf16>, %arg1: 
tensor<8192x16xbf16>) -> tensor<8000x16xbf16> { + func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<8000x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16> return %2 : tensor<8000x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_16_8192_f16_tA.mlir index 8d3948a..2fdaae0 100644 --- a/gemm/mlir/gemm_8000_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<8000x16xf16> { + func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<8000x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x16xf16>) -> tensor<8000x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<8000x16xf16>) -> tensor<8000x16xf16> return %2 : tensor<8000x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir index 80a6737..2103508 100644 --- a/gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<8000x1xbf16> { + func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<8000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16> return %2 : tensor<8000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_1_5120_f16_tA.mlir index 3bd6144..d168465 100644 --- a/gemm/mlir/gemm_8000_1_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_1_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<8000x1xf16> { + func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<8000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x1xf16>) -> tensor<8000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<8000x1xf16>) -> tensor<8000x1xf16> return %2 : tensor<8000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir index ecbcf17..0f58095 100644 --- a/gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<8000x1xbf16> { + func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<8000x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16> %2 = 
linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16> return %2 : tensor<8000x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_1_8192_f16_tA.mlir index 9c9dd7a..52e5c03 100644 --- a/gemm/mlir/gemm_8000_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<8000x1xf16> { + func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<8000x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x1xf16>) -> tensor<8000x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<8000x1xf16>) -> tensor<8000x1xf16> return %2 : tensor<8000x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir index ace5d7d..668917b 100644 --- a/gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<8000x2xbf16> { + func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<8000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16> return %2 : tensor<8000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_2_5120_f16_tA.mlir index ea4bf75..d85abf9 100644 --- a/gemm/mlir/gemm_8000_2_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_2_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<8000x2xf16> { + func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<8000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x2xf16>) -> tensor<8000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<8000x2xf16>) -> tensor<8000x2xf16> return %2 : tensor<8000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir index d6f4f98..fa3aeca 100644 --- a/gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<8000x2xbf16> { + func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<8000x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16> return %2 : tensor<8000x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_2_8192_f16_tA.mlir index 9b566e7..4d8cb01 100644 --- 
a/gemm/mlir/gemm_8000_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<8000x2xf16> { + func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<8000x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x2xf16>) -> tensor<8000x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<8000x2xf16>) -> tensor<8000x2xf16> return %2 : tensor<8000x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir index 6b08d4a..e76d224 100644 --- a/gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<8000x32xbf16> { + func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<8000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16> return %2 : tensor<8000x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_32_5120_f16_tA.mlir index e56cb29..5c226af 100644 --- a/gemm/mlir/gemm_8000_32_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_32_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<8000x32xf16> { + func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<8000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x32xf16>) -> tensor<8000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<8000x32xf16>) -> tensor<8000x32xf16> return %2 : tensor<8000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir index e918725..4df2655 100644 --- a/gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<8000x32xbf16> { + func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<8000x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16> return %2 : tensor<8000x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_32_8192_f16_tA.mlir index bcf5ef2..656010a 100644 --- a/gemm/mlir/gemm_8000_32_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_32_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<8000x32xf16> { + func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x32xf16>) -> 
tensor<8000x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x32xf16>) -> tensor<8000x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<8000x32xf16>) -> tensor<8000x32xf16> return %2 : tensor<8000x32xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir index 515fa72..f45eab2 100644 --- a/gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<8000x4xbf16> { + func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<8000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16> return %2 : tensor<8000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_4_5120_f16_tA.mlir index 2ebec27..a715200 100644 --- a/gemm/mlir/gemm_8000_4_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_4_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<8000x4xf16> { + func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<8000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x4xf16>) -> tensor<8000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x4xf16>) outs(%1 : tensor<8000x4xf16>) -> tensor<8000x4xf16> return %2 : tensor<8000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir index eebdaa6..a0bf7e7 100644 --- a/gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<8000x4xbf16> { + func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<8000x4xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x4xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16> return %2 : tensor<8000x4xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_4_8192_f16_tA.mlir index e086491..4d5e8c5 100644 --- a/gemm/mlir/gemm_8000_4_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_4_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<8000x4xf16> { + func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<8000x4xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x4xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x4xf16>) -> tensor<8000x4xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<8000x4xf16>) -> tensor<8000x4xf16> return %2 
: tensor<8000x4xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir index 7420d11..5a7f7e8 100644 --- a/gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<8000x8xbf16> { + func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<8000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16> return %2 : tensor<8000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_8_5120_f16_tA.mlir index 6684ea0..5552aa7 100644 --- a/gemm/mlir/gemm_8000_8_5120_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_8_5120_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<8000x8xf16> { + func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<8000x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x8xf16>) -> tensor<8000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<8000x8xf16>) -> tensor<8000x8xf16> return %2 : tensor<8000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir index bab0926..a79aac9 100644 --- a/gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<8000x8xbf16> { + func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<8000x8xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8000x8xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16> return %2 : tensor<8000x8xbf16> } -} +} diff --git a/gemm/mlir/gemm_8000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_8_8192_f16_tA.mlir index d8697c2..4f1ed4c 100644 --- a/gemm/mlir/gemm_8000_8_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8000_8_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<8000x8xf16> { + func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<8000x8xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8000x8xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x8xf16>) -> tensor<8000x8xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<8000x8xf16>) -> tensor<8000x8xf16> return %2 : tensor<8000x8xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir index 038e187..665dc34 100644 --- a/gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xbf16>, %arg1: 
tensor<1024x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_16_1024_f16_tA.mlir index eaf0d6c..b37806e 100644 --- a/gemm/mlir/gemm_8192_16_1024_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_1024_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir index 6b7aff8..5a46495 100644 --- a/gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_16_14336_f16_tA.mlir index de7e11c..e9fab90 100644 --- a/gemm/mlir/gemm_8192_16_14336_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_14336_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir index 0d691f4..d5390e1 100644 --- a/gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) 
outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_16_2048_f16_tA.mlir index dc80f68..899b396 100644 --- a/gemm/mlir/gemm_8192_16_2048_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_2048_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir index 760aa13..a052c9d 100644 --- a/gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_16_28672_f16_tA.mlir index a161368..937f6ca 100644 --- a/gemm/mlir/gemm_8192_16_28672_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_28672_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir index 532f6a4..956f501 100644 --- a/gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git 
a/gemm/mlir/gemm_8192_16_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_16_3584_f16_tA.mlir index 54bb2bc..5182fe9 100644 --- a/gemm/mlir/gemm_8192_16_3584_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_3584_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir index 3899546..ae4ae02 100644 --- a/gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_16_4096_f16_tA.mlir index 7039123..1510fbb 100644 --- a/gemm/mlir/gemm_8192_16_4096_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_4096_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir index e7b1414..ce83d31 100644 --- a/gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_16_7168_f16_tA.mlir index 6ac4cdd..63da0a2 100644 --- a/gemm/mlir/gemm_8192_16_7168_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_7168_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<7168x8192xf16>, %arg1: 
tensor<7168x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir index a749aa1..b269adb 100644 --- a/gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<8192x16xbf16> { + func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<8192x16xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x16xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16> return %2 : tensor<8192x16xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_16_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_16_8192_f16_tA.mlir index d9a8957..a00d5ed 100644 --- a/gemm/mlir/gemm_8192_16_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_16_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x16xf16>) -> tensor<8192x16xf16> { + func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x16xf16>) -> tensor<8192x16xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x16xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16> return %2 : tensor<8192x16xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir index 6df831a..4dbf154 100644 --- a/gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x1xbf16>) -> tensor<8192x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_1_1024_f16_tA.mlir index 91e9ae3..cfe91b7 100644 --- a/gemm/mlir/gemm_8192_1_1024_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_1024_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> 
%2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir index 4e44496..bb3ee2c 100644 --- a/gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x1xbf16>) -> tensor<8192x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_1_14336_f16_tA.mlir index 13191f4..d1bf657 100644 --- a/gemm/mlir/gemm_8192_1_14336_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_14336_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir index 7042489..dcb9440 100644 --- a/gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x1xbf16>) -> tensor<8192x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_1_2048_f16_tA.mlir index 2231418..bb68ab7 100644 --- a/gemm/mlir/gemm_8192_1_2048_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_2048_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir index d0505c0..9d8ba4c 100644 --- 
a/gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x1xbf16>) -> tensor<8192x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_1_28672_f16_tA.mlir index 99b51fe..7f24658 100644 --- a/gemm/mlir/gemm_8192_1_28672_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_28672_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir index 3dc75a9..db96ed7 100644 --- a/gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x1xbf16>) -> tensor<8192x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_1_3584_f16_tA.mlir index 56734e1..0c6617c 100644 --- a/gemm/mlir/gemm_8192_1_3584_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_3584_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir index b46c7dd..fef2bfe 100644 --- a/gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x1xbf16>) -> tensor<8192x1xbf16> { %cst = 
arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_1_4096_f16_tA.mlir index 39b2d9c..76cdec0 100644 --- a/gemm/mlir/gemm_8192_1_4096_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_4096_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir index fbd4c9b..83f7005 100644 --- a/gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x1xbf16>) -> tensor<8192x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_1_7168_f16_tA.mlir index c31adcc..1eedb82 100644 --- a/gemm/mlir/gemm_8192_1_7168_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_7168_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir index 4ac0944..2ea5414 100644 --- a/gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<8192x1xbf16> { + func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<8192x1xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x1xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16> return %2 : tensor<8192x1xbf16> } -} 
+} diff --git a/gemm/mlir/gemm_8192_1_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_1_8192_f16_tA.mlir index ffdacd2..ea88565 100644 --- a/gemm/mlir/gemm_8192_1_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_1_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x1xf16>) -> tensor<8192x1xf16> { + func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x1xf16>) -> tensor<8192x1xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x1xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16> return %2 : tensor<8192x1xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2048_1024_f16.mlir b/gemm/mlir/gemm_8192_2048_1024_f16.mlir index 6bad832..f42dfba 100644 --- a/gemm/mlir/gemm_8192_2048_1024_f16.mlir +++ b/gemm/mlir/gemm_8192_2048_1024_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<8192x1024xf16>, %arg1: tensor<1024x2048xf16>) -> tensor<8192x2048xf16> { + func.func @main(%arg0: tensor<8192x1024xf16>, %arg1: tensor<1024x2048xf16>) -> tensor<8192x2048xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2048xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x1024xf16>, tensor<1024x2048xf16>) outs(%1 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16> return %2 : tensor<8192x2048xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2048_65536_f16.mlir b/gemm/mlir/gemm_8192_2048_65536_f16.mlir index 0593abc..59ae1bc 100644 --- a/gemm/mlir/gemm_8192_2048_65536_f16.mlir +++ b/gemm/mlir/gemm_8192_2048_65536_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<8192x65536xf16>, %arg1: tensor<65536x2048xf16>) -> tensor<8192x2048xf16> { + func.func @main(%arg0: tensor<8192x65536xf16>, %arg1: tensor<65536x2048xf16>) -> tensor<8192x2048xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2048xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x65536xf16>, tensor<65536x2048xf16>) outs(%1 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16> return %2 : tensor<8192x2048xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2048_8192_f16.mlir b/gemm/mlir/gemm_8192_2048_8192_f16.mlir index 84e0fd9..8e0a0a6 100644 --- a/gemm/mlir/gemm_8192_2048_8192_f16.mlir +++ b/gemm/mlir/gemm_8192_2048_8192_f16.mlir @@ -1,9 +1,9 @@ module { - func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x2048xf16>) -> tensor<8192x2048xf16> { + func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x2048xf16>) -> tensor<8192x2048xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2048xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16> %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x2048xf16>) outs(%1 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16> return %2 : tensor<8192x2048xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir index 4be7e93..7b07b58 100644 --- a/gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xbf16>, %arg1: 
tensor<1024x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_2_1024_f16_tA.mlir index e88034f..fad9863 100644 --- a/gemm/mlir/gemm_8192_2_1024_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_1024_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir index 8a4f3b5..c3ba4e7 100644 --- a/gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_2_14336_f16_tA.mlir index 25e2c86..ac6a2f1 100644 --- a/gemm/mlir/gemm_8192_2_14336_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_14336_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir index 46864c9..154421f 100644 --- a/gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = 
linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_2_2048_f16_tA.mlir index a044c11..531fb51 100644 --- a/gemm/mlir/gemm_8192_2_2048_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_2048_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir index be33f85..a26e286 100644 --- a/gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_2_28672_f16_tA.mlir index 04bfd9d..fa64b0d 100644 --- a/gemm/mlir/gemm_8192_2_28672_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_28672_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir index 9f708d8..6bfc9dc 100644 --- a/gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_2_3584_f16_tA.mlir index 3efba95..5891198 100644 --- 
a/gemm/mlir/gemm_8192_2_3584_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_3584_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir index ac505cc..5bad65e 100644 --- a/gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_2_4096_f16_tA.mlir index cf32596..2ff588d 100644 --- a/gemm/mlir/gemm_8192_2_4096_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_4096_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir index 7cae46d..6017644 100644 --- a/gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_2_7168_f16_tA.mlir index 40d0de9..ada61d1 100644 --- a/gemm/mlir/gemm_8192_2_7168_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_7168_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : 
f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir index e11c1a2..4a5c210 100644 --- a/gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<8192x2xbf16> { + func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<8192x2xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x2xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> return %2 : tensor<8192x2xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_2_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_2_8192_f16_tA.mlir index 4316b29..070a5ba 100644 --- a/gemm/mlir/gemm_8192_2_8192_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_2_8192_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x2xf16>) -> tensor<8192x2xf16> { + func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x2xf16>) -> tensor<8192x2xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x2xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> return %2 : tensor<8192x2xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir index 9d083b2..852e767 100644 --- a/gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x32xbf16>) -> tensor<8192x32xbf16> { + func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x32xbf16>) -> tensor<8192x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> return %2 : tensor<8192x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_32_1024_f16_tA.mlir index 1ffeb72..c5f5846 100644 --- a/gemm/mlir/gemm_8192_32_1024_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_1024_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x32xf16>) -> tensor<8192x32xf16> { + func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x32xf16>) -> tensor<8192x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> return %2 : tensor<8192x32xf16> } -} +} diff 
--git a/gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir index ee481ff..7f3f684 100644 --- a/gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x32xbf16>) -> tensor<8192x32xbf16> { + func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x32xbf16>) -> tensor<8192x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> return %2 : tensor<8192x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_32_14336_f16_tA.mlir index ddd4547..4670ddd 100644 --- a/gemm/mlir/gemm_8192_32_14336_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_14336_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x32xf16>) -> tensor<8192x32xf16> { + func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x32xf16>) -> tensor<8192x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> return %2 : tensor<8192x32xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir index 6a61969..9b7cb18 100644 --- a/gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x32xbf16>) -> tensor<8192x32xbf16> { + func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x32xbf16>) -> tensor<8192x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> return %2 : tensor<8192x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_32_2048_f16_tA.mlir index 98bfa9b..ad5ff98 100644 --- a/gemm/mlir/gemm_8192_32_2048_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_2048_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x32xf16>) -> tensor<8192x32xf16> { + func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x32xf16>) -> tensor<8192x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> return %2 : tensor<8192x32xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir index 25eb462..0e75daf 100644 --- a/gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func 
@main_0(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x32xbf16>) -> tensor<8192x32xbf16> { + func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x32xbf16>) -> tensor<8192x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> return %2 : tensor<8192x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_32_28672_f16_tA.mlir index 5f499b8..1809761 100644 --- a/gemm/mlir/gemm_8192_32_28672_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_28672_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x32xf16>) -> tensor<8192x32xf16> { + func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x32xf16>) -> tensor<8192x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> return %2 : tensor<8192x32xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir index acc5d08..ddbba11 100644 --- a/gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x32xbf16>) -> tensor<8192x32xbf16> { + func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x32xbf16>) -> tensor<8192x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : tensor<8192x32xbf16> %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> return %2 : tensor<8192x32xbf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_32_3584_f16_tA.mlir index f4cead3..45b7ca4 100644 --- a/gemm/mlir/gemm_8192_32_3584_f16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_3584_f16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x32xf16>) -> tensor<8192x32xf16> { + func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x32xf16>) -> tensor<8192x32xf16> { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<8192x32xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> return %2 : tensor<8192x32xf16> } -} +} diff --git a/gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir index ae5974b..7134984 100644 --- a/gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir +++ b/gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir @@ -1,10 +1,10 @@ module { - func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x32xbf16>) -> tensor<8192x32xbf16> { + func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x32xbf16>) -> tensor<8192x32xbf16> { %cst = arith.constant 0.000000e+00 : bf16 %0 = tensor.empty() : 
tensor<8192x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16>
     return %2 : tensor<8192x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_32_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_32_4096_f16_tA.mlir
index dd43c21..7df2c92 100644
--- a/gemm/mlir/gemm_8192_32_4096_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_32_4096_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x32xf16>) -> tensor<8192x32xf16> {
+  func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x32xf16>) -> tensor<8192x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16>
     return %2 : tensor<8192x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir
index 199a6b0..672f613 100644
--- a/gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x32xbf16>) -> tensor<8192x32xbf16> {
+  func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x32xbf16>) -> tensor<8192x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16>
     return %2 : tensor<8192x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_32_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_32_7168_f16_tA.mlir
index be300a5..aa39da1 100644
--- a/gemm/mlir/gemm_8192_32_7168_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_32_7168_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x32xf16>) -> tensor<8192x32xf16> {
+  func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x32xf16>) -> tensor<8192x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16>
     return %2 : tensor<8192x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir
index 029e2eb..beeb9f6 100644
--- a/gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<8192x32xbf16> {
+  func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<8192x32xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x32xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16>
     return %2 : tensor<8192x32xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_32_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_32_8192_f16_tA.mlir
index 3182444..538b2a5 100644
--- a/gemm/mlir/gemm_8192_32_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_32_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x32xf16>) -> tensor<8192x32xf16> {
+  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x32xf16>) -> tensor<8192x32xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x32xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16>
     return %2 : tensor<8192x32xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir
index 7ee7a15..dadcc8c 100644
--- a/gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_4_1024_f16_tA.mlir
index 3f3a1d9..ae01271 100644
--- a/gemm/mlir/gemm_8192_4_1024_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_1024_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir
index 6ab6a37..a91f9bf 100644
--- a/gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_4_14336_f16_tA.mlir
index 1acec1d..925676f 100644
--- a/gemm/mlir/gemm_8192_4_14336_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_14336_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir
index a2f1152..63f589c 100644
--- a/gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_4_2048_f16_tA.mlir
index bbb0827..043dba2 100644
--- a/gemm/mlir/gemm_8192_4_2048_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_2048_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir
index ccceace..ffcc49d 100644
--- a/gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_4_28672_f16_tA.mlir
index 9fed52d..3e66079 100644
--- a/gemm/mlir/gemm_8192_4_28672_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_28672_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir
index f7a7302..13ea765 100644
--- a/gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_4_3584_f16_tA.mlir
index a7192ae..b3a4aca 100644
--- a/gemm/mlir/gemm_8192_4_3584_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_3584_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir
index 54161e0..111e1b9 100644
--- a/gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_4_4096_f16_tA.mlir
index 4586bc9..e9059bf 100644
--- a/gemm/mlir/gemm_8192_4_4096_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_4096_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir
index fc001ab..7c140f3 100644
--- a/gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_4_7168_f16_tA.mlir
index 68cf431..81e98c6 100644
--- a/gemm/mlir/gemm_8192_4_7168_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_7168_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir
index 9affedd..7ae31d7 100644
--- a/gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<8192x4xbf16> {
+  func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<8192x4xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x4xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16>
     return %2 : tensor<8192x4xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_4_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_4_8192_f16_tA.mlir
index 95b9cd9..2378c0f 100644
--- a/gemm/mlir/gemm_8192_4_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_4_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x4xf16>) -> tensor<8192x4xf16> {
+  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x4xf16>) -> tensor<8192x4xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x4xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16>
     return %2 : tensor<8192x4xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_5120_640_bf16.mlir b/gemm/mlir/gemm_8192_5120_640_bf16.mlir
index 60e77f8..5f59098 100644
--- a/gemm/mlir/gemm_8192_5120_640_bf16.mlir
+++ b/gemm/mlir/gemm_8192_5120_640_bf16.mlir
@@ -1,9 +1,9 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x640xbf16>, %arg1: tensor<640x5120xbf16>) -> tensor<8192x5120xbf16> {
+  func.func @main(%arg0: tensor<8192x640xbf16>, %arg1: tensor<640x5120xbf16>) -> tensor<8192x5120xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x5120xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
     %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x640xbf16>, tensor<640x5120xbf16>) outs(%1 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
     return %2 : tensor<8192x5120xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_5120_640_bf16_tA.mlir b/gemm/mlir/gemm_8192_5120_640_bf16_tA.mlir
index 5cc2130..177684d 100644
--- a/gemm/mlir/gemm_8192_5120_640_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_5120_640_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<640x8192xbf16>, %arg1: tensor<640x5120xbf16>) -> tensor<8192x5120xbf16> {
+  func.func @main(%arg0: tensor<640x8192xbf16>, %arg1: tensor<640x5120xbf16>) -> tensor<8192x5120xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x5120xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x8192xbf16>, tensor<640x5120xbf16>) outs(%1 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
     return %2 : tensor<8192x5120xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir b/gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir
index 522054a..629f56f 100644
--- a/gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir
+++ b/gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x640xbf16>, %arg1: tensor<5120x640xbf16>) -> tensor<8192x5120xbf16> {
+  func.func @main(%arg0: tensor<8192x640xbf16>, %arg1: tensor<5120x640xbf16>) -> tensor<8192x5120xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x5120xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
     %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<8192x640xbf16>, tensor<5120x640xbf16>) outs(%1 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
     return %2 : tensor<8192x5120xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_5120_640_f16.mlir b/gemm/mlir/gemm_8192_5120_640_f16.mlir
index 4f3838e..52be98c 100644
--- a/gemm/mlir/gemm_8192_5120_640_f16.mlir
+++ b/gemm/mlir/gemm_8192_5120_640_f16.mlir
@@ -1,9 +1,9 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x640xf16>, %arg1: tensor<640x5120xf16>) -> tensor<8192x5120xf16> {
+  func.func @main(%arg0: tensor<8192x640xf16>, %arg1: tensor<640x5120xf16>) -> tensor<8192x5120xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x5120xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
     %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x640xf16>, tensor<640x5120xf16>) outs(%1 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
     return %2 : tensor<8192x5120xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_5120_640_f16_tA.mlir b/gemm/mlir/gemm_8192_5120_640_f16_tA.mlir
index 5d7300e..97875f6 100644
--- a/gemm/mlir/gemm_8192_5120_640_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_5120_640_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<640x8192xf16>, %arg1: tensor<640x5120xf16>) -> tensor<8192x5120xf16> {
+  func.func @main(%arg0: tensor<640x8192xf16>, %arg1: tensor<640x5120xf16>) -> tensor<8192x5120xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x5120xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x8192xf16>, tensor<640x5120xf16>) outs(%1 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
     return %2 : tensor<8192x5120xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_5120_640_f16_tB.mlir b/gemm/mlir/gemm_8192_5120_640_f16_tB.mlir
index c449f76..2f37fd5 100644
--- a/gemm/mlir/gemm_8192_5120_640_f16_tB.mlir
+++ b/gemm/mlir/gemm_8192_5120_640_f16_tB.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x640xf16>, %arg1: tensor<5120x640xf16>) -> tensor<8192x5120xf16> {
+  func.func @main(%arg0: tensor<8192x640xf16>, %arg1: tensor<5120x640xf16>) -> tensor<8192x5120xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x5120xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
     %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<8192x640xf16>, tensor<5120x640xf16>) outs(%1 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
     return %2 : tensor<8192x5120xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8192_1024_f16.mlir b/gemm/mlir/gemm_8192_8192_1024_f16.mlir
index 9a3648a..8c5ec54 100644
--- a/gemm/mlir/gemm_8192_8192_1024_f16.mlir
+++ b/gemm/mlir/gemm_8192_8192_1024_f16.mlir
@@ -1,9 +1,9 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x1024xf16>, %arg1: tensor<1024x8192xf16>) -> tensor<8192x8192xf16> {
+  func.func @main(%arg0: tensor<8192x1024xf16>, %arg1: tensor<1024x8192xf16>) -> tensor<8192x8192xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8192xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
     %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x1024xf16>, tensor<1024x8192xf16>) outs(%1 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
     return %2 : tensor<8192x8192xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8192_65536_f16.mlir b/gemm/mlir/gemm_8192_8192_65536_f16.mlir
index 6560fd3..04bdc92 100644
--- a/gemm/mlir/gemm_8192_8192_65536_f16.mlir
+++ b/gemm/mlir/gemm_8192_8192_65536_f16.mlir
@@ -1,9 +1,9 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x65536xf16>, %arg1: tensor<65536x8192xf16>) -> tensor<8192x8192xf16> {
+  func.func @main(%arg0: tensor<8192x65536xf16>, %arg1: tensor<65536x8192xf16>) -> tensor<8192x8192xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8192xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
     %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x65536xf16>, tensor<65536x8192xf16>) outs(%1 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
     return %2 : tensor<8192x8192xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8192_8192_f16.mlir b/gemm/mlir/gemm_8192_8192_8192_f16.mlir
index 82a2379..232fdb7 100644
--- a/gemm/mlir/gemm_8192_8192_8192_f16.mlir
+++ b/gemm/mlir/gemm_8192_8192_8192_f16.mlir
@@ -1,9 +1,9 @@
 module {
-  func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x8192xf16>) -> tensor<8192x8192xf16> {
+  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x8192xf16>) -> tensor<8192x8192xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8192xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
     %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x8192xf16>) outs(%1 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
     return %2 : tensor<8192x8192xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir
index d2db594..90fbed3 100644
--- a/gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_8_1024_f16_tA.mlir
index b2a13c3..b3d0f26 100644
--- a/gemm/mlir/gemm_8192_8_1024_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_1024_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir
index 943e95d..17fe727 100644
--- a/gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_8_14336_f16_tA.mlir
index f690bbd..bbf21b1 100644
--- a/gemm/mlir/gemm_8192_8_14336_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_14336_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir
index ea3351c..d46ec59 100644
--- a/gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_8_2048_f16_tA.mlir
index 7bf708b..30f757f 100644
--- a/gemm/mlir/gemm_8192_8_2048_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_2048_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir
index fd3d526..7fdd508 100644
--- a/gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_8_28672_f16_tA.mlir
index f08e510..aafb576 100644
--- a/gemm/mlir/gemm_8192_8_28672_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_28672_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir
index b04864b..caa3522 100644
--- a/gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_8_3584_f16_tA.mlir
index 56afcbc..9964378 100644
--- a/gemm/mlir/gemm_8192_8_3584_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_3584_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir
index ede7ce9..ed9262d 100644
--- a/gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_8_4096_f16_tA.mlir
index 03c6a9f..0fca3dc 100644
--- a/gemm/mlir/gemm_8192_8_4096_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_4096_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir
index 9060013..b7f68ff 100644
--- a/gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_8_7168_f16_tA.mlir
index 6e1d748..c143d7f 100644
--- a/gemm/mlir/gemm_8192_8_7168_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_7168_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir
index 52b48fd..be2e86d 100644
--- a/gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<8192x8xbf16> {
+  func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<8192x8xbf16> {
     %cst = arith.constant 0.000000e+00 : bf16
     %0 = tensor.empty() : tensor<8192x8xbf16>
     %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
     return %2 : tensor<8192x8xbf16>
   }
-}
+}
diff --git a/gemm/mlir/gemm_8192_8_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_8_8192_f16_tA.mlir
index 932a8de..62431ce 100644
--- a/gemm/mlir/gemm_8192_8_8192_f16_tA.mlir
+++ b/gemm/mlir/gemm_8192_8_8192_f16_tA.mlir
@@ -1,10 +1,10 @@
 
 module {
-  func.func @main_0(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x8xf16>) -> tensor<8192x8xf16> {
+  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x8xf16>) -> tensor<8192x8xf16> {
     %cst = arith.constant 0.000000e+00 : f16
     %0 = tensor.empty() : tensor<8192x8xf16>
     %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
     return %2 : tensor<8192x8xf16>
   }
-}
+}
diff --git a/gemmbench/gemm_bench.py b/gemmbench/gemm_bench.py
index c11d6ef..c7317f2 100644
--- a/gemmbench/gemm_bench.py
+++ b/gemmbench/gemm_bench.py
@@ -8,92 +8,20 @@
 from tqdm import tqdm
 from multiprocessing import Pool, cpu_count, Manager
 import logging
+import itertools
 from pathlib import Path
 import csv
 import argparse
 import sys
 from utils import *
-from problems import *
-
-def generate_mlir_content(M, N, K, tA, tB, dtype):
-
-    mlir_template_A = f"""
-module {{
-  func.func @main_0(%arg0: tensor<{K}x{M}x{dtype}>, %arg1: tensor<{K}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}> {{
-    %cst = arith.constant 0.000000e+00 : {dtype}
-    %0 = tensor.empty() : tensor<{M}x{N}x{dtype}>
-    %1 = linalg.fill ins(%cst : {dtype}) outs(%0 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<{K}x{M}x{dtype}>, tensor<{K}x{N}x{dtype}>) outs(%1 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
-    return %2 : tensor<{M}x{N}x{dtype}>
-  }}
-}}
-"""
-
-    mlir_template_B = f"""
-module {{
-  func.func @main_0(%arg0: tensor<{M}x{K}x{dtype}>, %arg1: tensor<{N}x{K}x{dtype}>) -> tensor<{M}x{N}x{dtype}> {{
-    %cst = arith.constant 0.000000e+00 : {dtype}
-    %0 = tensor.empty() : tensor<{M}x{N}x{dtype}>
-    %1 = linalg.fill ins(%cst : {dtype}) outs(%0 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
-    %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<{M}x{K}x{dtype}>, tensor<{N}x{K}x{dtype}>) outs(%1 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
-    return %2 : tensor<{M}x{N}x{dtype}>
-  }}
-}}
-"""
-
-    mlir_template = f"""module {{
-  func.func @main_0(%arg0: tensor<{M}x{K}x{dtype}>, %arg1: tensor<{K}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}> {{
-    %cst = arith.constant 0.000000e+00 : {dtype}
-    %0 = tensor.empty() : tensor<{M}x{N}x{dtype}>
-    %1 = linalg.fill ins(%cst : {dtype}) outs(%0 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<{M}x{K}x{dtype}>, tensor<{K}x{N}x{dtype}>) outs(%1 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
-    return %2 : tensor<{M}x{N}x{dtype}>
-  }}
-}}
-"""
-    if tA == "T":
-        return mlir_template_A
-    if tB == "T":
-        return mlir_template_B
-    return mlir_template
-
-
-def compile_shape(tag, M, N, K, tA, tB, dtype, target, extra_compiler_args, vmfb_dict):
-    if tA == "T" and tB == "T":
-        return f"Can't transpose both inputs"
-
-    # Generate MLIR content
-    mlir_content = generate_mlir_content(M, N, K, tA, tB, dtype)
-
-    # Generate filenames
-    filename = f"gemm/mlir/gemm_{M}_{N}_{K}_{dtype}"
-    if tA == "T":
-        filename += "_tA"
-    elif tB == "T":
-        filename += "_tB"
-    mlir_filename = filename + ".mlir"
-    filename = filename.replace("mlir", "vmfb")
-    vmfb_filename = filename + ".vmfb"
-
-    # Write MLIR content to file
-    with open(mlir_filename, 'w') as f:
-        f.write(mlir_content)
-
-    # Compile MLIR to VMFB
-    exec_args = [
-        "iree-compile",
-        f"{mlir_filename}",
-        "--iree-hal-target-backends=rocm",
-        f"--iree-hip-target={target}",
-        "--iree-llvmgpu-enable-prefetch=true",
-        "-o",
-        f"{vmfb_filename}",
-    ] + extra_compiler_args
-    ret_value, stdout = run_iree_command(exec_args)
-
-    vmfb_dict[vmfb_filename] = [tag, M, N, K, tA, tB, dtype]
-    if ret_value == 0:
-        return f"Successfully compiled {mlir_filename} to {vmfb_filename}"
+from gemm_utils import *
+from problems import get_gemm_configs
+
+
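+# Compile a single (tag, config) pair in a worker process and hand the resulting
+# file paths back to the parent for bookkeeping.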
+def compile_gemm(tag, config, kernel_dir, vmfb_dir, target, extra_compiler_args):
+    mlir_file, vmfb_file = compile_gemm_config(config, kernel_dir, vmfb_dir, target, extra_compiler_args)
+    return (tag, config, mlir_file, vmfb_file)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Config file updater.")
@@ -125,94 +53,80 @@ def compile_shape(tag, M, N, K, tA, tB, dtype, target, extra_compiler_args, vmfb
         roofline(args.roofline, args.plot, args.batch, args.dtype, args.model)
         sys.exit()
 
-    shapes = []
-    print(f"Generated {len(shapes)} gemm shapes.")
+    configs = get_gemm_configs()
+    print(f"Generated {len(configs)} gemm configs.")
 
     num_cpus = max(1, cpu_count() - 20)
     print(f"Using {num_cpus} CPUs for parallel processing.")
 
     manager = Manager()
     vmfb_dict = manager.dict()
 
-    all(shapes)
-    shape_idx = 0
-    for shape in shapes:
-        shape += (args.target, list(args.Xiree_compile), vmfb_dict,)
-        shapes[shape_idx] = shape
-        shape_idx += 1
+    repo_root = Path(__file__).parent.parent
+    kernel_dir = repo_root / "gemm" / "mlir"
+    vmfb_dir = repo_root / "gemm" / "vmfb"
+    kernel_dir.mkdir(parents=True, exist_ok=True)
+    vmfb_dir.mkdir(parents=True, exist_ok=True)
+    target = args.target
+    extra_compiler_args = list(args.Xiree_compile)
+
+    compile_args = itertools.starmap(
+        lambda tag, config: (tag, config, kernel_dir, vmfb_dir, target, extra_compiler_args), configs
+    )
 
     with Pool(num_cpus) as pool:
-        results = list(tqdm(pool.starmap(compile_shape, shapes)))
+        compilation_results = list(tqdm(pool.starmap(compile_gemm, list(compile_args))))
 
     error_count = 0
-    for result in results:
-        if 'error' in result.lower():
-            # print(result)
+    for tag, config, mlir_file, vmfb_file in compilation_results:
+        if vmfb_file:
+            vmfb_dict[vmfb_file] = (tag, config)
+        else:
             error_count += 1
-    print(f'{len(shapes) - error_count} Success, {error_count} Failed out of {len(shapes)} shapes')
+    print(
+        f"{len(configs) - error_count} Success, {error_count} Failed out of {len(configs)} configs"
+    )
 
     print("Compilation process completed.")
 
-    repo_root = Path(__file__).parent.parent
-
-    vmfb_dir = repo_root / Path('gemm/vmfb')
-
     results = []
     index = 0
-    output_csv = "results/iree_gemm.csv"
+    output_csv = "results/iree_gemm_new.csv"
     csv_dir = os.path.dirname(output_csv)
     if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
 
-    for vmfb_filename, input_list in vmfb_dict.items():
-        tag = input_list[0]
-        vmfb_filename = vmfb_filename.split("/")[-1]
-        name = vmfb_filename.split(".")[0]
-        M = input_list[1]
-        N = input_list[2]
-        K = input_list[3]
-        tA = input_list[4]
-        tB = input_list[5]
-        dtype = input_list[6]
-
-        if tA == "T":
-            inp1 = f"{K}x{M}x{dtype}"
-            inp2 = f"{K}x{N}x{dtype}"
-        elif tB == "T":
-            inp1 = f"{M}x{K}x{dtype}"
-            inp2 = f"{N}x{K}x{dtype}"
-        else:
-            inp1 = f"{M}x{K}x{dtype}"
-            inp2 = f"{K}x{N}x{dtype}"
+    for vmfb_filename, value in vmfb_dict.items():
+        tag, config = value
+        name = config.get_name()
+
+        inp1 = config.get_inp1()
+        inp2 = config.get_inp2()
 
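+        # Shape strings such as "8192x32xf16" let iree-benchmark-module synthesize the inputs.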
         exec_args = [
             "iree-benchmark-module",
             f"--device=hip",
             "--device_allocator=caching",
-            f"--module={vmfb_dir}/{vmfb_filename}",
-            "--function=main_0",
+            f"--module={vmfb_filename}",
+            "--function=main",
             f"--input={inp1}",
             f"--input={inp2}",
             "--benchmark_repetitions=3",
         ]
 
-        # iree benchmark command for full sdxl pipeline
+        # iree benchmark kernels
         ret_value, cmd_out = run_iree_command(exec_args)
         ok = ret_value == 0
         benchmark_gemm_mean_time_ms = bench_summary_process(ret_value, cmd_out)
         benchmark_gemm_mean_time_us = benchmark_gemm_mean_time_ms * 1000
 
-        if "bf" in dtype:
-            bytes_per_input = int(dtype[2:]) / 8
-        else:
-            bytes_per_input = int(dtype[1:]) / 8
-        flops = 2 * M * N * K
-        byte_count = bytes_per_input * (M * K + N * K + M * N)
+        flops = config.get_flops()
+        byte_count = config.get_byte_count()
 
         arithmetic_intensity = flops / byte_count
         tflops_per_second = (flops / 1e12) / (benchmark_gemm_mean_time_us / 1e6)
 
         results.append((
-            index, tag, name, M, N, K, dtype, tA, tB,
+            index, tag, name, config.M, config.N, config.K, config.dtype, config.tA, config.tB,
             round(benchmark_gemm_mean_time_us, 4),
             round(arithmetic_intensity, 4),
             round(tflops_per_second, 4),
diff --git a/gemmbench/gemm_utils.py b/gemmbench/gemm_utils.py
new file mode 100644
index 0000000..551d0aa
--- /dev/null
+++ b/gemmbench/gemm_utils.py
@@ -0,0 +1,143 @@
+from utils import *
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass
+class GemmConfig:
+    M: int
+    N: int
+    K: int
+    tA: str
+    tB: str
+    dtype: str
+
+    def get_name(self) -> str:
+        name = f"gemm_{self.M}_{self.N}_{self.K}_{self.dtype}"
+        if self.tA == "T":
+            name += "_tA"
+        elif self.tB == "T":
+            name += "_tB"
+        return name
+
+    def get_inp1(self) -> str:
+        if self.tA == "T":
+            inp1 = f"{self.K}x{self.M}x{self.dtype}"
+        else:
+            inp1 = f"{self.M}x{self.K}x{self.dtype}"
+        return inp1
+
+    def get_inp2(self) -> str:
+        if self.tB == "T":
+            inp2 = f"{self.N}x{self.K}x{self.dtype}"
+        else:
+            inp2 = f"{self.K}x{self.N}x{self.dtype}"
+        return inp2
+
+    def get_byte_count(self) -> int:
+        dtype_bits_map = {
+            "f32": 32,
+            "f16": 16,
+            "bf16": 16,
+            "f8E4M3FNUZ": 8,
+            "i8": 8,
+            "i32": 32,
+        }
+        bytes_per_element = dtype_bits_map[self.dtype] // 8
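+        # Data volume is A (M*K) + B (N*K) + C (M*N) elements, each bytes_per_element wide.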
+        element_count = self.M * self.K + self.N * self.K + self.M * self.N
+        byte_count = element_count * bytes_per_element
+        return byte_count
+
+    def get_flops(self) -> int:
+        flops = 2 * self.M * self.N * self.K
+        return flops
+
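+# Emit one of three linalg templates: transpose-A, transpose-B, or plain matmul.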
+def generate_mlir(config: GemmConfig):
+    K = config.K
+    M = config.M
+    N = config.N
+    dtype = config.dtype
+    tA = config.tA
+    tB = config.tB
+    mlir_template_A = f"""
+module {{
+  func.func @main(%arg0: tensor<{K}x{M}x{dtype}>, %arg1: tensor<{K}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}> {{
+    %cst = arith.constant 0.000000e+00 : {dtype}
+    %0 = tensor.empty() : tensor<{M}x{N}x{dtype}>
+    %1 = linalg.fill ins(%cst : {dtype}) outs(%0 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
+    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<{K}x{M}x{dtype}>, tensor<{K}x{N}x{dtype}>) outs(%1 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
+    return %2 : tensor<{M}x{N}x{dtype}>
+  }}
+}}
+"""
+
+    mlir_template_B = f"""
+module {{
+  func.func @main(%arg0: tensor<{M}x{K}x{dtype}>, %arg1: tensor<{N}x{K}x{dtype}>) -> tensor<{M}x{N}x{dtype}> {{
+    %cst = arith.constant 0.000000e+00 : {dtype}
+    %0 = tensor.empty() : tensor<{M}x{N}x{dtype}>
+    %1 = linalg.fill ins(%cst : {dtype}) outs(%0 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
+    %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<{M}x{K}x{dtype}>, tensor<{N}x{K}x{dtype}>) outs(%1 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
+    return %2 : tensor<{M}x{N}x{dtype}>
+  }}
+}}
+"""
+
+    mlir_template = f"""module {{
+  func.func @main(%arg0: tensor<{M}x{K}x{dtype}>, %arg1: tensor<{K}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}> {{
+    %cst = arith.constant 0.000000e+00 : {dtype}
+    %0 = tensor.empty() : tensor<{M}x{N}x{dtype}>
+    %1 = linalg.fill ins(%cst : {dtype}) outs(%0 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
+    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<{M}x{K}x{dtype}>, tensor<{K}x{N}x{dtype}>) outs(%1 : tensor<{M}x{N}x{dtype}>) -> tensor<{M}x{N}x{dtype}>
+    return %2 : tensor<{M}x{N}x{dtype}>
+  }}
+}}
+"""
+    if tA == "T":
+        return mlir_template_A
+    if tB == "T":
+        return mlir_template_B
+    return mlir_template
+
+
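+# Write the kernel to disk, then shell out to iree-compile; on failure the
+# compiler's stderr is dumped to a sidecar _error.txt file.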
+def compile_gemm_config(
+    config: GemmConfig, kernel_dir: Path, vmfb_dir: Path, target, extra_compiler_args
+) -> tuple[Path, Optional[Path]]:
+    mlir_file = kernel_dir / (config.get_name() + ".mlir")
+    vmfb_file = vmfb_dir / (config.get_name() + ".vmfb")
+
+    if not os.path.exists(vmfb_dir):
+        os.makedirs(vmfb_dir)
+
+    # Generate mlir content
+    mlir_content = generate_mlir(config)
+
+    # Write MLIR content to file
+    with open(mlir_file, "w") as f:
+        f.write(mlir_content)
+
+    # Compile MLIR to VMFB
+    exec_args = [
+        "iree-compile",
+        f"{mlir_file}",
+        "--iree-hal-target-backends=rocm",
+        f"--iree-hip-target={target}",
+        "--iree-llvmgpu-enable-prefetch=true",
+        "-o",
+        f"{vmfb_file}",
+    ] + extra_compiler_args
+
+    print(" ".join(exec_args))
+
+    ret_value, stderr = run_iree_command(exec_args)
+    if ret_value == 0:
+        print(f"Successfully compiled {mlir_file} to {vmfb_file}")
+    else:
+        error_file = vmfb_dir / (config.get_name() + "_error.txt")
+        print(f"Failed to compile {mlir_file}. Error dumped in {error_file}")
+        with open(error_file, "w") as f:
+            f.write(stderr.decode("utf-8"))
+        return mlir_file, None
+
+    return mlir_file, vmfb_file
diff --git a/gemmbench/problems.py b/gemmbench/problems.py
index 73847b8..8cdada4 100644
--- a/gemmbench/problems.py
+++ b/gemmbench/problems.py
@@ -4,6 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from gemm_utils import GemmConfig
+
 def is_compute_bound(M, N, K, bpe):
     """Is this GEMM compute (or memory) bound?"""
     magic_ratio = 64
@@ -637,197 +639,232 @@ def is_compute_bound(M, N, K, bpe):
     (8192, 5120, 640),
 ]
 
-def llama13bmatvec(configs):
+def llama13bmatvec(dtype: str) -> list[GemmConfig]:
     """LLAMA 13b, single batch, FP16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "13b":
-            configs.append((
-                "llama13bmatvec",
+            configs.append(GemmConfig(
                 m,
                 n,
                 k,
                 "T",
                 "N",
-                "f16"
+                dtype
             ))
+    return configs
 
-def llama13bmatvecbf16(configs):
+def llama13bmatvecbf16(dtype: str) -> list[GemmConfig]:
     """LLAMA 13b, single batch, BF16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "13b":
-            configs.append((
-                "llama13bmatvecbf16",
+            configs.append(GemmConfig(
                 m,
                 n,
                 k,
                 "T",
                 "N",
-                "bf16"
+                dtype
            ))
+    return configs
 
-def llama70bmatvec(configs):
+def llama70bmatvec(dtype: str) -> list[GemmConfig]:
     """LLAMA 70b, single batch, FP16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "70b":
-            configs.append((
-                "llama70bmatvec",
+            configs.append(GemmConfig(
                 m,
                 n,
                 k,
                 "T",
                 "N",
-                "f16",
+                dtype
            ))
+    return configs
 
-def llama70bmatvecbf16(configs):
+def llama70bmatvecbf16(dtype: str) -> list[GemmConfig]:
     """LLAMA 70b, single batch, BF16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "70b":
-            configs.append((
-                "llama70bmatvecbf16",
+            configs.append(GemmConfig(
                 m,
                 n,
                 k,
                 "T",
                 "N",
-                "bf16",
+                dtype
            ))
+    return configs
 
-def llama13bskinny(configs):
+def llama13bskinny(dtype: str) -> list[GemmConfig]:
     """LLAMA 13b, multiple batches, FP16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "13b":
             for batch in [2, 4, 8, 16, 32]:
-                configs.append((
-                    "llama13bskinny",
+                configs.append(GemmConfig(
                     m,
                     batch,
                     k,
                     "T",
                     "N",
-                    "f16",
+                    dtype
                ))
+    return configs
 
-def llama13bskinnybf16(configs):
+def llama13bskinnybf16(dtype: str) -> list[GemmConfig]:
     """LLAMA 13b, multiple batches, BF16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "13b":
             for batch in [2, 4, 8, 16, 32]:
-                configs.append((
-                    "llama13bskinnybf16",
+                configs.append(GemmConfig(
                     m,
                     batch,
                     k,
                     "T",
                     "N",
-                    "bf16",
+                    dtype
                ))
+    return configs
 
-def llama70bskinny(configs):
+def llama70bskinny(dtype: str) -> list[GemmConfig]:
     """LLAMA 70b, multiple batches, FP16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "70b":
             for batch in [2, 4, 8, 16, 32]:
-                configs.append((
-                    "llama70bskinny",
+                configs.append(GemmConfig(
                     m,
                     batch,
                     k,
                     "T",
                     "N",
-                    "f16",
+                    dtype
                ))
+    return configs
 
-def llama70bskinnybf16(configs):
+def llama70bskinnybf16(dtype: str) -> list[GemmConfig]:
     """LLAMA 70b, multiple batches, BF16."""
+    configs = []
     for m, n, k, model, gcount in LLAMA:
         if n == 1 and model == "70b":
             for batch in [2, 4, 8, 16, 32]:
-                configs.append((
-                    "llama70bskinnybf16",
+                configs.append(GemmConfig(
                     m,
                     batch,
                     k,
                     "T",
                     "N",
-                    "bf16",
+                    dtype
                ))
+    return configs
 
-def gpt4memory(configs):
+def gpt4memory(dtype: str) -> list[GemmConfig]:
     """GPT4 memory bound GEMMs; FP16."""
+    configs = []
     for m, n, k in GPT4:
-        hgemm = ("gpt4memory", m, n, k, "N", "N", "f16")
+        hgemm = GemmConfig(m, n, k, "N", "N", dtype)
         if not is_compute_bound(m, n, k, 2):
-            yield configs.append(hgemm)
+            configs.append(hgemm)
+    return configs
 
-def gpt4compute(configs):
+def gpt4compute(dtype: str) -> list[GemmConfig]:
     """GPT4 compute bound GEMMs; FP16."""
+    configs = []
     for m, n, k in GPT4:
-        hgemm = ("gpt4compute", m, n, k, "N", "N", "f16")
+        hgemm = GemmConfig(m, n, k, "N", "N", dtype)
         if is_compute_bound(m, n, k, 2):
             configs.append(hgemm)
+    return configs
 
-def gpt4clocktest(configs):
+def gpt4clocktest(dtype: str) -> list[GemmConfig]:
     """GPT4 compute bound GEMMs; FP16."""
+    configs = []
     macM, macN = 128, 128
     M, N, K = 2048, 2048, 8192
-
     for mult in range(1, M//macM + 1):
-        configs.append(("clocktest", mult * macM, mult * macN, K, "N", "N", "f16"))
+        configs.append(GemmConfig(mult * macM, mult * macN, K, "N", "N", dtype))
+    return configs
 
-def test(configs):
+def test(dtype: str) -> list[GemmConfig]:
     """GPT4 compute bound GEMMs; FP16."""
     #M, N, K = 2048, 2048, 8192
+    configs = []
     M, N, K = 128, 128, 8192
-    configs.append(("test", M, N, K, "N", "N", "f16"))
+    configs.append(GemmConfig(M, N, K, "N", "N", dtype))
     M, N, K = 2048, 2048, 8192
-    configs.append(("test", M, N, K, "N", "N", "f16"))
+    configs.append(GemmConfig(M, N, K, "N", "N", dtype))
+    return configs
 
-def llama70bmemory(configs):
+def llama70bmemory(dtype: str) -> list[GemmConfig]:
     """LLAMA 70b memory bound GEMMs; NT; BF16."""
-
+    configs = []
     for n in [1280, 3584, 7168]:
-        configs.append(("llama70bmemory", 2, n, 8192, "N", "T", "bf16"))
+        configs.append(GemmConfig(2, n, 8192, "N", "T", dtype))
+    return configs
 
-def compute(configs):
+def compute(dtype: str) -> list[GemmConfig]:
     """Compute bound GEMMs."""
     #for dtype in ["fp16", "bf16", "fp8"]:
-    for dtype in ["f16", "bf16"]:
+    configs = []
+    for dtype in [dtype]:
         for tA in ["N", "T"]:
             for tB in ["N", "T"]:
-                configs.append(("compute", 4096, 4096, 8192, tA, tB, dtype))
+                if tA == "N" or tB == "N":
+                    configs.append(GemmConfig(4096, 4096, 8192, tA, tB, dtype))
+    return configs
 
-def unet(configs):
-    for dtype in ["f16", "bf16"]:
+def unet(dtype: str) -> list[GemmConfig]:
+    configs = []
+    for dtype in [dtype]:
         for tA in ["N", "T"]:
             for tB in ["N", "T"]:
                 for m, n, k in UNET:
-                    configs.append(("unet", m, n, k, tA, tB, dtype))
-
-def all(configs):
-    llama13bmatvec(configs)
-    llama13bmatvecbf16(configs)
-    llama70bmatvec(configs)
-    llama70bmatvecbf16(configs)
-    llama13bskinny(configs)
-    llama13bskinnybf16(configs)
-    llama70bskinny(configs)
-    llama70bskinnybf16(configs)
-    gpt4memory(configs)
-    gpt4compute(configs)
-    llama70bmemory(configs)
-    compute(configs)
-    unet(configs)
+                    if tA == "N" or tB == "N":
+                        configs.append(GemmConfig(m, n, k, tA, tB, dtype))
+    return configs
 
+def get_gemm_configs() -> list[tuple[str, GemmConfig]]:
+    configs: list[tuple[str, GemmConfig]] = []
+    llama13bmatvec_configs = llama13bmatvec("f16")
+    llama13bmatvec_configs += llama13bmatvecbf16("bf16")
+    llama70bmatvec_configs = llama70bmatvec("f16")
+    llama70bmatvec_configs += llama70bmatvecbf16("bf16")
+    llama13bskinny_configs = llama13bskinny("f16")
+    llama13bskinny_configs += llama13bskinnybf16("bf16")
+    llama70bskinny_configs = llama70bskinny("f16")
+    llama70bskinny_configs += llama70bskinnybf16("bf16")
+    gpt4compute_configs = gpt4compute("f16")
+    llama70bmemory_configs = llama70bmemory("bf16")
+    compute_configs = compute("f16")
+    compute_configs += compute("bf16")
+    unet_configs = unet("f16")
+    unet_configs += unet("bf16")
+
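+    # Pair every config with its workload tag; the tag is what lands in the results CSV.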
[("llama70bmemory", x) for x in llama70bmemory_configs] + configs += [("compute", x) for x in compute_configs] + configs += [("unet", x) for x in unet_configs] + + return configs