write technical report

AyiStar · May 29, 2024 · 2cbe9c4 · 2cbe9c4
1 parent 013c290
commit 2cbe9c4
Show file tree

Hide file tree

Showing 12 changed files with 872 additions and 380 deletions.
diff --git a/.gitignore b/.gitignore
@@ -26,9 +26,9 @@ a.out
 .clang-tidy
 .vs/
 .idea/
+.vscode/
 draft/
 core
 
-.vscode/**/*
-!.vscode/settings.json
-!.vscode/c_cpp_properties.json
+model_weights/*
+!model_weights/*.py
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -46,7 +46,30 @@
         "__locale": "cpp",
         "__verbose_abort": "cpp",
         "ios": "cpp",
-        "locale": "cpp"
+        "locale": "cpp",
+        "cctype": "cpp",
+        "charconv": "cpp",
+        "chrono": "cpp",
+        "cinttypes": "cpp",
+        "clocale": "cpp",
+        "codecvt": "cpp",
+        "condition_variable": "cpp",
+        "cstdarg": "cpp",
+        "ctime": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "map": "cpp",
+        "unordered_map": "cpp",
+        "functional": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "ratio": "cpp",
+        "fstream": "cpp",
+        "iomanip": "cpp",
+        "mutex": "cpp",
+        "sstream": "cpp",
+        "thread": "cpp",
+        "variant": "cpp"
     },
     "C_Cpp.default.compilerPath": "/usr/bin/g++",
 

diff --git a/Makefile b/Makefile
@@ -2,10 +2,11 @@
 
 LLAMA_CPP_DIR = ./llama.cpp-b2430
 SRC_DIR = ./src
-TEST_DIR = ./tests
+TEST_DIR = ./test
 
 ifdef DEBUG
 export LLAMA_DEBUG = 1
+LAMM_FLAGS += -DLAMM_DEBUG
 endif
 
 ifndef NLA_LLAMA
@@ -25,18 +26,18 @@ endif
 
 export LAMM_FLAGS
 
-.PHONY: mm_bench
-mm_bench: $(SRC_DIR)/loongarch_matmul.o
+.PHONY: benchmark
+benchmark: $(SRC_DIR)/loongarch_matmul.o
 	$(MAKE) -C $(LLAMA_CPP_DIR) la-benchmark-matmult $(MK_FORCE) -j8
 
-$(SRC_DIR)/loongarch_matmul.o:
-	$(MAKE) -C $(LLAMA_CPP_DIR) lamm
-
 .PHONY: main
 main: $(SRC_DIR)/loongarch_matmul.o
 	$(MAKE) -C $(LLAMA_CPP_DIR) main $(MK_FORCE) -j8
-	cp $(LLAMA_CPP_DIR)/main $(TEST_DIR)
+	cp $(LLAMA_CPP_DIR)/main $(TEST_DIR)/main
+
+$(SRC_DIR)/loongarch_matmul.o:
+	$(MAKE) -C $(LLAMA_CPP_DIR) lamm
 
 .PHONY: clean
 clean:
-	rm -f $(SRC_DIR)/*.o $(SRC_DIR)/la-benchmark-matmult
+	rm -f $(SRC_DIR)/*.o $(TEST_DIR)/la-benchmark-matmult $(TEST_DIR)/main
diff --git a/README.md b/README.md
diff --git a/dev.md b/dev.md
@@ -0,0 +1,75 @@
+# LA-llama.cpp
+
+Let's play LLM on LoongArch!
+
+
+## Overview
+
+The project aims to port llama.cpp, a C++ LLM inference framework, to LoongArch and optimize it there.
+In particular, we want to tackle the following challenges:
+
+* Potential problems when porting the code to the LoongArch platform.
+* Inference performance optimization via SIMD, currently targeting the 3A6000 platform.
+* LLM evaluation on LoongArch platform.
+* Interesting applications with presentation.
+
+## Project Structure
+
+The overall directory structure of this project is organized as follows:
+- `llama.cpp-b2430/`: The original code of llama.cpp, pinned to release version `b2430`. During development, we try to keep changes within this directory to a minimum by only revising the build system (Makefile) and some conditionally compiled code (macros that hook in our work). Most of the real work is in the `src/` directory.
+- `src/`: This is where we put the real optimization code, i.e., `loongarch_matmul.[cpp|h]`.
+- `test/`: The benchmark code, which is adapted from `llama.cpp-b2430/examples/benchmark/benchmark-matmult.cpp`. This means the performance measurements are directly comparable with results previously reported by the community.
+- `docs/`: The documentation generated along with the project.
+
+## Plan
+
+Based on the above challenges, the project can be divided into the following 5 stages:
+
+### Setup
+- Task: Build basic environments and get familiar with the codebase.
+- Objective: Environment setup and self warm-up.
+
+### Porting
+- Task: Port llama.cpp to LoongArch platform.
+- Objective: Compile and run llama.cpp on 3A6000.
+
+### Optimization
+- Task: Optimize the efficiency of llama.cpp on LoongArch (focus on CPU).
+- Objective: Apply programming optimization techniques and document the improvements.
+
+### Evaluation
+- Task: Benchmark various LLMs of different sizes.
+- Objective: Output a technical report.
+
+### Application
+- Task: Deploy usable applications with LLM on LoongArch platforms.
+- Objective: Output well-written deployment documents and visual demos.
+
+## Miscellaneous
+- We develop based on release `b2430` of the [original repo](https://github.com/ggerganov/llama.cpp/releases/tag/b2430).
+
+## Progress and TODO list
+
+### Setup Stage
+At this stage, we get familiar with the concept of cross compilation and set up the build and emulation environments.
+- [x] Compile and run original llama.cpp on x86 CPU.
+- [x] Cross compile llama.cpp to RISCV64 and run with QEMU on x86 CPU (refer to https://github.com/ggerganov/llama.cpp/pull/3453).
+- [x] Set up cross compilation tools and QEMU environment for LoongArch.
+
+### Porting Stage
+- [x] Alter the makefile for LoongArch cross compilation.
+- [x] Cross compile llama.cpp to LoongArch64.
+
+### Optimization Stage
+Thanks to [the excellent work from Loongson team](https://github.com/ggerganov/llama.cpp/pull/6454), we have a great opportunity to learn about SIMD acceleration with the LoongArch LSX/LASX vector instruction sets. Part of our work is based on theirs.
+- [x] Identify performance bottleneck in llama.cpp.
+- [x] Add LSX/LASX SIMD acceleration for llama.cpp.
+- [x] Add LASX GEMM acceleration for llama.cpp.
+
+### Benchmark Stage
+Benchmark goes along with optimization because we always want to know the exact improvement.
+- [x] Measure performance improvement on Loongson 3A6000 processor.
+
+### Finish Stage
+Output a well-organized technical report.
+- [ ] Complete technical report.
diff --git a/docs/cross_compilation.md b/docs/cross_compilation.md
diff --git a/docs/optimization.md b/docs/optimization.md