
Commit 5931bc3
Merge pull request #4 from kroggen/update-readme
Update README.md
ankan-ban authored Sep 16, 2023
2 parents c247a35 + 6c528fd commit 5931bc3
Showing 1 changed file with 6 additions and 1 deletion.
README.md: 7 changes (6 additions & 1 deletion)
@@ -19,12 +19,14 @@ Here are the commands for the 7B model:

```
git clone https://github.com/ankan-ban/llama_cu_awq
+cd llama_cu_awq
gcc weight_packer.cpp -o weight_packer
nvcc -O3 llama2_q4.cu -o llama2_q4
wget https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq/resolve/main/pytorch_model.bin
wget https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq/resolve/main/config.json
+pip install numpy torch
python3 convert_awq_to_bin.py pytorch_model.bin output
./weight_packer config.json output llama2-7b-awq-q4.bin
@@ -35,6 +37,7 @@ And here are the commands for the 13B model:

```
git clone https://github.com/ankan-ban/llama_cu_awq
+cd llama_cu_awq
gcc weight_packer.cpp -o weight_packer
nvcc -O3 llama2_q4.cu -o llama2_q4
@@ -43,6 +46,7 @@ wget https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-13b-chat-hf-w4-g1
wget https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-13b-chat-hf-w4-g128-awq/resolve/main/pytorch_model-00002-of-00003.bin
wget https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-13b-chat-hf-w4-g128-awq/resolve/main/pytorch_model-00003-of-00003.bin
+pip install numpy torch
python3 convert_awq_to_bin.py pytorch_model-00001-of-00003.bin output
python3 convert_awq_to_bin.py pytorch_model-00002-of-00003.bin output
python3 convert_awq_to_bin.py pytorch_model-00003-of-00003.bin output
@@ -68,6 +72,7 @@ Example:
python -m awq.entry --model_path /path-to-model/Llama-2-7b-chat-hf --w_bit 4 --q_group_size 128 --run_awq --dump_awq awq_cache/llama2-7b-chat-metadata.pt
python -m awq.entry --model_path /path-to-model/Llama-2-7b-chat-hf --w_bit 4 --q_group_size 128 --load_awq awq_cache/llama2-7b-chat-metadata.pt --q_backend real --dump_quant awq_weights/llama2-7b-awq.pt
+pip install numpy torch
python3 convert_awq_to_bin.py awq_weights/llama2-7b-awq.pt output
./weight_packer config.json output llama2-7b-awq-q4.bin
```
@@ -78,7 +83,7 @@ python3 convert_awq_to_bin.py awq_weights/llama2-7b-awq.pt output
We get ~200 tokens per second with RTX 4090 for 7b parameter models:

```
-llama2_q4_opt.exe llama2-7b-awq-q4.bin -n 256 -i "write an essay about GPUs"
+llama2_q4.exe llama2-7b-awq-q4.bin -n 256 -i "write an essay about GPUs"
Model params:-
dim: 4096
[remaining diff truncated]
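
For readability, here is the full 7B command sequence as the README reads after this change, collected from the hunks above; nothing here is new except the step comments, and the final run command is quoted from the benchmark section (adjust the executable name for your platform):

```
# clone and build the weight packer and the CUDA inference binary
git clone https://github.com/ankan-ban/llama_cu_awq
cd llama_cu_awq
gcc weight_packer.cpp -o weight_packer
nvcc -O3 llama2_q4.cu -o llama2_q4

# download the AWQ-quantized 7B chat weights and config
wget https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq/resolve/main/pytorch_model.bin
wget https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq/resolve/main/config.json

# convert and pack the weights into the llama2_q4 binary format
pip install numpy torch
python3 convert_awq_to_bin.py pytorch_model.bin output
./weight_packer config.json output llama2-7b-awq-q4.bin

# run inference (command as quoted in the README's benchmark example)
llama2_q4.exe llama2-7b-awq-q4.bin -n 256 -i "write an essay about GPUs"
```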
