support llama2 model
chenqy4933 committed Aug 16, 2023
1 parent 9bc347c commit fcce99b
Showing 10 changed files with 203 additions and 28 deletions.
6 changes: 3 additions & 3 deletions CMakeLists.txt
@@ -116,8 +116,8 @@ if(ENABLE_ASAN)
add_definitions(-DENABLE_ASAN)
endif()

add_executable(alpaca application/alpaca/alpaca.cpp)
target_link_libraries(alpaca InferLLM)
add_executable(llama application/llama/llama.cpp)
target_link_libraries(llama InferLLM)

add_executable(chatglm application/chatglm/chatglm.cpp)
target_link_libraries(chatglm InferLLM)
@@ -131,7 +131,7 @@ target_link_libraries(quantizer InferLLM)
if(ENABLE_GPU)
target_link_libraries(InferLLM InferLLMGPU)
target_link_libraries(InferLLMShared InferLLMGPU)
target_link_libraries(alpaca InferLLMGPU)
target_link_libraries(llama InferLLMGPU)
target_link_libraries(chatglm InferLLMGPU)
target_link_libraries(quantizer InferLLMGPU)
target_link_libraries(chat InferLLMGPU)
8 changes: 7 additions & 1 deletion README.md
@@ -11,9 +11,14 @@ InferLLM is a lightweight LLM model inference framework that mainly references a

In short, InferLLM is a simple and efficient CPU inference framework for LLMs that can deploy quantized LLM models locally with good inference speed.

## Latest News
- 2023.08.16: Added support for the Llama-2-7B model.
- 2023.08.08: Optimized performance on Arm by rewriting the int4 matmul kernel with Arm assembly and kernel packing.
- Earlier: support for the chatglm/chatglm2, baichuan, alpaca, and ggml-llama models.

## How to use
### Download model
Currently, InferLLM uses the same models as llama.cpp and can download models from the llama.cpp project. In addition, models can also be downloaded directly from Hugging Face [kewin4933/InferLLM-Model](https://huggingface.co/kewin4933/InferLLM-Model/tree/main). Currently, two alpaca models are uploaded in this project, one is the Chinese int4 model and the other is the English int4 model.
Currently, InferLLM uses the same models as llama.cpp and can download models from the llama.cpp project. In addition, models can also be downloaded directly from Hugging Face [kewin4933/InferLLM-Model](https://huggingface.co/kewin4933/InferLLM-Model/tree/main). This project currently hosts alpaca, llama2, chatglm/chatglm2, and baichuan int4 models; the alpaca models come in two variants, a Chinese int4 model and an English int4 model.
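
If you prefer to script the download, here is a minimal sketch using the `huggingface_hub` client (not part of InferLLM itself); the filename below is an assumption, so check the repository's file list for the exact model you need.

```python
# Minimal download sketch using huggingface_hub; the filename is an assumption,
# browse the kewin4933/InferLLM-Model repo for the exact file you want.
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="kewin4933/InferLLM-Model",
    filename="chinese-alpaca-7b-q4.bin",  # assumed filename
)
print("model downloaded to:", model_path)
```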

### Compile InferLLM
#### Local compilation
@@ -52,6 +57,7 @@ Now InferLLM supports the following models:
* [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B): for usage, please refer to [ChatGLM](./application/chatglm/Readme.md)
* [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B): for usage, please refer to [ChatGLM](./application/chatglm/Readme.md)
* [llama](https://github.com/facebookresearch/llama)
* [llama2](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
* [alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html)
* [baichuan](https://github.com/baichuan-inc/baichuan-7B): for usage, please refer to [baichuan](./application/baichuan/Readme.md)
### License
4 changes: 2 additions & 2 deletions README_Chinese.md
@@ -30,7 +30,7 @@ export NDK_ROOT=/path/to/ndk
./tools/android_build.sh
```
### Run InferLLM
To run locally, simply execute `./alpaca -m chinese-alpaca-7b-q4.bin -t 4`. To run on a phone, use adb to copy alpaca and the model file to the phone, then run `adb shell ./alpaca -m chinese-alpaca-7b-q4.bin -t 4`. Below are screenshots of it running:
To run locally, simply execute `./llama -m chinese-alpaca-7b-q4.bin -t 4`. To run on a phone, use adb to copy llama and the model file to the phone, then run `adb shell ./llama -m chinese-alpaca-7b-q4.bin -t 4`. Below are screenshots of it running:
- x86 chip: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
![x86 run](./assets/x86.gif)
- Phone: Xiaomi 9, Qualcomm SM8150 Snapdragon 855
@@ -40,7 +40,7 @@ export NDK_ROOT=/path/to/ndk

Based on the [x86 profiling results](./docs/profile.md), we recommend using 4 threads.

The default device is the CPU. To use GPU inference, specify the GPU device with `./alpaca -m chinese-alpaca-7b-q4.bin -g GPU`.
The default device is the CPU. To use GPU inference, specify the GPU device with `./llama -m chinese-alpaca-7b-q4.bin -g GPU`.

### License
InferLLM is licensed under the Apache License, Version 2.0
162 changes: 162 additions & 0 deletions application/llama/convert.py
@@ -0,0 +1,162 @@
# Convert a Llama-2 model checkpoint to an InferLLM compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# Model Structure Header:
# - Magic number (int)
# - Param Offset (int)
# - Param Length (int)
# - Vocabulary Offset (int)
# - Vocabulary Length (int)
# - Tensor offset (int)
#
# Param:
# - Embedding size (int)
# - Number of heads (int)
# - Number of layers (int)
# - FC hidden size (int)
# - Vocabulary size (int)
#
# For each tensor, write the following:
# - Number of dimensions (int)
# - Name length (int)
# - Data type (int) (0 = float32, 1 = float16, 2 = int8, 3 = uint8)
# - Dimensions (int[n_dims])
# - Name (char[name_length])
# - Data (raw bytes)
#
#

import sys
import json
import struct
import numpy as np
import torch
import argparse
import tempfile
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentencepiece import SentencePieceProcessor

# parse arguments
parser = argparse.ArgumentParser(description="Convert a Llama-2 model to an InferLLM compatible file")
parser.add_argument("-o", "--outfile", type=str, help="the output file")
args = parser.parse_args()

# output in the same directory as the model
model_out_path = args.outfile

hparams = {
"embd_size": 4096,
"n_heads": 32,
"n_layers": 32,
"fc_hidden": 11008,
}

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf").float().state_dict()
dtype = 0  # 0 = float32

auto_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_fast=False)
vocab_dir = tempfile.mkdtemp()
auto_tokenizer.save_vocabulary(vocab_dir)
tokenizer = SentencePieceProcessor(vocab_dir + "/tokenizer.model")

hparams.update({"vocab_size": tokenizer.vocab_size()})
print(hparams)

fout = open(model_out_path, "wb")
fout.write(struct.pack("i", 0x0123456)) # magic: inferllm

# the model parameters
param_byte = struct.pack("i", hparams["embd_size"])
param_byte += struct.pack("i", hparams["n_heads"])
param_byte += struct.pack("i", hparams["n_layers"])
param_byte += struct.pack("i", hparams["fc_hidden"])
param_byte += struct.pack("i", hparams["vocab_size"])

vocab_byte = bytearray()
for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
        # "<unk>" token (translated as ??)
        text = " \u2047 ".encode("utf-8")
        vocab_byte += struct.pack("i", len(text))
        vocab_byte += text
    elif tokenizer.is_control(i):
        # "<s>"/"</s>" tokens
        vocab_byte += struct.pack("i", 0)
    elif tokenizer.is_byte(i):
        # "<U+XX>" tokens (which may be invalid UTF-8)
        piece = tokenizer.id_to_piece(i)
        if len(piece) != 6:
            print("Invalid token: " + piece)
            sys.exit(1)
        byte_value = int(piece[3:-1], 16)
        vocab_byte += struct.pack("i", 1)
        vocab_byte += struct.pack("B", byte_value)
    else:
        # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
        vocab_byte += struct.pack("i", len(text))
        vocab_byte += text

# write the model header

param_offset_addr = fout.tell() # param offset
fout.seek(4, 1) # reserve 4 bytes for the param offset (filled in later)
fout.write(struct.pack("i", len(param_byte))) # param length
vocab_offset_addr = fout.tell() # vocab offset
fout.seek(4, 1) # reserve 4 bytes for the vocab offset (filled in later)
fout.write(struct.pack("i", len(vocab_byte))) # vocab length
tensor_offset_addr = fout.tell() # tensor offset
fout.seek(4, 1) # reserve 4 bytes for the tensor offset (filled in later)

param_offset = fout.tell()
# write the model parameters
fout.write(param_byte)
vocab_offset = fout.tell()
fout.write(vocab_byte)
tensor_offset = fout.tell()

# write the offsets
fout.seek(param_offset_addr, 0)
fout.write(struct.pack("i", param_offset))
fout.seek(vocab_offset_addr, 0)
fout.write(struct.pack("i", vocal_offset))
fout.seek(tensor_offset_addr, 0)
fout.write(struct.pack("i", tensor_offset))

# seek to the end of the file
fout.seek(0, 2)

def dump_tensor(v, name, file):
    data = v.numpy().squeeze()
    n_dims = len(data.shape)

    dshape = data.shape
    sname = name.encode('utf-8')
    print("write tensor: ", name, " to file :", file.tell())
    file.write(struct.pack("iii", n_dims, len(sname), dtype))
    for i in range(n_dims):
        file.write(struct.pack("i", dshape[i]))
    file.write(sname)
    data.tofile(file)

for k, v in model.items():
    name = k
    shape = v.shape

    # skip layers.X.attention.inner_attention.rope.freqs
    if name.endswith("inv_freq"):
        continue

    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
    dump_tensor(v, name, fout)

# I hope this deallocates the memory ..
model = None

fout.close()

print("Done. Output file: " + model_out_path)
print("")
17 changes: 12 additions & 5 deletions application/alpaca/alpaca.cpp → application/llama/llama.cpp
@@ -30,13 +30,14 @@ struct app_params {
float temp = 0.10f;
float repeat_penalty = 1.30f;

std::string model = "/home/supercb/mycode/mlsys/InferLLM/chinese-alpaca-7b-q4.bin"; // model path
std::string model = "chinese-alpaca-7b-q4.bin"; // model path

bool use_color = true; // use color to distinguish generations and inputs
bool use_mmap = false; // use mmap to load model
std::string dtype = "float32"; // configure the compute dtype
std::string device = "CPU"; // configure the compute device type
std::string mtype = "llama"; // the model type name, llama
int32_t version = 1; // the model version
};

void app_print_usage(int argc, char** argv, const app_params& params) {
@@ -80,9 +81,7 @@ void app_print_usage(int argc, char** argv, const app_params& params) {
fprintf(stderr,
" -g type configure the compute device type, default CPU, "
"can be CPU and GPU now.\n");
fprintf(stderr,
" --model_type type the model type name, default llama, can only be "
"llama now.\n");
fprintf(stderr, " --version N the llama model version, default 1.\n");
fprintf(stderr, "\n");
}

@@ -113,7 +112,9 @@ bool app_params_parse(int argc, char** argv, app_params& params) {
params.model = argv[++i];
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "--mmap") {
} else if (arg == "--version" || arg == "-v") {
params.version = std::stoi(argv[++i]);
} else if (arg == "--mmap") {
params.use_mmap = true;
} else if (arg == "-h" || arg == "--help") {
app_print_usage(argc, argv, params);
@@ -160,6 +161,12 @@ int main(int argc, char** argv) {
config.enable_mmap = params.use_mmap;
config.nr_ctx = params.n_ctx;

if (params.version == 1) {
    params.mtype = "llama";
} else if (params.version == 2) {
    params.mtype = "llama2";
}

std::shared_ptr<inferllm::Model> model =
std::make_shared<inferllm::Model>(config, params.mtype);
model->load(params.model);
8 changes: 4 additions & 4 deletions src/graph/llama.cpp → src/graph/ggml_llama.cpp
@@ -1,8 +1,8 @@
#include "llama.h"
#include "ggml_llama.h"
#include <iostream>
using namespace inferllm;

void LlamaGraph::set_weights_alias() {
void GgmlLlamaGraph::set_weights_alias() {
m_weights_name_aliases.clear();
m_weights_name_aliases = {
{"norm.weight", "head.norm.weight"},
@@ -16,7 +16,7 @@ void LlamaGraph::set_weights_alias() {
}

//! LlamaGraph
void LlamaGraph::load(
void GgmlLlamaGraph::load(
std::shared_ptr<InputFile> fin, LlmParams& param,
std::shared_ptr<Vocab> vocab) {
// verify the magic number wrote when model convert
@@ -119,7 +119,7 @@ void LlamaGraph::load(
INFER_LOG("total weight length = %lu\n", weight_length);
}

void LlamaGraph::construct_llm() {
void GgmlLlamaGraph::construct_llm() {
uint32_t embd = m_param.n_embd;
uint32_t mult = m_param.n_mult;
uint32_t head = m_param.n_head;
2 changes: 1 addition & 1 deletion src/graph/llama.h → src/graph/ggml_llama.h
@@ -14,7 +14,7 @@ enum class LlamaModelType {
LLAMA_FILE_VERSION_GGJT_V1
};

class LlamaGraph : public Graph {
class GgmlLlamaGraph : public Graph {
using Graph::Graph;

public:
10 changes: 5 additions & 5 deletions src/graph/graph_imp.cpp
@@ -1,18 +1,18 @@
#include "chatGLM.h"
#include "llama.h"
#include "baichuan.h"
#include "ggml_llama.h"
#include "llama_like.h"

using namespace inferllm;
std::shared_ptr<Graph> Graph::make_graph(
UserConfig model_config, Device* device, const std::string& name) {
if (name == "llama") {
return std::make_shared<LlamaGraph>(model_config, device, name);
return std::make_shared<GgmlLlamaGraph>(model_config, device, name);
} else if (name == "chatglm") {
return std::make_shared<ChatGLMGraph>(model_config, device, name);
} else if (name == "chatglm2") {
return std::make_shared<ChatGLMGraph2>(model_config, device, name);
} else if (name == "baichuan") {
return std::make_shared<BaiChuanGraph>(model_config, device, name);
} else if (name == "baichuan" || name == "llama2") {
return std::make_shared<LlamaLikeGraph>(model_config, device, name);
} else {
INFER_ASSERT(0, "unsupported model.");
}
9 changes: 5 additions & 4 deletions src/graph/baichuan.cpp → src/graph/llama_like.cpp
@@ -1,8 +1,9 @@
#include "baichuan.h"
#include "llama_like.h"
#include "chatGLM.h"

using namespace inferllm;

void BaiChuanGraph::set_weights_alias() {
void LlamaLikeGraph::set_weights_alias() {
m_weights_name_aliases.clear();
// clang-format off
m_weights_name_aliases = {
@@ -22,7 +23,7 @@ void BaiChuanGraph::set_weights_alias() {
// clang-format on
}

void BaiChuanGraph::load_param(
void LlamaLikeGraph::load_param(
std::shared_ptr<InputFile> fin, LlmParams& param,
std::shared_ptr<Vocab> vocab) {
Header header;
@@ -50,7 +51,7 @@ void BaiChuanGraph::load_param(
fin->seek(header.tensor_offset);
}

void BaiChuanGraph::construct_llm() {
void LlamaLikeGraph::construct_llm() {
uint32_t embd = m_param.n_embd;
uint32_t ffn_size = m_param.n_mult;
uint32_t head = m_param.n_head;
5 changes: 2 additions & 3 deletions src/graph/baichuan.h → src/graph/llama_like.h
@@ -1,11 +1,10 @@
#pragma once

#include "llama.h"
#include "chatGLM.h"
#include "core/graph.h"

namespace inferllm {

class BaiChuanGraph : public Graph {
class LlamaLikeGraph : public Graph {
using Graph::Graph;

public: