alibaba · huangzhengxiang · Oct 29, 2024 · Oct 29, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/.gitignore b/.gitignore
@@ -360,3 +360,10 @@ pymnn_build/
 
 # mnncompress generated
 MNN_compression_pb2.py
+
+# model path
+model/
+
+# datasets
+datasets/*
+!datasets/*.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -20,7 +20,9 @@ endif()
 project(MNN VERSION ${MNN_VERSION} LANGUAGES C CXX ASM)
 # complier options
 set(CMAKE_C_STANDARD 99)
-set(CMAKE_CXX_STANDARD 11)
+# Default the C++ standard to 11, but leave it untouched when it is already 17
+# at this point (e.g. supplied by the caller before this file is processed).
+if(NOT CMAKE_CXX_STANDARD EQUAL 17)
+  set(CMAKE_CXX_STANDARD 11)
+endif()
 set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
   "${CMAKE_CURRENT_LIST_DIR}/cmake"
@@ -284,7 +286,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "^Android")
 endif()
 option(MNN_USE_CPP11 "Enable MNN use c++11" ON)
 if (NOT MSVC)
-    if(MNN_CUDA AND MNN_SUPPORT_TRANSFORMER_FUSE)
+    if((MNN_CUDA AND MNN_SUPPORT_TRANSFORMER_FUSE) OR (CMAKE_CXX_STANDARD EQUAL 17))
         set(CMAKE_CXX_STANDARD 17)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")

diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md
@@ -157,7 +157,7 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
   - visual_model: 当使用VL模型时，visual_model的实际路径为`base_dir + visual_model`，默认为`base_dir + 'visual.mnn'`
 - 推理配置
   - max_new_tokens: 生成时最大token数，默认为`512`
-  - reuse_kv: 多轮对话时是否复用之前对话的`kv cache`，默认为`false`
+  - reuse_kv: 多轮对话时是否复用之前对话的`kv cache`，默认为`false`，目前只有CPU后端支持设置为`true`。
   - quant_qkv: CPU attention 算子中`query, key, value`是否量化，可选为：`0, 1, 2, 3, 4`，默认为`0`，含义如下：
     - 0: key和value都不量化
     - 1: 使用非对称8bit量化存储key
@@ -173,6 +173,19 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
   - thread_num: CPU推理使用硬件线程数，默认为：`4`; OpenCL推理时使用`68`
   - precision: 推理使用精度策略，默认为：`"low"`，尽量使用`fp16`
   - memory: 推理使用内存策略，默认为：`"low"`，开启运行时量化
+- Sampler配置
+  - sampler_type: 使用的sampler种类，目前支持`greedy`, `temperature`, `topK`, `topP`, `minP`, `tfs`, `typical`, `penalty`8种基本sampler，外加`mixed`(混合sampler)。当选择`mixed`时，依次执行mixed_samplers中的sampler。默认为`mixed`。
+  - mixed_samplers: 当`sampler_type`为`mixed`时有效，默认为`["topK", "tfs", "typical", "topP", "min_p", "temperature"]`
+  - temperature: `temperature`, `topP`, `minP`, `tfsZ`, `typical`中temperature值，默认为1.0
+  - topK: `topK`中top K 个的个数，默认为40
+  - topP: `topP`中top P的值，默认为0.9
+  - minP: `minP`中min P的值，默认为0.1
+  - tfsZ: `tfs`中Z的值，默认为1.0，即不使用tfs算法
+  - typical: `typical`中p的值，默认为1.0，即不使用typical算法
+  - penalty: `penalty`中对于logits的惩罚项，默认为0.0，即不惩罚
+  - n_gram: `penalty`中最大存储的ngram大小，默认为8
+  - ngram_factor: `penalty`中对于重复ngram的额外惩罚，默认为1.0，即没有额外惩罚
+  - penalty_sampler: `penalty`中最后一步采用的sampling策略，可选"greedy"或"temperature"，默认greedy.
 
 ##### 配置文件示例
 - `config.json`
@@ -184,7 +197,15 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
       "backend_type": "cpu",
       "thread_num": 4,
       "precision": "low",
-      "memory": "low"
+      "memory": "low",
+      "sampler_type": "mixed",
+      "mixed_samplers": ["topK", "tfs", "typical", "topP", "min_p", "temperature"],
+      "temperature": 1.0,
+      "topK": 40,
+      "topP": 0.9,
+      "tfsZ": 1.0,
+      "minP": 0.1,
+      "reuse_kv": true
   }
   ```
 - `llm_config.json`
@@ -208,6 +229,7 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
 
 #### 推理用法
 `llm_demo`的用法如下：
+PC端直接推理：
 ```
 # 使用config.json
 ## 交互式聊天
@@ -222,6 +244,13 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
 ./llm_demo model_dir/llm.mnn prompt.txt
 ```
 
+手机端adb推理用法：
+```bash
+# 利用adb push将链接库push到手机上
+adb shell mkdir /data/local/tmp/llm
+adb push llm_demo ppl_demo libllm.so libMNN_CL.so libMNN_Express.so libMNN.so tools/cv/libMNNOpenCV.so /data/local/tmp/llm
+```
+
 #### GPTQ权重加载
 - 使用脚本生成GPTQ模型权重，用法参考: [apply_gptq.py](../tools/script.html#apply-gptq-py)
 - 创建`gptq.json`配置文件

diff --git a/source/backend/cpu/CPUAttention.cpp b/source/backend/cpu/CPUAttention.cpp
@@ -25,7 +25,7 @@
 #endif
 
 // reduce the value of 'query' to 'query * FP16_QSCALE', avoid fp16 overflow
-#define FP16_QSCALE 0.5
+#define FP16_QSCALE 0.25
 
 namespace MNN {
 

diff --git a/transformers/llm/.gitignore b/transformers/llm/.gitignore
@@ -0,0 +1,7 @@
+datasets/*
+!datasets/*.sh
+
+
+!datasets/visualization/
+datasets/visualization/data
+datasets/visualization/pic
diff --git a/transformers/llm/datasets/get-sharegpt.sh b/transformers/llm/datasets/get-sharegpt.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+# Clone the ShareGPT-Chinese-English-90k dataset repository into the current directory.
+# Stop at the first failing command so the clone is not attempted when
+# `git lfs install` fails.
+set -e
+git lfs install
+git clone https://huggingface.co/datasets/shareAI/ShareGPT-Chinese-English-90k
diff --git a/transformers/llm/datasets/get-wikitext-2-raw.sh b/transformers/llm/datasets/get-wikitext-2-raw.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+# Download the wikitext-2-raw-v1 archive and unpack it in the current directory.
+# Stop at the first failing command so unzip never runs after a failed download.
+set -e
+wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+unzip wikitext-2-raw-v1.zip
diff --git a/transformers/llm/datasets/visualization/stats.py b/transformers/llm/datasets/visualization/stats.py
@@ -0,0 +1,116 @@
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from matplotlib import cbook
+from matplotlib.axes import Axes
+import pandas as pd
+import numpy as np
+import argparse
+import os
+
+vis_root = "pic"
+
+def remove_blanks(df: pd.DataFrame) -> pd.DataFrame:
+    # Removing unnamed columns using drop function
+    df.drop(df.columns[df.columns.str.contains(
+        'unnamed', case=False)], axis=1, inplace=True)
+    return df
+def add_turns(df: pd.DataFrame) -> pd.DataFrame:
+    df["turns"] = (1-df.isnull()).sum(axis=1) // 2
+    return df
+def get_max_turn(df: pd.DataFrame) -> int:
+    keys = list(df.keys())
+    return max([int(key.replace("decode", "")) for key in keys if "decode" in key]) + 1
+def add_pd_ratio(df: pd.DataFrame) -> pd.DataFrame:
+    max_turns = get_max_turn(df)
+    for i in range(max_turns):
+        df["pd_ratio{}".format(i)] = df["prefill{}".format(i)] / df["decode{}".format(i)]
+    return df 
+def preprocess(file_path: str) -> pd.DataFrame:
+    table = pd.read_csv(file_path)
+    table = remove_blanks(table)
+    table = add_turns(table)
+    table = add_pd_ratio(table)
+    print(table)
+    return table
+
+def draw_distribution(df: pd.DataFrame, file_path: str):
+    turns_bin = df.value_counts(subset=["turns"], sort=False)
+    print(turns_bin)
+    plt.close()
+    plt.rcParams['font.size'] = 10
+    _, ax = plt.subplots()
+    # N is the count in each bin, bins is the lower-limit of the bin
+    N, bins, patches = ax.hist(df["turns"], bins=get_max_turn(df), density=True, align="left", label=True)
+    # We'll color code by height, but you could use any scalar
+    fracs = N / N.max()
+    # we need to normalize the data to 0..1 for the full range of the colormap
+    norm = colors.Normalize(fracs.min(), fracs.max())
+    # Now, we'll loop through our objects and set the color of each accordingly
+    for thisfrac, thispatch in zip(fracs, patches):
+        color = plt.cm.viridis(norm(thisfrac))
+        thispatch.set_facecolor(color)
+    # Now we format the y-axis to display percentage
+    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
+    ax.set_xlim((0.5, get_max_turn(df)-0.5))
+    ax.set_xticks(np.arange(1,get_max_turn(df)+1),np.arange(1,get_max_turn(df)+1),rotation=60, fontsize=9)
+    ax.set_ylabel("frequency", fontsize=14)
+    ax.set_xlabel("num of turns", fontsize=14)
+    plt.savefig(file_path, dpi=600)
+    plt.close()
+
+def draw_prefill(df: pd.DataFrame, ax: Axes):
+    stats = [cbook.boxplot_stats(df[df["prefill{}".format(i)].notna()]["prefill{}".format(i)], labels=[i+1])[0]
+                 for i in range(get_max_turn(df))]
+    print(stats)
+    ax.bxp(stats, patch_artist=True, boxprops={'facecolor': 'bisque'}, flierprops=dict(marker='o', markersize=2))
+    ax.set_ylim(0,600)
+    ax.set_yticks(np.arange(0,700,100), np.arange(0,700,100), fontsize=9)
+    ax.set_ylabel("prefill", fontsize=12, rotation=90)
+    return
+def draw_decode(df: pd.DataFrame, ax: Axes):
+    stats = [cbook.boxplot_stats(df[df["decode{}".format(i)].notna()]["decode{}".format(i)], labels=[i+1])[0]
+                 for i in range(get_max_turn(df))]
+    print(stats)
+    ax.bxp(stats, patch_artist=True, boxprops={'facecolor': 'bisque'}, flierprops=dict(marker='o', markersize=2))
+    ax.set_ylim(0,600)
+    ax.set_yticks(np.arange(0,700,100), np.arange(0,700,100), fontsize=9)
+    ax.set_ylabel("decode", fontsize=12, rotation=90)
+    return
+def draw_pd_ratio(df: pd.DataFrame, ax: Axes):
+    stats = [cbook.boxplot_stats(df[df["pd_ratio{}".format(i)].notna()]["pd_ratio{}".format(i)], labels=[i+1])[0]
+                 for i in range(get_max_turn(df))]
+    print(stats)
+    ax.bxp(stats, patch_artist=True, boxprops={'facecolor': 'bisque'}, flierprops=dict(marker='o', markersize=2))
+    ax.plot(np.arange(0,get_max_turn(df)+2), np.ones_like(np.arange(0,get_max_turn(df)+2),dtype=float))
+    ax.set_xlim(0, get_max_turn(df)+1)
+    ax.set_ylim(0, 2.)
+    ax.set_xticks(np.arange(1,get_max_turn(df)), np.arange(1,get_max_turn(df)), rotation=60, fontsize=9)
+    ax.set_yticks([0,0.5,1,2], [0,0.5,1,2], fontsize=9)
+    ax.set_xlabel("round", fontsize=12)
+    ax.set_ylabel("prefill/decode", fontsize=12, rotation=90)
+    return
+def draw_reuse_kv(df: pd.DataFrame, file_path: str):
+    plt.close()
+    _, axs = plt.subplots(3,1,sharex="col")
+    draw_prefill(df, axs[0])
+    draw_decode(df, axs[1])
+    draw_pd_ratio(df, axs[2])
+    plt.savefig(file_path, dpi=1200)
+    plt.close()
+    return
+def draw_no_reuse_kv():
+    return
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--root", type=str, default="./data")
+    parser.add_argument("--name", type=str, default="shareGPT_dialog_stats_common_en.csv")
+    args = parser.parse_args()
+
+    file_path = os.path.join(args.root, args.name)
+    dist_path = os.path.join(vis_root, args.name.split('.')[0]+"_dist.png")
+    pd_dist_path = os.path.join(vis_root, args.name.split('.')[0]+"_pd_dist.png")
+    table = preprocess(file_path)
+    draw_distribution(table, dist_path)
+    draw_reuse_kv(table, pd_dist_path)
diff --git a/transformers/llm/datasets/visualization/time.py b/transformers/llm/datasets/visualization/time.py
@@ -0,0 +1,83 @@
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from matplotlib import cbook
+from matplotlib.axes import Axes
+from typing import List, Dict, Tuple
+import pandas as pd
+import numpy as np
+import argparse
+import os
+import re
+from io import StringIO
+
+def split_by_turns(id: str, content: str) -> List[pd.DataFrame]:
+    pattern = "<{id}>\n(.*?)</{id}>\n".format(id=id)
+    return [pd.read_csv(StringIO(item)) for item in re.findall(pattern, content, flags=re.DOTALL)]
+def preprocess(file_path: str) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
+    content = open(file_path, "rt").read()
+    return split_by_turns("prefill", content), split_by_turns("decode", content)
+def get_max_turn(no_reuse_prefill_record):
+    return max(10, max([len(record) for record in no_reuse_prefill_record]))
+def draw_history_len(ax: Axes, no_reuse_prefill_record:  List[pd.DataFrame]):
+    max_round = get_max_turn(no_reuse_prefill_record)
+    history_len = [0 for _ in range(0, max_round)]
+    for turn in range(0, max_round):
+        history_len[turn] = np.median([record["input_token"][turn] - record["prompt_token"][turn]
+                                     for record in no_reuse_prefill_record if len(record)>=turn+1]).item()
+    plt.plot(np.arange(1, max_round+1), history_len, label="median history len", marker=".", markersize=8)
+    return
+def draw_prefill_bar_chat(ax: Axes, no_reuse, reuse):
+    offset = 0.2
+    max_round = len(no_reuse)
+    no_reuse_med = [np.median(turn) for turn in no_reuse]
+    rects = ax.bar(np.arange(1,max_round+1) + offset, no_reuse_med, offset*2, label="no reuse kv", color="tomato")
+    ax.bar_label(rects, fmt="{:.2f}", padding=4, fontsize=6)
+    reuse_med = [np.median(turn) for turn in reuse]
+    rects = ax.bar(np.arange(1,max_round+1) - offset, reuse_med, offset*2, label="reuse kv", color="springgreen")
+    ax.bar_label(rects, fmt="{:.2f}", padding=4, fontsize=6)
+    return
+def compare_prefill_reuse_kv(no_reuse_prefill_record: List[pd.DataFrame],
+                             reuse_prefill_record: List[pd.DataFrame]):
+    plt.close()
+    _,ax1 = plt.subplots()
+    ax2 = ax1.twinx()
+    # plot history_len
+    draw_history_len(ax2, no_reuse_prefill_record)
+    # calculate per turn 
+    max_round = get_max_turn(no_reuse_prefill_record)
+    no_reuse = [[] for _ in range(0, max_round)]
+    for turn in range(0, max_round):
+        no_reuse[turn] = [record["response_speed"][turn] for record in no_reuse_prefill_record if len(record)>=turn+1]
+    reuse = [[] for _ in range(0, max_round)]
+    for turn in range(0, max_round):
+        reuse[turn] = [record["response_speed"][turn] for record in reuse_prefill_record if len(record)>=turn+1]
+    # plot the bar chat (with error bar)
+    draw_prefill_bar_chat(ax1, no_reuse, reuse)
+    ax1.set_xticks(np.arange(1,max_round+1),np.arange(1,max_round+1),fontsize=9)
+    ax1.set_ylim(0,100)
+    ax2.set_ylim(0,1000)
+    ax1.legend(loc='upper left', title="prefill response speed")
+    ax2.legend(loc='upper right')
+    ax1.set_ylabel("prefill\nresponse\nspeed", rotation=0, labelpad=12)
+    ax2.set_ylabel("history\nlen", rotation=0, labelpad=8)
+    ax1.set_xlabel("round")
+    plt.title("KV cache reuse for multi-turn chat\neffects on ShareGPT")
+    plt.tight_layout() 
+    plt.savefig("./pic/fig.png",dpi=1200)
+    plt.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--root", type=str, default="./data")
+    parser.add_argument("--no_reuse", type=str, default="shareGPT_common_en_70k_noreuse.txt")
+    parser.add_argument("--reuse", type=str, default="shareGPT_common_en_70k_reuse.txt")
+    args = parser.parse_args()
+
+    no_reuse_file_path = os.path.join(args.root, args.no_reuse)
+    reuse_file_path = os.path.join(args.root, args.reuse)
+    no_reuse_prefill_record, no_reuse_decode_record = preprocess(no_reuse_file_path)
+    reuse_prefill_record, reuse_decode_record = preprocess(reuse_file_path)
+    # visualize prefill
+    compare_prefill_reuse_kv(no_reuse_prefill_record, reuse_prefill_record)
diff --git a/transformers/llm/engine/CMakeLists.txt b/transformers/llm/engine/CMakeLists.txt
@@ -25,9 +25,12 @@ else()
     add_library(llm OBJECT ${SRCS})
 endif()
 
-add_executable(llm_demo ${CMAKE_CURRENT_LIST_DIR}/llm_demo.cpp)
+# Demo executables; their sources live in the app/ subdirectory.
+add_executable(llm_demo ${CMAKE_CURRENT_LIST_DIR}/app/llm_demo.cpp)
+add_executable(ppl_demo ${CMAKE_CURRENT_LIST_DIR}/app/ppl_demo.cpp)
+# Both demos always link ${MNN_DEPS}; when MNN_SEP_BUILD is set they also link the llm target.
 IF (NOT MNN_SEP_BUILD)
     target_link_libraries(llm_demo ${MNN_DEPS})
+    target_link_libraries(ppl_demo ${MNN_DEPS})
 ELSE ()
     target_link_libraries(llm_demo ${MNN_DEPS} llm)
+    target_link_libraries(ppl_demo ${MNN_DEPS} llm)
 ENDIF ()