Commit 85f902d

Merge 'origin/master' into hipblas

SlyEcho committed Jun 8, 2023
2 parents 4362e80 + b50b570

Showing 15 changed files with 500 additions and 258 deletions.
9 changes: 7 additions & 2 deletions CMakeLists.txt
@@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_K_QUANTS "llama: use k-quants" ON)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -227,6 +228,11 @@ if (LLAMA_METAL)
)
endif()

+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    add_compile_definitions(GGML_USE_K_QUANTS)
+endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -428,11 +434,10 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml.h
-ggml-quants-k.h
-ggml-quants-k.c
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
${GGML_SOURCES_METAL}
+${GGML_SOURCES_EXTRA}
)

target_include_directories(ggml PUBLIC .)
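
As a usage sketch (the commands below are assumed conventions, not part of the commit), the new LLAMA_K_QUANTS option defaults to ON and can be toggled at configure time:

    cmake -B build -DLLAMA_K_QUANTS=ON    # default; compiles k_quants.c and defines GGML_USE_K_QUANTS
    cmake --build build
    cmake -B build -DLLAMA_K_QUANTS=OFF   # opt out of k-quants
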
37 changes: 22 additions & 15 deletions Makefile
@@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
endif
endif

+ifndef LLAMA_NO_K_QUANTS
+CFLAGS += -DGGML_USE_K_QUANTS
+OBJS += k_quants.o
+endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with a negligible performance boost (as of prediction time).
@@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS
endif # LLAMA_OPENBLAS

ifdef LLAMA_BLIS
-CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS

@@ -230,6 +235,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

+ifndef LLAMA_NO_K_QUANTS
+k_quants.o: k_quants.c k_quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_NO_K_QUANTS

#
# Print build information
#
@@ -249,10 +259,7 @@ $(info )
# Build library
#

-ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
-	$(CC) $(CFLAGS) -c $< -o $@
-
-ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml.o: ggml.c ggml.h ggml-cuda.h
+	$(CC) $(CFLAGS) -c $< -o $@

llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
@@ -261,7 +268,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@

-libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
+libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

clean:
@@ -271,28 +278,28 @@ clean:
# Examples
#

-main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo

-quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -307,11 +314,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
# Tests
#

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@

-vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
+vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

.PHONY: tests clean
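
A quick usage sketch (commands are assumed, not from the commit): with the Makefile changes, k-quants are compiled in by default, and the new LLAMA_NO_K_QUANTS switch opts out:

    make                                      # builds and links k_quants.o, defines GGML_USE_K_QUANTS
    make clean && make LLAMA_NO_K_QUANTS=1    # build without k-quants
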
1 change: 1 addition & 0 deletions README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

+- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
3 changes: 3 additions & 0 deletions examples/common.cpp
@@ -132,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.path_prompt_cache = argv[i];
} else if (arg == "--prompt-cache-all") {
params.prompt_cache_all = true;
+} else if (arg == "--prompt-cache-ro") {
+    params.prompt_cache_ro = true;
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
@@ -432,6 +434,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stderr, " not supported with --interactive or other interactive options\n");
+fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
1 change: 1 addition & 0 deletions examples/common.h
@@ -62,6 +62,7 @@ struct gpt_params {
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache
+bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

bool embedding = false; // get only sentence embedding
bool interactive_first = false; // wait for user input immediately
4 changes: 2 additions & 2 deletions examples/main/main.cpp
@@ -417,7 +417,7 @@ int main(int argc, char ** argv) {
const bool penalize_nl = params.penalize_nl;

// optionally save the session on first sample (for faster prompt loading next time)
-if (!path_session.empty() && need_to_save_session) {
+if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
@@ -630,7 +630,7 @@
}
}

-if (!path_session.empty() && params.prompt_cache_all) {
+if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
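
A usage sketch for the new flag (model path and prompt are placeholders, not from the commit): the first run populates the prompt cache as before; with --prompt-cache-ro, later runs load the cache but skip both of the save paths guarded above:

    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache -p "Once upon a time"                      # writes the cache
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --prompt-cache-ro -p "Once upon a time"    # read-only reuse
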
30 changes: 24 additions & 6 deletions flake.lock

Some generated files are not rendered by default.

26 changes: 18 additions & 8 deletions flake.nix
@@ -6,6 +6,13 @@
outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
+        inherit (pkgs.stdenv) isAarch64 isDarwin;
+        inherit (pkgs.lib) optionals;
+        isM1 = isAarch64 && isDarwin;
+        osSpecific =
+          if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
+          else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
+          else [ ];
pkgs = import nixpkgs {
inherit system;
};
@@ -18,17 +25,22 @@
packages.default = pkgs.stdenv.mkDerivation {
name = "llama.cpp";
src = ./.;
+          postPatch =
+            if isM1 then ''
+              substituteInPlace ./ggml-metal.m \
+                --replace '[[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
+            '' else "";
nativeBuildInputs = with pkgs; [ cmake ];
-          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
+          buildInputs = osSpecific;
+          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-          ];
+            "-DLLAMA_METAL=ON"
+          ]);
installPhase = ''
mkdir -p $out/bin
mv bin/* $out/bin/
+            mv $out/bin/main $out/bin/llama
+            mv $out/bin/server $out/bin/llama-server
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
cat ${./convert.py} >> $out/bin/convert.py
@@ -40,9 +52,7 @@
packages = with pkgs; [
cmake
llama-python
-        ] ++ lib.optionals stdenv.isDarwin [
-          darwin.apple_sdk.frameworks.Accelerate
-        ];
+        ] ++ osSpecific;
};
}
);
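
A brief usage sketch (assuming a flakes-enabled Nix; the model path is a placeholder): the updated flake builds the server via -DLLAMA_BUILD_SERVER=ON and renames the installed binaries in installPhase, so the CLI entry point is llama rather than main:

    nix build
    ./result/bin/llama -m models/7B/ggml-model.bin -p "Hello"
    ./result/bin/llama-server    # installed from the new server build
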