llama: Improve error handling
Check for NULL return values from llama.cpp in more places and
convert them into Go errors, which should make debugging easier
in the future, rather than leaving hidden surprises in our data
structures.
jessegross committed Nov 2, 2024
1 parent a103dae commit 312d9de
Showing 4 changed files with 98 additions and 40 deletions.
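The same pattern recurs through every file below: a cgo call that can return NULL gets an explicit nil check at the boundary, and the NULL becomes a Go error rather than a pointer quietly stored in a struct. A minimal, self-contained sketch of that shape (make_thing and Thing are illustrative stand-ins, not llama.cpp or ollama APIs):

package main

/*
#include <stdlib.h>
// Stand-in for a llama.cpp constructor that returns NULL on failure.
static void *make_thing(void) { return malloc(16); }
*/
import "C"

import (
	"errors"
	"fmt"
	"unsafe"
)

type Thing struct{ c unsafe.Pointer }

// NewThing converts a NULL from C into a Go error at the boundary, so a
// nil pointer never hides inside the returned struct.
func NewThing() (*Thing, error) {
	t := Thing{c: C.make_thing()}
	if t.c == nil {
		return nil, errors.New("unable to allocate thing")
	}
	return &t, nil
}

func main() {
	t, err := NewThing()
	if err != nil {
		fmt.Println("init failed:", err)
		return
	}
	defer C.free(t.c)
	fmt.Println("allocated")
}

The diff applies this shape at each constructor below: LoadModelFromFile, NewContextWithModel, NewBatch, the clip and mllama loaders, and NewSamplingContext.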
59 changes: 47 additions & 12 deletions llama/llama.go
@@ -88,6 +88,7 @@ import (
"fmt"
"runtime"
"runtime/cgo"
+ "slices"
"strings"
"unsafe"
)
@@ -260,7 +261,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
}

m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
- if m.c == (*C.struct_llama_model)(C.NULL) {
+ if m.c == nil {
return nil, fmt.Errorf("unable to load model: %s", modelPath)
}

@@ -276,7 +277,7 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
c: C.llama_new_context_with_model(model.c, params.c),
numThreads: int(params.c.n_threads),
}
- if c.c == (*C.struct_llama_context)(C.NULL) {
+ if c.c == nil {
return nil, errors.New("unable to create llama context")
}

@@ -300,6 +301,9 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
defer C.free(unsafe.Pointer(cLoraPath))

loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
+ if loraAdapter == nil {
+ return errors.New("unable to load lora")
+ }

err := -1
if loraAdapter != nil {
@@ -322,13 +326,25 @@ type Batch struct {
// Creates a new batch for either word tokens or image embeddings (if embedSize is non-zero).
// Batches cannot contain both types at the same time. batchSize is the maximum number of entries
// that can be added per sequence
- func NewBatch(batchSize int, maxSeq int, embedSize int) *Batch {
- return &Batch{
+ func NewBatch(batchSize int, maxSeq int, embedSize int) (*Batch, error) {
+ b := Batch{
c: C.llama_batch_init(C.int(batchSize*maxSeq), C.int(embedSize), C.int(maxSeq)),
batchSize: batchSize,
maxSeq: maxSeq,
embedSize: embedSize,
}

+ // Check to see if any of the allocations in llama_batch_init() failed
+ nilPointer := (embedSize == 0 && b.c.token == nil) || (embedSize != 0 && b.c.embd == nil) ||
+ b.c.pos == nil || b.c.n_seq_id == nil || b.c.seq_id == nil || b.c.logits == nil ||
+ slices.Contains(unsafe.Slice(b.c.seq_id, b.allocSize()), nil)
+
+ if nilPointer {
+ C.llama_batch_free(b.c)
+ return nil, fmt.Errorf("unable to allocate batch (batchSize=%v maxSeq=%v embedSize=%v)", batchSize, maxSeq, embedSize)
+ }
+
+ return &b, nil
}

func (b *Batch) Size() int {
@@ -484,6 +500,9 @@ func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, erro
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
c := C.clip_model_load(mp, 1)
+ if c == nil {
+ return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
+ }

projEmbedSize := int(C.clip_n_mmproj_embd(c))
modelEmbedSize := llamaContext.Model().NEmbd()
@@ -498,8 +517,11 @@ func (c *ClipContext) Free() {
C.clip_free(c.c)
}

- func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {
+ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
+ if l == nil {
+ return nil, errors.New("unable to make llava embedding from image")
+ }

numTokens := int(l.n_image_pos)
numEmbed := llamaContext.Model().NEmbd()
@@ -516,7 +538,7 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {

C.llava_image_embed_free(l)

- return embed
+ return embed, nil
}

type MllamaContext struct {
@@ -527,6 +549,9 @@ func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext,
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
c := C.mllama_model_load(mp, 1)
+ if c == nil {
+ return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+ }

projEmbedSize := int(C.mllama_n_embd(c))
modelEmbedSize := llamaContext.Model().NEmbd()
@@ -541,19 +566,25 @@ func (m *MllamaContext) Free() {
C.mllama_free(m.c)
}

- func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) [][]float32 {
+ func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
img := C.mllama_image_init()
defer C.mllama_image_free(img)

- C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)
+ ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+ if !ok {
+ return nil, errors.New("unable to load mllama image data")
+ }

rows := make([]float32, m.EmbedSize(llamaContext))
- C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))
+ ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+ if !ok {
+ return nil, errors.New("unable to make mllama embedding from image")
+ }

embed := make([][]float32, 1)
embed[0] = rows

- return embed
+ return embed, nil
}

func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
@@ -592,7 +623,7 @@ type SamplingParams struct {
Grammar string
}

- func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
+ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
var cparams C.struct_gpt_sampler_cparams
cparams.top_k = C.int32_t(params.TopK)
cparams.top_p = C.float(params.TopP)
@@ -615,9 +646,13 @@ func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {

cparams.grammar = grammar
context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
+ if context.c == nil {
+ return nil, errors.New("unable to create sampling context")
+ }

runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })

- return context
+ return context, nil
}

func (s *SamplingContext) Reset() {
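The most involved of the new checks is in NewBatch above: llama_batch_init performs several allocations, including a table of per-entry seq_id pointers, and each can fail independently, which is why the slices import appears. A small plain-Go illustration of the unsafe.Slice plus slices.Contains idiom used there (no cgo involved; the array is a stand-in for the C-owned seq_id table, and the length is assumed to match the allocated count):

package main

import (
	"fmt"
	"slices"
	"unsafe"
)

func main() {
	// Pretend this is a C-allocated array of pointers in which one
	// allocation failed and the slot was left NULL.
	var a, c int
	arr := [3]*int{&a, nil, &c}

	// unsafe.Slice views a pointer plus length as a Go slice without
	// copying, the way NewBatch views llama_batch_init's seq_id table.
	view := unsafe.Slice(&arr[0], len(arr))

	// slices.Contains then spots any NULL entry in a single call.
	fmt.Println("allocation failed:", slices.Contains(view, nil)) // true
}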
18 changes: 12 additions & 6 deletions llama/runner/image.go
@@ -63,9 +63,9 @@ func (c *ImageContext) Free(modelPath string) {
}
}

- func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) [][]float32 {
+ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
if c == nil {
- return nil
+ return nil, nil
}

hash := c.hashImage(data)
@@ -76,17 +76,23 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
embed, err := c.findImage(hash)
if err != nil {
if c.mllama != nil {
- embed = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+ embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+ if err != nil {
+ return nil, err
+ }
} else if c.clip != nil {
- embed = c.clip.NewEmbed(llamaContext, data)
+ embed, err = c.clip.NewEmbed(llamaContext, data)
+ if err != nil {
+ return nil, err
+ }
} else {
- return nil
+ return nil, errors.New("received image but vision model not loaded")
}

c.addImage(hash, embed)
}

- return embed
+ return embed, nil
}

func (c *ImageContext) BatchSize(configuredBatchSize int) int {
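One subtlety in this file: NewEmbed still returns (nil, nil), meaning no embedding and no error, when the *ImageContext itself is nil, so servers running text-only models need no special-casing at call sites. This relies on Go's willingness to invoke a method on a nil receiver. A self-contained sketch of the idiom (Cache is a made-up type, not part of the runner):

package main

import "fmt"

type Cache struct{ m map[string][]float32 }

// Get is safe to call on a nil *Cache: the method body still runs and
// simply reports a miss, mirroring how a nil ImageContext yields no
// embedding and no error.
func (c *Cache) Get(key string) ([]float32, bool) {
	if c == nil {
		return nil, false
	}
	v, ok := c.m[key]
	return v, ok
}

func main() {
	var c *Cache // vision support never configured
	v, ok := c.Get("image1")
	fmt.Println(v, ok) // [] false, and no panic
}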
21 changes: 17 additions & 4 deletions llama/runner/runner.go
@@ -131,7 +131,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen

var sc *llama.SamplingContext
if params.samplingParams != nil {
- sc = llama.NewSamplingContext(s.model, *params.samplingParams)
+ sc, err = llama.NewSamplingContext(s.model, *params.samplingParams)
+ if err != nil {
+ return nil, err
+ }
for _, input := range inputs {
if input.embed == nil {
sc.Accept(input.token, false)
@@ -194,7 +197,11 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
return nil, fmt.Errorf("invalid image index: %d", n)
}

- embed := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+ embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+ if err != nil {
+ return nil, err
+ }
+
for _, e := range embed {
inputs = append(inputs, input{embed: e})
}
@@ -305,13 +312,19 @@ func (s *Server) run(ctx context.Context) {

// Logically these batches are used only within the context of processBatch
// but it is better for performance to allocate them once here
- tokenBatch := llama.NewBatch(s.batchSize, len(s.seqs), 0)
+ tokenBatch, err := llama.NewBatch(s.batchSize, len(s.seqs), 0)
+ if err != nil {
+ panic(err)
+ }
defer tokenBatch.Free()

var embedBatch *llama.Batch
embedBatchSize := s.image.BatchSize(s.batchSize)
if embedBatchSize != 0 {
- embedBatch = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
+ embedBatch, err = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
+ if err != nil {
+ panic(err)
+ }
defer embedBatch.Free()
} else {
embedBatch = &llama.Batch{}
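Note the split in error strategy in this file: NewSequence and inputs propagate errors back to the requester, while run panics, because the batches are allocated once at startup before any request exists to report to. A sketch of that trade-off against the new two-value API (newBatch and must are illustrative helpers, not the runner's actual code):

package main

import (
	"errors"
	"fmt"
)

type batch struct{ size int }

// newBatch stands in for llama.NewBatch, which can now fail.
func newBatch(size int) (*batch, error) {
	if size <= 0 {
		return nil, errors.New("unable to allocate batch")
	}
	return &batch{size: size}, nil
}

// must turns an unrecoverable startup error into a panic, the same
// choice run() makes for its long-lived token and embed batches.
func must[T any](v T, err error) T {
	if err != nil {
		panic(err)
	}
	return v
}

func main() {
	tokenBatch := must(newBatch(512))
	fmt.Println("token batch ready, size", tokenBatch.size)
}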
40 changes: 22 additions & 18 deletions llama/sampling_ext.cpp
@@ -5,24 +5,28 @@
struct gpt_sampler *gpt_sampler_cinit(
const struct llama_model *model, struct gpt_sampler_cparams *params)
{
- gpt_sampler_params sparams;
- sparams.top_k = params->top_k;
- sparams.top_p = params->top_p;
- sparams.min_p = params->min_p;
- sparams.tfs_z = params->tfs_z;
- sparams.typ_p = params->typical_p;
- sparams.temp = params->temp;
- sparams.penalty_last_n = params->penalty_last_n;
- sparams.penalty_repeat = params->penalty_repeat;
- sparams.penalty_freq = params->penalty_freq;
- sparams.penalty_present = params->penalty_present;
- sparams.mirostat = params->mirostat;
- sparams.mirostat_tau = params->mirostat_tau;
- sparams.mirostat_eta = params->mirostat_eta;
- sparams.penalize_nl = params->penalize_nl;
- sparams.seed = params->seed;
- sparams.grammar = params->grammar;
- return gpt_sampler_init(model, sparams);
+ try {
+ gpt_sampler_params sparams;
+ sparams.top_k = params->top_k;
+ sparams.top_p = params->top_p;
+ sparams.min_p = params->min_p;
+ sparams.tfs_z = params->tfs_z;
+ sparams.typ_p = params->typical_p;
+ sparams.temp = params->temp;
+ sparams.penalty_last_n = params->penalty_last_n;
+ sparams.penalty_repeat = params->penalty_repeat;
+ sparams.penalty_freq = params->penalty_freq;
+ sparams.penalty_present = params->penalty_present;
+ sparams.mirostat = params->mirostat;
+ sparams.mirostat_tau = params->mirostat_tau;
+ sparams.mirostat_eta = params->mirostat_eta;
+ sparams.penalize_nl = params->penalize_nl;
+ sparams.seed = params->seed;
+ sparams.grammar = params->grammar;
+ return gpt_sampler_init(model, sparams);
+ } catch (const std::exception & err) {
+ return nullptr;
+ }
}

void gpt_sampler_cfree(struct gpt_sampler *sampler)
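The try/catch exists because gpt_sampler_init can throw (an invalid grammar string is the likely trigger), and a C++ exception must never unwind across the cgo boundary into Go, where it would take down the process. Catching it and returning nullptr reduces the failure to the one signal cgo can observe, which the new nil check in NewSamplingContext converts into an ordinary Go error.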
