@@ -402,7 +402,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
402
402
s .cmd .Env = append (s .cmd .Env , visibleDevicesEnv + "=" + visibleDevicesEnvVal )
403
403
}
404
404
405
- slog .Info ("starting llama server" , "cmd" , s .cmd . String () )
405
+ slog .Info ("starting llama server" , "cmd" , s .cmd )
406
406
if envconfig .Debug () {
407
407
filteredEnv := []string {}
408
408
for _ , ev := range s .cmd .Env {
@@ -470,7 +470,7 @@ const ( // iota is reset to 0
470
470
ServerStatusError
471
471
)
472
472
473
- func (s ServerStatus ) ToString () string {
473
+ func (s ServerStatus ) String () string {
474
474
switch s {
475
475
case ServerStatusReady :
476
476
return "llm server ready"
@@ -485,12 +485,9 @@ func (s ServerStatus) ToString() string {
485
485
}
486
486
}
487
487
488
- type ServerStatusResp struct {
489
- Status string `json:"status"`
490
- SlotsIdle int `json:"slots_idle"`
491
- SlotsProcessing int `json:"slots_processing"`
492
- Error string `json:"error"`
493
- Progress float32 `json:"progress"`
488
// ServerStatusResponse is the JSON document that getServerStatus decodes from
// the body of the health request.
//
// Status is decoded directly into a ServerStatus value and is what
// getServerStatus switches on; any value other than ServerStatusLoadingModel,
// ServerStatusReady or ServerStatusNoSlotsAvailable is returned together with
// a "server error" error.
// Progress is copied into llmServer.loadProgress while Status is
// ServerStatusLoadingModel.
+ type ServerStatusResponse struct {
489
+ Status ServerStatus `json:"status"`
490
+ Progress float32 `json:"progress"`
494
491
}
495
492
496
493
func (s * llmServer ) getServerStatus (ctx context.Context ) (ServerStatus , error ) {
@@ -502,7 +499,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
502
499
}
503
500
if s .cmd .ProcessState .ExitCode () == - 1 {
504
501
// Most likely a signal killed it, log some more details to try to help troubleshoot
505
- slog .Warn ("llama runner process no longer running" , "sys" , s .cmd .ProcessState .Sys (), "string" , s .cmd .ProcessState . String () )
502
+ slog .Warn ("llama runner process no longer running" , "sys" , s .cmd .ProcessState .Sys (), "string" , s .cmd .ProcessState )
506
503
}
507
504
return ServerStatusError , fmt .Errorf ("llama runner process no longer running: %d %s" , s .cmd .ProcessState .ExitCode (), msg )
508
505
}
@@ -527,21 +524,19 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
527
524
return ServerStatusError , fmt .Errorf ("read health request: %w" , err )
528
525
}
529
526
530
- var status ServerStatusResp
531
- if err := json .Unmarshal (body , & status ); err != nil {
527
+ var ssr ServerStatusResponse
528
+ if err := json .Unmarshal (body , & ssr ); err != nil {
532
529
return ServerStatusError , fmt .Errorf ("health unmarshal encode response: %w" , err )
533
530
}
534
531
535
- switch status .Status {
536
- case "ok" :
537
- return ServerStatusReady , nil
538
- case "no slot available" :
539
- return ServerStatusNoSlotsAvailable , nil
540
- case "loading model" :
541
- s .loadProgress = status .Progress
542
- return ServerStatusLoadingModel , nil
532
+ switch ssr .Status {
533
+ case ServerStatusLoadingModel :
534
+ s .loadProgress = ssr .Progress
535
+ return ssr .Status , nil
536
+ case ServerStatusReady , ServerStatusNoSlotsAvailable :
537
+ return ssr .Status , nil
543
538
default :
544
- return ServerStatusError , fmt .Errorf ("server error: %+v" , status )
539
+ return ssr . Status , fmt .Errorf ("server error: %+v" , ssr )
545
540
}
546
541
}
547
542
@@ -616,7 +611,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
616
611
status , _ := s .getServerStatus (ctx )
617
612
if lastStatus != status && status != ServerStatusReady {
618
613
// Only log on status changes
619
- slog .Info ("waiting for server to become available" , "status" , status . ToString () )
614
+ slog .Info ("waiting for server to become available" , "status" , status )
620
615
}
621
616
switch status {
622
617
case ServerStatusReady :
@@ -630,7 +625,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
630
625
slog .Debug (fmt .Sprintf ("model load progress %0.2f" , s .loadProgress ))
631
626
stallTimer = time .Now ().Add (stallDuration )
632
627
} else if ! fullyLoaded && int (s .loadProgress * 100.0 ) >= 100 {
633
- slog .Debug ("model load completed, waiting for server to become available" , "status" , status . ToString () )
628
+ slog .Debug ("model load completed, waiting for server to become available" , "status" , status )
634
629
stallTimer = time .Now ().Add (stallDuration )
635
630
fullyLoaded = true
636
631
}
@@ -671,71 +666,34 @@ type ImageData struct {
671
666
AspectRatioID int `json:"aspect_ratio_id"`
672
667
}
673
668
674
- type completion struct {
675
- Content string `json:"content"`
676
- Model string `json:"model"`
677
- Prompt string `json:"prompt"`
678
- Stop bool `json:"stop"`
679
- StoppedLimit bool `json:"stopped_limit"`
680
-
681
- Timings struct {
682
- PredictedN int `json:"predicted_n"`
683
- PredictedMS float64 `json:"predicted_ms"`
684
- PromptN int `json:"prompt_n"`
685
- PromptMS float64 `json:"prompt_ms"`
686
- }
687
- }
688
-
689
669
// CompletionRequest carries the inputs of llmServer.Completion. Completion
// JSON-encodes the whole struct (with HTML escaping disabled) as the request
// payload; the fields have no json tags, so they are encoded under their Go
// field names.
//
// Format is optional. Completion treats `null` and `""` as "not set", maps
// `"json"` to the built-in JSON grammar, and otherwise requires the value to
// start with '{' (a JSON Schema object), rejecting anything else as invalid.
// Options may be nil; Completion substitutes api.DefaultOptions() in that case.
// Grammar is not meant to be supplied by callers: Completion overwrites it
// from Format when Format selects a grammar.
type CompletionRequest struct {
690
670
Prompt string
691
671
Format json.RawMessage
692
672
Images []ImageData
693
673
Options * api.Options
674
+
675
+ Grammar string // set before sending the request to the subprocess
694
676
}
695
677
696
678
// CompletionResponse is a single completion event. Completion unmarshals each
// event it reads straight into this struct (hence the json tags) and hands
// values of this type to the caller's callback; when Done is true it passes
// the decoded value to the callback unchanged and returns.
//
// NOTE(review): the duration fields are plain time.Duration values with no
// custom (un)marshalling visible here, so encoding/json reads and writes them
// as integer nanoseconds — confirm the producer emits nanoseconds.
// NOTE(review): the set of DoneReason values is decided by the producer and is
// not visible in this chunk — verify against the runner.
type CompletionResponse struct {
697
- Content string
698
- DoneReason string
699
- Done bool
700
- PromptEvalCount int
701
- PromptEvalDuration time.Duration
702
- EvalCount int
703
- EvalDuration time.Duration
679
+ Content string `json:"content"`
680
+ DoneReason string `json:"done_reason"`
681
+ Done bool `json:"done"`
682
+ PromptEvalCount int `json:"prompt_eval_count"`
683
+ PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
684
+ EvalCount int `json:"eval_count"`
685
+ EvalDuration time.Duration `json:"eval_duration"`
704
686
}
705
687
706
688
func (s * llmServer ) Completion (ctx context.Context , req CompletionRequest , fn func (CompletionResponse )) error {
707
- request := map [string ]any {
708
- "prompt" : req .Prompt ,
709
- "stream" : true ,
710
- "n_predict" : req .Options .NumPredict ,
711
- "n_keep" : req .Options .NumKeep ,
712
- "main_gpu" : req .Options .MainGPU ,
713
- "temperature" : req .Options .Temperature ,
714
- "top_k" : req .Options .TopK ,
715
- "top_p" : req .Options .TopP ,
716
- "min_p" : req .Options .MinP ,
717
- "typical_p" : req .Options .TypicalP ,
718
- "repeat_last_n" : req .Options .RepeatLastN ,
719
- "repeat_penalty" : req .Options .RepeatPenalty ,
720
- "presence_penalty" : req .Options .PresencePenalty ,
721
- "frequency_penalty" : req .Options .FrequencyPenalty ,
722
- "mirostat" : req .Options .Mirostat ,
723
- "mirostat_tau" : req .Options .MirostatTau ,
724
- "mirostat_eta" : req .Options .MirostatEta ,
725
- "seed" : req .Options .Seed ,
726
- "stop" : req .Options .Stop ,
727
- "image_data" : req .Images ,
728
- "cache_prompt" : true ,
729
- }
730
-
731
689
if len (req .Format ) > 0 {
732
690
switch string (req .Format ) {
733
691
case `null` , `""` :
734
692
// Field was set, but "missing" a value. We accept
735
693
// these as "not set".
736
694
break
737
695
case `"json"` :
738
- request [ "grammar" ] = grammarJSON
696
+ req . Grammar = grammarJSON
739
697
default :
740
698
if req .Format [0 ] != '{' {
741
699
return fmt .Errorf ("invalid format: %q; expected \" json\" or a valid JSON Schema object" , req .Format )
@@ -746,10 +704,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
746
704
if g == nil {
747
705
return fmt .Errorf ("invalid JSON schema in format" )
748
706
}
749
- request [ "grammar" ] = string (g )
707
+ req . Grammar = string (g )
750
708
}
751
709
}
752
710
711
+ if req .Options == nil {
712
+ opts := api .DefaultOptions ()
713
+ req .Options = & opts
714
+ }
715
+
753
716
if err := s .sem .Acquire (ctx , 1 ); err != nil {
754
717
if errors .Is (err , context .Canceled ) {
755
718
slog .Info ("aborting completion request due to client closing the connection" )
@@ -770,15 +733,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
770
733
if err != nil {
771
734
return err
772
735
} else if status != ServerStatusReady {
773
- return fmt .Errorf ("unexpected server status: %s" , status . ToString () )
736
+ return fmt .Errorf ("unexpected server status: %s" , status )
774
737
}
775
738
776
739
// Handling JSON marshaling with special characters unescaped.
777
740
buffer := & bytes.Buffer {}
778
741
enc := json .NewEncoder (buffer )
779
742
enc .SetEscapeHTML (false )
780
743
781
- if err := enc .Encode (request ); err != nil {
744
+ if err := enc .Encode (req ); err != nil {
782
745
return fmt .Errorf ("failed to marshal data: %v" , err )
783
746
}
784
747
@@ -829,7 +792,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
829
792
evt = line
830
793
}
831
794
832
- var c completion
795
+ var c CompletionResponse
833
796
if err := json .Unmarshal (evt , & c ); err != nil {
834
797
return fmt .Errorf ("error unmarshalling llm prediction response: %v" , err )
835
798
}
@@ -853,20 +816,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
853
816
})
854
817
}
855
818
856
- if c .Stop {
857
- doneReason := "stop"
858
- if c .StoppedLimit {
859
- doneReason = "length"
860
- }
861
-
862
- fn (CompletionResponse {
863
- Done : true ,
864
- DoneReason : doneReason ,
865
- PromptEvalCount : c .Timings .PromptN ,
866
- PromptEvalDuration : parseDurationMs (c .Timings .PromptMS ),
867
- EvalCount : c .Timings .PredictedN ,
868
- EvalDuration : parseDurationMs (c .Timings .PredictedMS ),
869
- })
819
+ if c .Done {
820
+ fn (c )
870
821
return nil
871
822
}
872
823
}
@@ -914,7 +865,7 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
914
865
if err != nil {
915
866
return nil , err
916
867
} else if status != ServerStatusReady {
917
- return nil , fmt .Errorf ("unexpected server status: %s" , status . ToString () )
868
+ return nil , fmt .Errorf ("unexpected server status: %s" , status )
918
869
}
919
870
920
871
data , err := json .Marshal (EmbeddingRequest {Content : input })
@@ -1059,12 +1010,3 @@ func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
1059
1010
}
1060
1011
return 0
1061
1012
}
1062
-
1063
- func parseDurationMs (ms float64 ) time.Duration {
1064
- dur , err := time .ParseDuration (fmt .Sprintf ("%fms" , ms ))
1065
- if err != nil {
1066
- panic (err )
1067
- }
1068
-
1069
- return dur
1070
- }
0 commit comments