@@ -13,7 +13,7 @@ struct server_params

 struct llama_server_context
 {
-    bool streaming = false;
+    bool stream = false;
     bool has_next_token = false;
     std::string generated_text = "";

@@ -35,7 +35,6 @@ struct llama_server_context
     std::string stopping_word;

     void rewind() {
-        streaming = false;
         params.antiprompt.clear();
         num_tokens_predicted = 0;
         generated_text = "";
@@ -253,9 +252,6 @@ struct llama_server_context
         if (token == -1) {
             return "";
         }
-        if (streaming) {
-            generated_text = "";
-        }

         std::string token_text = llama_token_to_str(ctx, token);
         generated_text += token_text;
@@ -270,7 +266,7 @@ struct llama_server_context
             }
         }

-        return generated_text;
+        return token_text;
     }

     std::vector<float> embedding(std::string content, int threads) {
@@ -478,13 +474,13 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para

 bool parse_options_completion(json body, llama_server_context& llama, Response &res) {
     gpt_params default_params;
-    if (!body["streaming"].is_null())
+    if (!body["stream"].is_null())
     {
-        llama.streaming = body["streaming"].get<bool>();
+        llama.stream = body["stream"].get<bool>();
     }
     else
     {
-        llama.streaming = false;
+        llama.stream = false;
     }
     if (!body["n_predict"].is_null())
     {
@@ -675,8 +671,6 @@ int main(int argc, char **argv)
     llama_server_context llama;
     params.model = "ggml-model.bin";

-    std::string final_text;
-
     if (server_params_parse(argc, argv, sparams, params) == false)
     {
         return 1;
@@ -693,98 +687,81 @@ int main(int argc, char **argv)
     svr.Get("/", [](const Request &, Response &res)
             { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });

-    svr.Post("/completion", [&llama, &final_text](const Request &req, Response &res)
-             {
-        if (llama.params.embedding) {
-            json data = {
-                {"status", "error"},
-                {"reason", "To use completion function, disable embedding mode"}};
-            res.set_content(data.dump(), "application/json");
-            res.status = 400;
-            return;
-        }
+    svr.Post("/completion", [&llama](const Request &req, Response &res) {
+        if (llama.params.embedding) {
+            json data = {
+                {"status", "error"},
+                {"reason", "To use completion function, disable embedding mode"}};
+            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
+            res.status = 400;
+            return;
+        }

-        llama.rewind();
-        final_text = "";
+        llama.rewind();

-        if (parse_options_completion(json::parse(req.body), llama, res) == false){
-            return;
-        }
+        if (parse_options_completion(json::parse(req.body), llama, res) == false) {
+            return;
+        }

-        if (!llama.loadPrompt())
-        {
-            json data = {
-                {"status", "error"},
-                {"reason", "Context too long."}};
-            res.set_content(data.dump(), "application/json");
-            res.status = 400;
-            return;
-        }
+        if (!llama.loadPrompt()) {
+            json data = {{"status", "error"}, {"reason", "Context too long."}};
+            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
+            res.status = 400;
+            return;
+        }
+
+        llama.beginCompletion();
+
+        if (!llama.stream) {
+            while (llama.has_next_token) {
+                llama.doCompletion();
+            }
+
+            json data = {{"content", llama.generated_text},
+                         {"stop", true},
+                         {"model", llama.params.model_alias},
+                         {"tokens_predicted", llama.num_tokens_predicted},
+                         {"generation_settings", format_generation_settings(llama)},
+                         {"prompt", llama.params.prompt},
+                         {"stopping_word", llama.stopping_word}};
+            return res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+        } else {
+            const auto chunked_content_provider = [&](size_t, DataSink &sink) {
+                while (llama.has_next_token) {
+                    std::string token_text = llama.doCompletion();

-        llama.beginCompletion();
-        if (llama.streaming)
-        {
-            res.set_chunked_content_provider("text/event-stream", [&](size_t /*offset*/,
-                                                                      DataSink& sink) {
-                std::string final_text = "";
-                // loop inference until finish completion
-                while (llama.has_next_token) {
-                    std::string result = llama.doCompletion();
                     json data;
-                    final_text += result;
-                    if (llama.has_next_token)
-                    {
-                        data = { {"content", result}, {"stop", false} };
-                    }
-                    else
-                    {
-                        // Generation is done, send extra information.
-                        data = { {"content", result},
-                                 {"stop", true},
-                                 {"tokens_predicted", llama.num_tokens_predicted},
-                                 {"generation_settings", format_generation_settings(llama)},
-                                 {"prompt", llama.params.prompt},
-                                 {"stopping_word", llama.stopping_word},
-                                 {"generated_text", final_text} };
+                    if (llama.has_next_token) {
+                        data = {{"content", token_text}, {"stop", false}};
+                    } else {
+                        // Generation is done, send extra information.
+                        data = {
+                            {"content", token_text},
+                            {"stop", true},
+                            {"model", llama.params.model_alias},
+                            {"tokens_predicted", llama.num_tokens_predicted},
+                            {"generation_settings", format_generation_settings(llama)},
+                            {"prompt", llama.params.prompt},
+                            {"stopping_word", llama.stopping_word},
+                            {"generated_text", llama.generated_text}};
                     }

                     std::string str =
-                        "data: " + data.dump(4, ' ', false, json::error_handler_t::replace) +
-                        "\n\n";
+                        "data: " +
+                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        "\n\n";
                     sink.write(str.data(), str.size());
-                }
-
-                sink.done();
-                return true;
-            });
                 }
-        else
-        {
-            // loop inference until finish completion
-            while (llama.has_next_token)
-            {
-                llama.doCompletion();
-            }
-            try
-            {
-                json data = {
-                    {"model", llama.params.model_alias},
-                    {"content", llama.generated_text},
-                    {"tokens_predicted", llama.num_tokens_predicted},
-                    {"generation_settings", format_generation_settings(llama)},
-                    {"prompt", llama.params.prompt},
-                    {"stopping_word", llama.stopping_word} };
-                return res.set_content(data.dump(), "application/json");
-            }
-            catch (const json::exception &e)
-            {
-                // Some tokens have bad UTF-8 strings, the json parser is very sensitive
-                json data = {
-                    {"content", "Bad encoding token"},
-                    {"tokens_predicted", 0}};
-                return res.set_content(data.dump(), "application/json");
-            }
-        } });
+
+                sink.done();
+                return true;
+            };
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+        }
+    });
+

     svr.Post("/tokenize", [&llama](const Request &req, Response &res)
              {
@@ -811,7 +788,6 @@ int main(int argc, char **argv)
         return res.set_content(data.dump(), "application/json");
     });

-
     fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);

     if (params.embedding) {
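Not part of the diff: a minimal client-side sketch of how the new streaming response could be consumed. With stream set to true, the handler above emits server-sent-event style chunks of the form "data: {...}\n\n", each carrying a "content" fragment and a "stop" flag. The sketch assumes the same nlohmann::json single header the server already uses; the helper name and the example chunks are illustrative only, not part of the PR.

#include <iostream>
#include <string>

#include "json.hpp"

using json = nlohmann::json;

// Parse one "data: {...}\n\n" chunk as produced by the streaming branch above.
// Appends the decoded token text and returns true while more chunks are expected.
static bool handle_sse_chunk(const std::string &chunk, std::string &out_text) {
    const std::string prefix = "data: ";
    if (chunk.compare(0, prefix.size(), prefix) != 0) {
        return false; // not a data chunk
    }
    json data = json::parse(chunk.substr(prefix.size()));
    out_text += data.value("content", "");
    return !data.value("stop", true);
}

int main() {
    std::string text;
    handle_sse_chunk("data: {\"content\":\"Hel\",\"stop\":false}\n\n", text);
    handle_sse_chunk("data: {\"content\":\"lo.\",\"stop\":true}\n\n", text);
    std::cout << text << std::endl; // prints "Hello."
    return 0;
}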