Commit 00d0661

Revert "chat api (ollama#991)" while context variable is fixed
This reverts commit 7a0899d.
1 parent: f1ef3f9

8 files changed: +135 -550 lines


api/client.go: -13 lines

@@ -221,19 +221,6 @@ func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn Generate
 	})
 }
 
-type ChatResponseFunc func(ChatResponse) error
-
-func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc) error {
-	return c.stream(ctx, http.MethodPost, "/api/chat", req, func(bts []byte) error {
-		var resp ChatResponse
-		if err := json.Unmarshal(bts, &resp); err != nil {
-			return err
-		}
-
-		return fn(resp)
-	})
-}
-
 type PullProgressFunc func(ProgressResponse) error
 
 func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
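
For orientation, a minimal sketch of the streaming `Generate` call that remains in this client after the revert. Only the `Generate` signature and the request/callback shapes visible in the hunk above come from this diff; the module path, `api.ClientFromEnvironment`, the `Model` field, and the `Response` field are assumptions about the surrounding package, not something this commit shows.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api" // module path assumed for this era of the project
)

func main() {
	// ClientFromEnvironment is assumed to exist; it is not part of this diff.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama2", // Model field assumed; only Prompt..Options appear in the hunk above
		Prompt: "why is the sky blue?",
	}

	// Generate streams the completion; the callback runs once per decoded JSON object,
	// the same c.stream pattern the deleted Chat method wrapped.
	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response) // Response field assumed from docs/api.md
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println()
}
```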

api/types.go: +22 -52 lines

@@ -36,49 +36,14 @@ type GenerateRequest struct {
 	Prompt string `json:"prompt"`
 	System string `json:"system"`
 	Template string `json:"template"`
-	Context []int `json:"context,omitempty"` // DEPRECATED: context is deprecated, use the /chat endpoint instead for chat history
+	Context []int `json:"context,omitempty"`
 	Stream *bool `json:"stream,omitempty"`
 	Raw bool `json:"raw,omitempty"`
 	Format string `json:"format"`
 
 	Options map[string]interface{} `json:"options"`
 }
 
-type ChatRequest struct {
-	Model string `json:"model"`
-	Messages []Message `json:"messages"`
-	Template string `json:"template"`
-	Stream *bool `json:"stream,omitempty"`
-	Format string `json:"format"`
-
-	Options map[string]interface{} `json:"options"`
-}
-
-type Message struct {
-	Role string `json:"role"` // one of ["system", "user", "assistant"]
-	Content string `json:"content"`
-}
-
-type ChatResponse struct {
-	Model string `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	Message *Message `json:"message,omitempty"`
-
-	Done bool `json:"done"`
-	Context []int `json:"context,omitempty"`
-
-	EvalMetrics
-}
-
-type EvalMetrics struct {
-	TotalDuration time.Duration `json:"total_duration,omitempty"`
-	LoadDuration time.Duration `json:"load_duration,omitempty"`
-	PromptEvalCount int `json:"prompt_eval_count,omitempty"`
-	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
-	EvalCount int `json:"eval_count,omitempty"`
-	EvalDuration time.Duration `json:"eval_duration,omitempty"`
-}
-
 // Options specfied in GenerateRequest, if you add a new option here add it to the API docs also
 type Options struct {
 	Runner
@@ -208,34 +173,39 @@ type GenerateResponse struct {
 	Done bool `json:"done"`
 	Context []int `json:"context,omitempty"`
 
-	EvalMetrics
+	TotalDuration time.Duration `json:"total_duration,omitempty"`
+	LoadDuration time.Duration `json:"load_duration,omitempty"`
+	PromptEvalCount int `json:"prompt_eval_count,omitempty"`
+	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
+	EvalCount int `json:"eval_count,omitempty"`
+	EvalDuration time.Duration `json:"eval_duration,omitempty"`
 }
 
-func (m *EvalMetrics) Summary() {
-	if m.TotalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)
+func (r *GenerateResponse) Summary() {
+	if r.TotalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "total duration: %v\n", r.TotalDuration)
 	}
 
-	if m.LoadDuration > 0 {
-		fmt.Fprintf(os.Stderr, "load duration: %v\n", m.LoadDuration)
+	if r.LoadDuration > 0 {
+		fmt.Fprintf(os.Stderr, "load duration: %v\n", r.LoadDuration)
 	}
 
-	if m.PromptEvalCount > 0 {
-		fmt.Fprintf(os.Stderr, "prompt eval count: %d token(s)\n", m.PromptEvalCount)
+	if r.PromptEvalCount > 0 {
+		fmt.Fprintf(os.Stderr, "prompt eval count: %d token(s)\n", r.PromptEvalCount)
 	}
 
-	if m.PromptEvalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "prompt eval duration: %s\n", m.PromptEvalDuration)
-		fmt.Fprintf(os.Stderr, "prompt eval rate: %.2f tokens/s\n", float64(m.PromptEvalCount)/m.PromptEvalDuration.Seconds())
+	if r.PromptEvalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "prompt eval duration: %s\n", r.PromptEvalDuration)
+		fmt.Fprintf(os.Stderr, "prompt eval rate: %.2f tokens/s\n", float64(r.PromptEvalCount)/r.PromptEvalDuration.Seconds())
 	}
 
-	if m.EvalCount > 0 {
-		fmt.Fprintf(os.Stderr, "eval count: %d token(s)\n", m.EvalCount)
+	if r.EvalCount > 0 {
+		fmt.Fprintf(os.Stderr, "eval count: %d token(s)\n", r.EvalCount)
 	}
 
-	if m.EvalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "eval duration: %s\n", m.EvalDuration)
-		fmt.Fprintf(os.Stderr, "eval rate: %.2f tokens/s\n", float64(m.EvalCount)/m.EvalDuration.Seconds())
+	if r.EvalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "eval duration: %s\n", r.EvalDuration)
+		fmt.Fprintf(os.Stderr, "eval rate: %.2f tokens/s\n", float64(r.EvalCount)/r.EvalDuration.Seconds())
 	}
 }
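
As a quick illustration of the metrics moved back onto `GenerateResponse`, this sketch reproduces the tokens-per-second arithmetic that `Summary()` prints (eval count divided by eval duration in seconds). The helper name is hypothetical; the sample values are taken from the final-response example in the docs below.

```go
package main

import (
	"fmt"
	"time"
)

// evalRate mirrors Summary()'s "eval rate" line: tokens generated divided by
// the seconds spent generating them.
func evalRate(evalCount int, evalDuration time.Duration) float64 {
	if evalDuration <= 0 {
		return 0
	}
	return float64(evalCount) / evalDuration.Seconds()
}

func main() {
	// 113 tokens in 1,325,948,000 ns of eval time ≈ 85.22 tokens/s
	fmt.Printf("eval rate: %.2f tokens/s\n", evalRate(113, 1325948000*time.Nanosecond))
}
```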

docs/api.md: +10 -142 lines

@@ -24,31 +24,30 @@ All durations are returned in nanoseconds.
 
 ### Streaming responses
 
-Certain endpoints stream responses as JSON objects.
+Certain endpoints stream responses as JSON objects delineated with the newline (`\n`) character.
 
 ## Generate a completion
 
 ```shell
 POST /api/generate
 ```
 
-Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
+Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses. The final response object will include statistics and additional data from the request.
 
 ### Parameters
 
-`model` is required.
-
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
 
 Advanced parameters (optional):
 
 - `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
-- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `system`: system prompt to (overrides what is defined in the `Modelfile`)
+- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
+- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
-- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
+- `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
 
 ### JSON mode
 
@@ -58,7 +57,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
 
 ### Examples
 
-#### Request (Prompt)
+#### Request
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
@@ -90,7 +89,7 @@ The final response in the stream also includes additional data about the generat
 - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
 - `eval_count`: number of tokens the response
 - `eval_duration`: time in nanoseconds spent generating the response
-- `context`: deprecated, an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
+- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
 - `response`: empty if the response was streamed, if not streamed, this will contain the full response
 
 To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
@@ -115,8 +114,6 @@ To calculate how fast the response is generated in tokens per second (token/s),
 
 #### Request (No streaming)
 
-A response can be recieved in one reply when streaming is off.
-
 ```shell
 curl http://localhost:11434/api/generate -d '{
   "model": "llama2",
@@ -147,9 +144,9 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```
 
-#### Request (Raw Mode)
+#### Request (Raw mode)
 
-In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting.
+In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
@@ -167,7 +164,6 @@ curl http://localhost:11434/api/generate -d '{
   "model": "mistral",
   "created_at": "2023-11-03T15:36:02.583064Z",
   "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
-  "context": [1, 2, 3],
   "done": true,
   "total_duration": 14648695333,
   "load_duration": 3302671417,
@@ -279,6 +275,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama2",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "The sky is blue because it is the color of the sky.",
+  "context": [1, 2, 3],
   "done": true,
   "total_duration": 5589157167,
   "load_duration": 3013701500,
@@ -291,135 +288,6 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```
 
-## Send Chat Messages
-
-```shell
-POST /api/chat
-```
-
-Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
-
-### Parameters
-
-`model` is required.
-
-- `model`: (required) the [model name](#model-names)
-- `messages`: the messages of the chat, this can be used to keep a chat memory
-
-Advanced parameters (optional):
-
-- `format`: the format to return a response in. Currently the only accepted value is `json`
-- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
-- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
-- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
-
-### Examples
-
-#### Request
-Send a chat message with a streaming response.
-
-```shell
-curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "why is the sky blue?"
-    }
-  ]
-}'
-```
-
-#### Response
-
-A stream of JSON objects is returned:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T08:52:19.385406455-07:00",
-  "message": {
-    "role": "assisant",
-    "content": "The"
-  },
-  "done": false
-}
-```
-
-Final response:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T19:22:45.499127Z",
-  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
-}
-```
-
-#### Request (With History)
-Send a chat message with a conversation history.
-
-```shell
-curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "why is the sky blue?"
-    },
-    {
-      "role": "assistant",
-      "content": "due to rayleigh scattering."
-    },
-    {
-      "role": "user",
-      "content": "how is that different than mie scattering?"
-    }
-  ]
-}'
-```
-
-#### Response
-
-A stream of JSON objects is returned:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T08:52:19.385406455-07:00",
-  "message": {
-    "role": "assisant",
-    "content": "The"
-  },
-  "done": false
-}
-```
-
-Final response:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T19:22:45.499127Z",
-  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
-}
-```
-
 ## Create a Model
 
 ```shell
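
Putting the restored docs together, here is a hedged sketch of a client that POSTs to `/api/generate`, reads the newline-delimited JSON stream, and passes the final `context` value back in a follow-up request to keep a short conversational memory. The field names and the `localhost:11434` address come from the docs above; the struct and helper names are illustrative only, not part of this commit.

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// Local sketches of the request/response bodies described in docs/api.md.
type generateRequest struct {
	Model   string `json:"model"`
	Prompt  string `json:"prompt"`
	Context []int  `json:"context,omitempty"`
}

type generateResponse struct {
	Response string `json:"response"`
	Context  []int  `json:"context,omitempty"`
	Done     bool   `json:"done"`
}

// generate streams one completion and returns the context for the next call.
func generate(prompt string, ctx []int) ([]int, error) {
	body, err := json.Marshal(generateRequest{Model: "llama2", Prompt: prompt, Context: ctx})
	if err != nil {
		return nil, err
	}
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Responses are JSON objects delineated by newlines, so scan line by line.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var r generateResponse
		if err := json.Unmarshal(scanner.Bytes(), &r); err != nil {
			return nil, err
		}
		fmt.Print(r.Response)
		if r.Done {
			fmt.Println()
			return r.Context, nil // final object carries the context for follow-up requests
		}
	}
	return nil, scanner.Err()
}

func main() {
	ctx, err := generate("why is the sky blue?", nil)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := generate("how is that different than mie scattering?", ctx); err != nil {
		log.Fatal(err)
	}
}
```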
