server: init functional tests #5566

Merged: 100 commits, Feb 24, 2024

Changes from 1 commit

Commits (100)
157bcf2
server: init functional test
phymbert Feb 18, 2024
9b63d70
server: tests: reduce number of files, all in one tests shell script
phymbert Feb 19, 2024
6497755
server: tests: fix ci workflow
phymbert Feb 19, 2024
4e5245e
server: tests: fix ci workflow
phymbert Feb 19, 2024
30aa323
server: tests: fix ci workflow
phymbert Feb 19, 2024
fe9866a
server: tests: use ngxson llama_xs_q4.bin
phymbert Feb 19, 2024
1680599
server: tests: build only the server
phymbert Feb 19, 2024
8bb586b
server: tests: add health check and concurrent request example
phymbert Feb 20, 2024
6c95ec6
server: tests: change model to: @karpathy's tinyllamas
phymbert Feb 20, 2024
56583be
server: tests: refactor steps and vocabulary
phymbert Feb 20, 2024
9b7ea97
server: tests: add OAI stream test, fix file end of line, fast fail b…
phymbert Feb 20, 2024
11adf1d
server: tests: add OAI multi user scenario
phymbert Feb 20, 2024
c355f76
server: tests: slots endpoint checks
phymbert Feb 20, 2024
367b59a
server: tests: check for infinite loops
phymbert Feb 20, 2024
b9f8390
server: tests: check for infinite loops
phymbert Feb 20, 2024
0772884
server: tests: add a constant seed in completion request
phymbert Feb 20, 2024
6b9dc4f
server: tests: add infinite loop
phymbert Feb 20, 2024
68574c6
server: tests: add infinite loop scenario
phymbert Feb 20, 2024
b0b6d83
server: tests: add infinite loop scenario
phymbert Feb 20, 2024
1ecda0d
server: tests: disable issue 3969 scenario
phymbert Feb 20, 2024
e6d4820
server: tests: add embeddings scenario
phymbert Feb 20, 2024
1065f6d
server: tests: add tokenize/detokenize scenario
phymbert Feb 20, 2024
19664b9
server: tests: detokenize endpoint issue reference added
phymbert Feb 20, 2024
6dcbcfe
server: tests: simplify completion scenario
phymbert Feb 20, 2024
672d98f
server: tests: CORS and api key checks scenario
phymbert Feb 21, 2024
3322bfa
server: tests: add a small check to be sure all started threads have …
phymbert Feb 21, 2024
469af4b
server: tests: change CI workflow trigger
phymbert Feb 21, 2024
2a37bd6
server: tests: fix the multi users infinite loop test
phymbert Feb 21, 2024
f1d4138
server : fix initialization thread issues
ggerganov Feb 21, 2024
600cbeb
server: test: ci change the GitHub workflow trigger
phymbert Feb 21, 2024
68b8d4e
Merge remote-tracking branch 'origin/master' into test/server-add-ci-…
phymbert Feb 21, 2024
6406208
server: tests:
phymbert Feb 21, 2024
01cca66
server: tests: ci fix model download path
phymbert Feb 21, 2024
534998d
server: tests: ci tests.sh exit code
phymbert Feb 21, 2024
a697cd1
minor : fix missing new line
ggerganov Feb 22, 2024
41676d9
ci : actually no reason to exclude GPU code from triggers
ggerganov Feb 22, 2024
016b221
server: fix health/slots endpoint slot state access available race co…
phymbert Feb 22, 2024
e43406e
server: tests: switch to asyncio for concurrent tests, match result c…
phymbert Feb 22, 2024
597c181
server: tests: ci do not take a model anymore, fix trigger patch
phymbert Feb 22, 2024
8b96bda
Merge remote-tracking branch 'origin/master' into test/server-add-ci-…
phymbert Feb 22, 2024
f820e10
server: tests: ci ensure the server is stopped before scenario, and d…
phymbert Feb 22, 2024
aa591ef
server: tests: add Multi users with total number of tokens to predict…
phymbert Feb 22, 2024
26b66c5
server: tests: Fix some random behavior where the wait for busy statu…
phymbert Feb 22, 2024
51f5274
server: tests: ci triggered on any changes on server example path
phymbert Feb 22, 2024
cba6d4e
server: tests: minor fix missing param.
phymbert Feb 22, 2024
1bd07e5
server: tests: assert embeddings are actually computed, make the embe…
phymbert Feb 23, 2024
14b6ede
server: tests: minor color change
phymbert Feb 23, 2024
b38b9e6
server: tests: minor fix server --alias param passed twice
phymbert Feb 23, 2024
70e9055
server: tests: add log in server start to identify why the server doe…
phymbert Feb 23, 2024
2f756f8
server: tests: allow to override the server port before launching tests
phymbert Feb 23, 2024
6a215e5
server: tests: ci adding container to specify server port and allow t…
phymbert Feb 23, 2024
2bb4732
server: tests: ci adding cmake as it is not present by default in ubu…
phymbert Feb 23, 2024
d0e0050
server: tests: ci adding python3-pip as it is not present by default …
phymbert Feb 23, 2024
6e71126
server: tests: ci adding curl as it is not present by default in ubun…
phymbert Feb 23, 2024
6bba3be
server: tests: ci adding psmisc as it is not present by default in ub…
phymbert Feb 23, 2024
5110de0
server: tests: fix coloring console
phymbert Feb 23, 2024
bedf37c
server: tests: reducing n_ctx and n_predict for // prompts as it is t…
phymbert Feb 23, 2024
530d3ae
server: tests: reducing sleep time during scenario
phymbert Feb 23, 2024
36ddb96
server: tests: parallel fix server is started twice, add colors to he…
phymbert Feb 23, 2024
0b0f056
server: tests: ci : build and run tests for all matrix defines, sanit…
phymbert Feb 23, 2024
29f8833
server: tests: ci : fix wget missing
phymbert Feb 23, 2024
12bb797
server: tests: ci : add git
phymbert Feb 23, 2024
68cd1a4
server: tests: ci : matrix cuda
phymbert Feb 23, 2024
86896aa
server: tests: ci : continue on error
phymbert Feb 23, 2024
334902b
server: tests: ci : fix step id duplicated
phymbert Feb 23, 2024
fce2e00
server: tests: ci : fix cuda install
phymbert Feb 23, 2024
e4fb790
server: test: ci fix cuda build
phymbert Feb 23, 2024
2edd995
server: test: ci fix cublas build
phymbert Feb 23, 2024
fa51bac
server: test: ci fix matrix
phymbert Feb 23, 2024
606738e
server: test: ci fix clblast
phymbert Feb 23, 2024
d159e29
server: test: ci fix openblas build
phymbert Feb 23, 2024
13863ef
server: test: ci matrix
phymbert Feb 23, 2024
4d3791a
server: test: ci matrix, experimental on matrix avx512 entry which fa…
phymbert Feb 23, 2024
b94809b
server: test: ci cmake remove all warning as it is done by the classi…
phymbert Feb 23, 2024
5a621e7
server: test: ci make arch not available pass the test
phymbert Feb 23, 2024
54ea4d4
server: test: ax512 experimental
phymbert Feb 23, 2024
5b2ce45
server: test: display server logs in case of failure
phymbert Feb 23, 2024
6dc3af5
server: test: fix CUDA LD PATH
phymbert Feb 23, 2024
83c386f
server: test: ci debug LD path
phymbert Feb 23, 2024
0d380ae
server: test: ci debug CI LD path
phymbert Feb 23, 2024
c75e0e1
server: test: ci switch to nvidia based docker image for cuda
phymbert Feb 23, 2024
2c8bf24
server: test: ci give up with nvidia as it requires the nvidia docker…
phymbert Feb 23, 2024
777bdcf
server: test: ci rename step name to Test, change matrix order for be…
phymbert Feb 23, 2024
e10b83a
server: test: ci rename job name to Server
phymbert Feb 23, 2024
4d27466
server: tests: move all requests call to asyncio
phymbert Feb 23, 2024
1c1fd40
server: tests: allow to pass argument to the test file
phymbert Feb 23, 2024
2109743
server: tests: print server logs only on github action
phymbert Feb 23, 2024
30f802d
server: tests: check if the server has not crashed after a scenario
phymbert Feb 23, 2024
6c0e6f4
server: tests: adding concurrent embedding in issue #5655
phymbert Feb 23, 2024
77b8589
server: tests: linter
phymbert Feb 23, 2024
7183149
server: tests: fix concurrent OAI streaming request
phymbert Feb 23, 2024
2d107ba
server: tests: add a note regarding inference speed.
phymbert Feb 23, 2024
124ca77
server: tests: removing debug print
phymbert Feb 24, 2024
5957a2d
server: tests - allow print on debug
phymbert Feb 24, 2024
482eb30
server: tests - README.md add build instruction and notice on @bug an…
phymbert Feb 24, 2024
60781f0
server: tests - add explanation about KV Cache.
phymbert Feb 24, 2024
a779a4b
server: tests - print only in case of DEBUG
phymbert Feb 24, 2024
a2a928c
server: add link to tests in the README.md
phymbert Feb 24, 2024
5ed4452
server: tests: improved README.md
phymbert Feb 24, 2024
99163c8
github issue template: add link to the tests server framework
phymbert Feb 24, 2024
server: tests: CORS and api key checks scenario
phymbert committed Feb 21, 2024
commit 672d98f6f0acee9f93bf74e44a032eee5942ff5a
40 changes: 30 additions & 10 deletions examples/server/tests/features/server.feature
@@ -1,7 +1,7 @@
Feature: llama.cpp server

Background: Server startup
Given a server listening on localhost:8080 with 2 slots and 42 as seed
Given a server listening on localhost:8080 with 2 slots, 42 as seed and llama.cpp as api key
Then the server is starting
Then the server is healthy

@@ -13,13 +13,17 @@ Feature: llama.cpp server

@llama.cpp
Scenario Outline: Completion
Given a <prompt> completion request with maximum <n_predict> tokens
Given a prompt <prompt>
And a user api key <api_key>
And <n_predict> max tokens to predict
And a completion request
Then <n_predict> tokens are predicted

Examples: Prompts
| prompt | n_predict |
| I believe the meaning of life is | 128 |
| Write a joke about AI | 512 |
| prompt | n_predict | api_key |
| I believe the meaning of life is | 128 | llama.cpp |
| Write a joke about AI | 512 | llama.cpp |
| say goodbye | 0 | |

@llama.cpp
Scenario Outline: OAI Compatibility
@@ -28,13 +32,15 @@ Feature: llama.cpp server
And a model <model>
And <max_tokens> max tokens to predict
And streaming is <enable_streaming>
Given an OAI compatible chat completions request
And a user api key <api_key>
Given an OAI compatible chat completions request with an api error <api_error>
Then <max_tokens> tokens are predicted

Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | enable_streaming |
| llama-2 | You are ChatGPT. | Say hello. | 64 | false |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | true |
| model | system_prompt | user_prompt | max_tokens | enable_streaming | api_key | api_error |
| llama-2 | You are ChatGPT. | Say hello. | 64 | false | llama.cpp | none |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | true | llama.cpp | none |
| John-Doe | You are an hacker. | Write segfault code in rust. | 0 | true | hackme | raised |

@llama.cpp
Scenario: Multi users
@@ -47,6 +53,7 @@ Feature: llama.cpp server
Write another very long music lyrics.
"""
And 32 max tokens to predict
And a user api key llama.cpp
Given concurrent completion requests
Then the server is busy
And all slots are busy
@@ -57,7 +64,7 @@ Feature: llama.cpp server
@llama.cpp
Scenario: Multi users OAI Compatibility
Given a system prompt "You are an AI assistant."
And a model tinyllama-2
And a model tinyllama-2
Given a prompt:
"""
Write a very long story about AI.
@@ -68,6 +75,7 @@ Feature: llama.cpp server
"""
And 32 max tokens to predict
And streaming is enabled
And a user api key llama.cpp
Given concurrent OAI completions requests
Then the server is busy
And all slots are busy
@@ -126,3 +134,15 @@ Feature: llama.cpp server
"""
Then tokens can be detokenize

@llama.cpp
Scenario Outline: CORS Options
When an OPTIONS request is sent from <origin>
Then CORS header <cors_header> is set to <cors_header_value>

Examples: Headers
| origin | cors_header | cors_header_value |
| localhost | Access-Control-Allow-Origin | localhost |
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
| origin | Access-Control-Allow-Credentials | true |
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
| web.mydomain.fr | Access-Control-Allow-Headers | * |
135 changes: 94 additions & 41 deletions examples/server/tests/features/steps/steps.py
@@ -7,8 +7,9 @@
from behave import step


@step(u"a server listening on {server_fqdn}:{server_port} with {n_slots} slots and {seed} as seed")
def step_server_config(context, server_fqdn, server_port, n_slots, seed):
@step(
u"a server listening on {server_fqdn}:{server_port} with {n_slots} slots, {seed} as seed and {api_key} as api key")
def step_server_config(context, server_fqdn, server_port, n_slots, seed, api_key):
context.server_fqdn = server_fqdn
context.server_port = int(server_port)
context.n_slots = int(n_slots)
@@ -19,7 +20,8 @@ def step_server_config(context, server_fqdn, server_port, n_slots, seed):
context.completion_threads = []
context.prompts = []

openai.api_key = 'llama.cpp'
context.api_key = api_key
openai.api_key = context.api_key


@step(u"the server is {expecting_status}")
@@ -77,14 +79,16 @@ def step_all_slots_status(context, expected_slot_status_string):
request_slots_status(context, expected_slots)


@step(u'a {prompt} completion request with maximum {n_predict} tokens')
def step_request_completion(context, prompt, n_predict):
request_completion(context, prompt, n_predict)
@step(u'a completion request')
def step_request_completion(context):
request_completion(context, context.prompts.pop(), context.n_predict, context.user_api_key)
context.user_api_key = None


@step(u'{predicted_n} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.completions[0], int(predicted_n))
if int(predicted_n) > 0:
assert_n_tokens_predicted(context.completions[0], int(predicted_n))


@step(u'a user prompt {user_prompt}')
@@ -112,24 +116,40 @@ def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled' or bool(enable_streaming)


@step(u'an OAI compatible chat completions request')
def step_oai_chat_completions(context):
oai_chat_completions(context, context.user_prompt)
@step(u'a user api key {user_api_key}')
def step_user_api_key(context, user_api_key):
context.user_api_key = user_api_key


@step(u'a user api key ')
def step_user_api_key(context):
context.user_api_key = None


@step(u'an OAI compatible chat completions request with an api error {api_error}')
def step_oai_chat_completions(context, api_error):
oai_chat_completions(context, context.user_prompt, api_error=api_error == 'raised')
context.user_api_key = None


@step(u'a prompt')
def step_a_prompt(context):
context.prompts.append(context.text)


@step(u'a prompt {prompt}')
def step_a_prompt_prompt(context, prompt):
context.prompts.append(prompt)


@step(u'concurrent completion requests')
def step_concurrent_completion_requests(context):
concurrent_requests(context, request_completion)
concurrent_requests(context, request_completion, context.n_predict, context.user_api_key)


@step(u'concurrent OAI completions requests')
def step_oai_chat_completions(context):
concurrent_requests(context, oai_chat_completions)
concurrent_requests(context, oai_chat_completions, context.user_api_key)


@step(u'all prompts are predicted')
@@ -168,7 +188,7 @@ def step_oai_compute_embedding(context):
def step_tokenize(context):
context.tokenized_text = context.text
response = requests.post(f'{context.base_url}/tokenize', json={
"content":context.tokenized_text,
"content": context.tokenized_text,
})
assert response.status_code == 200
context.tokens = response.json()['tokens']
@@ -181,49 +201,82 @@ def step_detokenize(context):
"tokens": context.tokens,
})
assert response.status_code == 200
print(response.json())
# FIXME the detokenize answer contains a space prefix ? see #3287
assert context.tokenized_text == response.json()['content'].strip()
phymbert (Collaborator, PR author) commented:

@BruceMacD @ggerganov Is it normal that the /detokenize endpoint prefixes the content with a space? See #3287.

ggerganov (Owner) replied on Feb 21, 2024:

Yes, it is expected - the SPM tokenizer adds a whitespace prefix: google/sentencepiece#15

The test should expect both options - with or without the whitespace prefix.
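
A tolerant assertion, as a minimal sketch reusing the context and response objects from step_detokenize above, could accept both forms:

# Sketch only: accept the detokenized content with or without the
# SPM whitespace prefix discussed above.
detokenized = response.json()['content']
assert detokenized == context.tokenized_text or detokenized.lstrip() == context.tokenized_text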



def concurrent_requests(context, f_completion):
@step(u'an OPTIONS request is sent from {origin}')
def step_options_request(context, origin):
options_response = requests.options(f'{context.base_url}/v1/chat/completions',
headers={"Origin": origin})
assert options_response.status_code == 200
context.options_response = options_response


@step(u'CORS header {cors_header} is set to {cors_header_value}')
def step_check_options_header_value(context, cors_header, cors_header_value):
assert context.options_response.headers[cors_header] == cors_header_value


def concurrent_requests(context, f_completion, *argv):
context.completions.clear()
context.completion_threads.clear()
for prompt in context.prompts:
completion_thread = threading.Thread(target=f_completion, args=(context, prompt))
completion_thread = threading.Thread(target=f_completion, args=(context, prompt, *argv))
completion_thread.start()
context.completion_threads.append(completion_thread)
context.prompts.clear()


def request_completion(context, prompt, n_predict=None):
response = requests.post(f'{context.base_url}/completion', json={
"prompt": prompt,
"n_predict": int(n_predict) if n_predict is not None else context.n_predict,
"seed": context.seed
})
assert response.status_code == 200
context.completions.append(response.json())
def request_completion(context, prompt, n_predict=None, user_api_key=None):
origin = "my.super.domain"
headers = {
'Origin': origin
}
if 'user_api_key' in context:
headers['Authorization'] = f'Bearer {user_api_key}'

response = requests.post(f'{context.base_url}/completion',
json={
"prompt": prompt,
"n_predict": int(n_predict) if n_predict is not None else context.n_predict,
"seed": context.seed
},
headers=headers)
if n_predict is not None and n_predict > 0:
assert response.status_code == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
context.completions.append(response.json())
else:
assert response.status_code == 401


def oai_chat_completions(context, user_prompt):

def oai_chat_completions(context, user_prompt, api_error=None):
openai.api_key = context.user_api_key
openai.api_base = f'{context.base_url}/v1/chat'
chat_completion = openai.Completion.create(
messages=[
{
"role": "system",
"content": context.system_prompt,
},
{
"role": "user",
"content": user_prompt,
}
],
model=context.model,
max_tokens=context.n_predict,
stream=context.enable_streaming,
seed=context.seed
)
try:
chat_completion = openai.Completion.create(
messages=[
{
"role": "system",
"content": context.system_prompt,
},
{
"role": "user",
"content": user_prompt,
}
],
model=context.model,
max_tokens=context.n_predict,
stream=context.enable_streaming,
seed=context.seed
)
except openai.error.APIError:
if api_error:
openai.api_key = context.api_key
return
openai.api_key = context.api_key
if context.enable_streaming:
completion_response = {
'content': '',
1 change: 1 addition & 0 deletions examples/server/tests/tests.sh
@@ -29,6 +29,7 @@ set -eu
--threads-batch 4 \
--embedding \
--cont-batching \
--api-key llama.cpp \
"$@" &

# Start tests
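
With --api-key llama.cpp passed above, every test request has to authenticate. A minimal sketch of such a call, assuming the Bearer scheme used in steps.py and the localhost:8080 server from the feature Background:

import requests

# Sketch only: query the locally started test server with the API key
# configured in tests.sh, using the Bearer scheme from steps.py.
response = requests.post(
    'http://localhost:8080/completion',
    json={"prompt": "I believe the meaning of life is", "n_predict": 8, "seed": 42},
    headers={"Authorization": "Bearer llama.cpp"},
)
assert response.status_code == 200
print(response.json()['content'])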