This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 4be9ac5

DarkLight1337 authored and Robert Shaw committed
[Bugfix][Frontend] Cleanup "fix chat logprobs" (vllm-project#5026)
1 parent 74eb6ab commit 4be9ac5

File tree: 6 files changed, +122 -123 lines changed

tests/async_engine/test_openapi_server_ray.py

Lines changed: 11 additions & 14 deletions
```diff
@@ -55,9 +55,8 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
                                              temperature=0.0)
 
     assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 1
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
+    assert len(completion.choices) == 1
+    assert len(completion.choices[0].text) >= 5
     assert completion.choices[0].finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
         completion_tokens=5, prompt_tokens=6, total_tokens=11)
@@ -69,8 +68,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
         max_tokens=5,
         temperature=0.0,
     )
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
+    assert len(completion.choices[0].text) >= 5
 
 
 @pytest.mark.asyncio
@@ -90,15 +88,14 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI):
                                                             logprobs=True,
                                                             top_logprobs=5)
     assert chat_completion.id is not None
-    assert chat_completion.choices is not None and len(
-        chat_completion.choices) == 1
-    assert chat_completion.choices[0].message is not None
-    assert chat_completion.choices[0].logprobs is not None
-    assert chat_completion.choices[0].logprobs.content[
-        0].top_logprobs is not None
-    assert len(
-        chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5
-    message = chat_completion.choices[0].message
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=13, total_tokens=23)
+
+    message = choice.message
     assert message.content is not None and len(message.content) >= 10
     assert message.role == "assistant"
     messages.append({"role": "assistant", "content": message.content})
```
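
The cleanup above replaces chained `is not None` checks with direct length assertions and pins the chat response's `finish_reason` and token usage. Below is a minimal, self-contained sketch of that assertion style; the base URL, API key, and model name are placeholders (not values from this commit), and it assumes a vLLM OpenAI-compatible server is already running.

```python
# A minimal sketch of the tightened assertion style, assuming a vLLM
# OpenAI-compatible server is already running; the base URL, API key,
# and model name below are placeholders, not values from this commit.
import asyncio

import openai


async def check_chat_logprobs() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    chat_completion = await client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "what is 1+1?"}],
        max_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )

    # Direct structural checks instead of chained "is not None" guards.
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
    assert len(choice.logprobs.content[0].top_logprobs) == 5


if __name__ == "__main__":
    asyncio.run(check_chat_logprobs())
```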

tests/entrypoints/test_openai_server.py

Lines changed: 84 additions & 85 deletions
```diff
@@ -166,9 +166,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
 
     assert completion.id is not None
     assert completion.choices is not None and len(completion.choices) == 1
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
-    assert completion.choices[0].finish_reason == "length"
+
+    choice = completion.choices[0]
+    assert len(choice.text) >= 5
+    assert choice.finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
         completion_tokens=5, prompt_tokens=6, total_tokens=11)
 
@@ -179,8 +180,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
         max_tokens=5,
         temperature=0.0,
     )
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
+    assert len(completion.choices[0].text) >= 5
 
 
 @pytest.mark.asyncio
@@ -205,9 +205,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras
+    # just test 1 lora hereafter
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
                              model_name: str):
@@ -290,55 +290,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
         max_tokens=5,
         temperature=0.0,
     )
-    completion = completion.choices[0].text
-    assert completion is not None and len(completion) >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora hereafter
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_single_chat_session(server, client: openai.AsyncOpenAI,
-                                   model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    # test single completion
-    chat_completion = await client.chat.completions.create(model=model_name,
-                                                            messages=messages,
-                                                            max_tokens=10,
-                                                            logprobs=True,
-                                                            top_logprobs=5)
-    assert chat_completion.id is not None
-    assert chat_completion.choices is not None and len(
-        chat_completion.choices) == 1
-    assert chat_completion.choices[0].message is not None
-    assert chat_completion.choices[0].logprobs is not None
-    assert chat_completion.choices[0].logprobs.content[
-        0].top_logprobs is not None
-    assert len(
-        chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
-    messages.append({"role": "assistant", "content": message.content})
-
-    # test multi-turn dialogue
-    messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-    )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
+    assert len(completion.choices[0].text) >= 0
 
 
 @pytest.mark.asyncio
@@ -393,7 +345,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
     choice = chat_completion.choices[0]
     assert choice.logprobs is not None
     assert choice.logprobs.content is not None
-    assert len(choice.logprobs.content[0].top_logprobs) <= 1
+    assert len(choice.logprobs.content[0].top_logprobs) == 0
 
 
 @pytest.mark.asyncio
@@ -421,11 +373,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
     choice = chat_completion.choices[0]
     assert choice.logprobs is not None
     assert choice.logprobs.content is not None
-    assert len(choice.logprobs.content[0].top_logprobs) <= 6
+    assert len(choice.logprobs.content[0].top_logprobs) == 5
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
 async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
                                       model_name: str):
     messages = [{
@@ -466,7 +421,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_single_chat_session(server, client: openai.AsyncOpenAI,
+                                   model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                            messages=messages,
+                                                            max_tokens=10,
+                                                            logprobs=True,
+                                                            top_logprobs=5)
+    assert chat_completion.id is not None
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=37, total_tokens=47)
+
+    message = choice.message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
@@ -752,8 +751,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
         logit_bias={str(token_id): 100},
         seed=42,
     )
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
+    assert len(completion.choices[0].text) >= 5
     response_tokens = tokenizer(completion.choices[0].text,
                                 add_special_tokens=False)["input_ids"]
     expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
@@ -800,9 +798,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
             guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 3
+    assert len(completion.choices) == 3
     for i in range(3):
-        assert completion.choices[i].text is not None
         output_json = json.loads(completion.choices[i].text)
         jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
 
@@ -869,9 +866,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
             guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 3
+    assert len(completion.choices) == 3
     for i in range(3):
-        assert completion.choices[i].text is not None
         assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
 
 
@@ -928,7 +924,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
             guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 2
+    assert len(completion.choices) == 2
     for i in range(2):
         assert completion.choices[i].text in TEST_CHOICE
 
@@ -1030,12 +1026,14 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
         top_logprobs=5,
         extra_body=dict(guided_choice=TEST_CHOICE,
                         guided_decoding_backend=guided_decoding_backend))
+
+    assert chat_completion.choices[0].logprobs is not None
+    assert chat_completion.choices[0].logprobs.content is not None
     top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
 
     # -9999.0 is the minimum logprob returned by OpenAI
-    assert all(
-        isinstance(token.logprob, float) and token.logprob >= -9999.0
-        for token in top_logprobs)
+    for item in top_logprobs:
+        assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
 
 
 @pytest.mark.asyncio
@@ -1237,6 +1235,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
         response_format={"type": "json_object"})
 
     content = resp.choices[0].message.content
+    assert content is not None
+
     loaded = json.loads(content)
     assert loaded == {"result": 2}, loaded
 
@@ -1364,8 +1364,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
 
     prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                          list) else prompt
-    assert (completion.choices[0].text is not None
-            and re.search(r"^" + prompt_text, completion.choices[0].text))
+    assert re.search(r"^" + prompt_text, completion.choices[0].text)
     logprobs = completion.choices[0].logprobs
     assert logprobs is not None
     assert len(logprobs.text_offset) > 5
@@ -1406,32 +1405,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
 )
 async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
                                 model_name: str):
-    input = [
+    input_texts = [
         "The chef prepared a delicious meal.",
     ]
 
     # test single embedding
     embeddings = await client.embeddings.create(
         model=model_name,
-        input=input,
+        input=input_texts,
         encoding_format="float",
     )
     assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 1
+    assert len(embeddings.data) == 1
     assert len(embeddings.data[0].embedding) == 4096
     assert embeddings.usage.completion_tokens == 0
     assert embeddings.usage.prompt_tokens == 9
     assert embeddings.usage.total_tokens == 9
 
     # test using token IDs
-    input = [1, 1, 1, 1, 1]
+    input_tokens = [1, 1, 1, 1, 1]
     embeddings = await client.embeddings.create(
         model=model_name,
-        input=input,
+        input=input_tokens,
         encoding_format="float",
     )
     assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 1
+    assert len(embeddings.data) == 1
     assert len(embeddings.data[0].embedding) == 4096
     assert embeddings.usage.completion_tokens == 0
     assert embeddings.usage.prompt_tokens == 5
@@ -1446,29 +1445,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
 async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
                                model_name: str):
     # test List[str]
-    inputs = [
+    input_texts = [
         "The cat sat on the mat.", "A feline was resting on a rug.",
         "Stars twinkle brightly in the night sky."
     ]
     embeddings = await client.embeddings.create(
         model=model_name,
-        input=inputs,
+        input=input_texts,
         encoding_format="float",
     )
     assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 3
+    assert len(embeddings.data) == 3
     assert len(embeddings.data[0].embedding) == 4096
 
     # test List[List[int]]
-    inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
-              [25, 32, 64, 77]]
+    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
+                    [25, 32, 64, 77]]
     embeddings = await client.embeddings.create(
         model=model_name,
-        input=inputs,
+        input=input_tokens,
         encoding_format="float",
     )
     assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 4
+    assert len(embeddings.data) == 4
     assert len(embeddings.data[0].embedding) == 4096
     assert embeddings.usage.completion_tokens == 0
     assert embeddings.usage.prompt_tokens == 17
```
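
Among the changes above, the embedding tests rename their request variables (`input` to `input_texts`, `inputs` to `input_tokens`) so they no longer shadow Python's built-in `input`. Below is a rough sketch of the resulting call pattern; the base URL and embedding model name are placeholders, and a vLLM OpenAI-compatible embedding server is assumed to be running.

```python
# Rough sketch of the renamed embedding inputs; the base URL and model
# name are placeholders, and a vLLM OpenAI-compatible embedding server
# is assumed to be running.
import asyncio

import openai


async def check_embeddings() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")

    # List[str] input: one embedding is returned per prompt string.
    input_texts = ["The chef prepared a delicious meal."]
    embeddings = await client.embeddings.create(model="my-embedding-model",
                                                input=input_texts,
                                                encoding_format="float")
    assert len(embeddings.data) == len(input_texts)

    # List[int] input: the endpoint also accepts pre-tokenized prompts.
    input_tokens = [1, 1, 1, 1, 1]
    embeddings = await client.embeddings.create(model="my-embedding-model",
                                                input=input_tokens,
                                                encoding_format="float")
    assert len(embeddings.data) == 1


if __name__ == "__main__":
    asyncio.run(check_embeddings())
```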

tests/tensorizer_loader/test_tensorizer.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -214,9 +214,8 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
                                              temperature=0.0)
 
     assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 1
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
+    assert len(completion.choices) == 1
+    assert len(completion.choices[0].text) >= 5
     assert completion.choices[0].finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
         completion_tokens=5, prompt_tokens=6, total_tokens=11)
```
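
The tensorizer test gets the same treatment as the completions tests above: once the redundant `is not None` guards are dropped, the length, `finish_reason`, and usage checks stand on their own. Below is a minimal sketch of that pattern with a placeholder base URL and model name, assuming a running vLLM OpenAI-compatible server.

```python
# Minimal sketch of the simplified completion assertions; placeholder
# base URL and model name, assuming a vLLM OpenAI-compatible server
# is already running.
import asyncio

import openai


async def check_completion() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    completion = await client.completions.create(model="my-model",
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)
    assert len(completion.choices) == 1
    assert len(completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"


if __name__ == "__main__":
    asyncio.run(check_completion())
```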
