Fix test and make it faster (Giskard-AI#1592)
* Fix test and make it faster

* Fix code

* Fix another test

* Remove outdated test

* Fixing another test

* Split tests (Giskard-AI#1594)

* Split tests for executions

* Removing xdist for tests

* Revert "Removing xdist for tests"

This reverts commit 167f7c3.

* Splitting more

* Stabilize tests for concurrency

---------

Co-authored-by: Hartorn <bazire@giskard.ai>
mattbit and Hartorn authored Nov 9, 2023
1 parent 1f1e510 commit 8d1610c
Showing 6 changed files with 62 additions and 46 deletions.
23 changes: 19 additions & 4 deletions .github/workflows/build-python.yml
@@ -161,12 +161,27 @@ jobs:
pdm run pip freeze | grep -q '^pandas==${{ matrix.pandas_v1 && '1' || '2' }}\.'
- name: Test code (concurrency)
run: pdm run test-worker
run: pdm test-worker

- name: Test code
- name: Tests integrations/
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm run test-fast
run: pdm test-integrations

- name: Tests models/
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm run test-models

- name: Tests scan/
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm test-scan

- name: Test others
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm test-fast

- name: SonarCloud Scan
if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' && !matrix.langchain_minimal && !matrix.pandas_v1 && !matrix.pydantic_v1 && (github.event.ref == 'refs/heads/main' || github.event_name == 'pull_request')}}
@@ -192,7 +207,7 @@ jobs:
if: ${{ inputs.run-integration-tests && matrix.os != 'windows-2019' }}
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: 2
run: pdm run test -m 'slow'
run: pdm test-slow
install-poetry:
name: "Check if wheel can be installed with using Poetry"
runs-on: ubuntu-latest
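The worker-count setting in each new step uses the GitHub Actions ternary idiom (cond && a || b). As an illustrative mirror of the YAML expression, not code from the repository:

    def xdist_workers(runner_os: str) -> int:
        # Mirrors ${{ startsWith(matrix.os, 'windows-') && 1 || 2 }}:
        # Windows runners get a single pytest-xdist worker, all others get two.
        return 1 if runner_os.startswith("windows-") else 2

    assert xdist_workers("windows-2019") == 1
    assert xdist_workers("ubuntu-latest") == 2

pytest-xdist consults PYTEST_XDIST_AUTO_NUM_WORKERS when -n auto is passed, so each step can cap parallelism per OS without modifying the pdm scripts themselves.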
11 changes: 8 additions & 3 deletions pyproject.toml
@@ -29,10 +29,15 @@ name = "torch"
[tool.pdm.scripts]
_.env = { GSK_DISABLE_ANALYTICS = "True", GISKARD_DEV_MODE = "True" }
# add "-n auto" to the pytest command to parallelize the execution
test.cmd = "pytest -c pyproject.toml tests --cov=giskard --cov-report=xml --disable-warnings --no-header -vv --durations=0"
test.cmd = "pdm base-test tests/"
# for some reason github runners don't work when calling 'pdm test -m "not slow"'
test-fast.cmd = "pytest -n auto -m 'not slow and not concurrency' -c pyproject.toml tests --cov=giskard --cov-report=xml --disable-warnings --no-header -vv --durations=0"
test-worker.cmd = "pytest -m 'concurrency' -c pyproject.toml tests --disable-warnings --no-header -vv --durations=0"
base-test.cmd = "pytest -c pyproject.toml --cov=giskard --cov-report=xml --cov-append --disable-warnings --no-header -vv --durations=0"
test-models.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests/models"
test-integrations.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests/integrations"
test-scan.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests/scan"
test-fast.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests --ignore=tests/models --ignore=tests/integrations --ignore=tests/scan"
test-slow.cmd = "pdm base-test -n auto -m 'slow and not concurrency' tests"
test-worker.cmd = "pdm base-test -m 'concurrency' tests/"
lint = "ruff giskard tests"
doc = "sphinx-build docs docs/_build/html"
watch-doc = "python -m sphinx_autobuild --watch giskard docs docs/_build/html"
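The new scripts are composite pdm scripts: each one invokes pdm base-test and appends its own pytest arguments, and --cov-append lets the split runs accumulate into a single coverage report instead of overwriting each other. The -m filters assume tests are tagged with the slow and concurrency markers, roughly as in this sketch (test names are illustrative, and marker registration is assumed to live in the pytest configuration):

    import time

    import pytest

    @pytest.mark.slow          # selected by pdm test-slow (-m 'slow and not concurrency')
    def test_expensive_end_to_end():
        time.sleep(5)          # stand-in for a long-running check

    @pytest.mark.concurrency   # selected by pdm test-worker (-m 'concurrency'), run without xdist
    def test_worker_pool_behaviour():
        ...

    def test_quick_unit():     # unmarked: collected by the fast, parallel suites
        assert 1 + 1 == 2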
9 changes: 7 additions & 2 deletions tests/models/langchain/test_qa_retreiver.py
@@ -1,4 +1,4 @@
from typing import Optional, Callable, Any, Iterable, Dict
from typing import Any, Callable, Dict, Iterable, Optional

import pandas as pd
import pytest
@@ -81,7 +81,12 @@ def test_vectorstore():
query = "What did the president say about Ketanji Brown Jackson"

model = FaissRetrieverModel(
qa, model_type=SupportedModelTypes.TEXT_GENERATION, retriever=docsearch, feature_names=["query"]
qa,
model_type=SupportedModelTypes.TEXT_GENERATION,
name="Test model",
description="Test model",
retriever=docsearch,
feature_names=["query"],
)
dataset = giskard.Dataset(pd.DataFrame({"query": [query]}))

35 changes: 9 additions & 26 deletions tests/scan/test_scanner.py
@@ -5,8 +5,6 @@
import numpy as np
import pandas as pd
import pytest
from langchain import LLMChain, PromptTemplate
from langchain.llms.fake import FakeListLLM

from giskard import Dataset, GiskardClient, Model
from giskard.core.core import ModelMeta, SupportedModelTypes
@@ -101,8 +99,15 @@ def test_default_dataset_is_used_with_generative_model():
def fake_model(*args, **kwargs):
return None

model = Model(model=fake_model, model_type=SupportedModelTypes.TEXT_GENERATION)
# model.is_text_generation = True
model = Model(
model=fake_model,
model_type=SupportedModelTypes.TEXT_GENERATION,
name="test",
description="test",
feature_names=["query"],
target="query",
)

model.meta = ModelMeta(
"Model name",
"Some meaningful model description",
@@ -123,28 +128,6 @@ def fake_model(*args, **kwargs):
generate_test_dataset.assert_called_once()


@pytest.mark.skip(reason="Now rely on LLM generated issues")
@pytest.mark.slow
def test_generative_model_dataset():
llm = FakeListLLM(responses=["Are you dumb or what?", "I don't know and I don't want to know."] * 100)
prompt = PromptTemplate(template="{instruct}: {question}", input_variables=["instruct", "question"])
chain = LLMChain(llm=llm, prompt=prompt)
model = Model(chain, model_type="text_generation")
dataset = Dataset(
pd.DataFrame(
{
"instruct": ["Paraphrase this", "Answer this question"],
"question": ["Who is the mayor of Rome?", "How many bridges are there in Paris?"],
}
),
column_types={"instruct": "text", "question": "text"},
)

scanner = Scanner()
result = scanner.analyze(model, dataset)
assert result.has_issues()


@pytest.mark.skip(reason="For active testing of the UI")
@pytest.mark.parametrize(
"dataset_name,model_name",
16 changes: 11 additions & 5 deletions tests/scan/test_text_perturbation_detector.py
@@ -1,12 +1,11 @@
import numpy as np
import pandas as pd
import pytest
from langchain import LLMChain, PromptTemplate
from langchain.llms.fake import FakeListLLM

import giskard
from giskard import Dataset, Model
from giskard.scanner.robustness.text_perturbation_detector import TextPerturbationDetector
from langchain import LLMChain, PromptTemplate


def test_perturbation_classification(titanic_model, titanic_dataset):
@@ -40,12 +39,17 @@ def test_text_perturbation_works_with_nan_values():
assert len(issues) == 0


@pytest.mark.slow
def test_llm_text_transformation():
llm = FakeListLLM(responses=["Are you dumb or what?", "I don't know and I don't want to know."] * 100)
prompt = PromptTemplate(template="{instruct}: {question}", input_variables=["instruct", "question"])
chain = LLMChain(llm=llm, prompt=prompt)
model = Model(chain, model_type="text_generation")
model = Model(
chain,
model_type="text_generation",
name="Test model",
description="Test model description",
feature_names=["instruct", "question"],
)

dataset = Dataset(
pd.DataFrame(
@@ -57,5 +61,7 @@ def test_llm_text_transformation():
column_types={"instruct": "text", "question": "text"},
)

analyzer = TextPerturbationDetector()
from giskard.scanner.robustness.text_transformations import TextTypoTransformation

analyzer = TextPerturbationDetector(transformations=[TextTypoTransformation])
analyzer.run(model, dataset)
14 changes: 8 additions & 6 deletions tests/test_worker_pool.py
@@ -8,7 +8,8 @@

import pytest

from giskard.utils.worker_pool import PoolState, WorkerPoolExecutor, GiskardMLWorkerException
from giskard.utils.worker_pool import GiskardMLWorkerException, PoolState, WorkerPoolExecutor


@pytest.fixture(scope="function")
def many_worker_pool():
@@ -127,7 +128,7 @@ def test_task_should_be_cancelled(one_worker_pool: WorkerPoolExecutor):
def test_after_cancel_should_work(one_worker_pool: WorkerPoolExecutor):
pid = set(one_worker_pool.processes.keys())
assert len(pid) == 1
future = one_worker_pool.schedule(sleep_add_one, [100, 1], timeout=3)
future = one_worker_pool.schedule(sleep_add_one, [100, 1], timeout=10)
with pytest.raises(TimeoutError) as exc_info:
future.result()
assert "Task took too long" in str(exc_info)
@@ -142,6 +143,7 @@ def test_after_cancel_should_work(one_worker_pool: WorkerPoolExecutor):
future = one_worker_pool.submit(add_one, 4)
assert future.result() == 5


@pytest.mark.concurrency
def test_after_cancel_should_shutdown_nicely():
one_worker_pool = WorkerPoolExecutor(nb_workers=1)
@@ -163,11 +165,9 @@ def test_after_cancel_should_shutdown_nicely():
assert exit_codes == [0]




@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not working on windows")
@pytest.mark.concurrency
def test_many_tasks_should_shutdown_nicely(many_worker_pool : WorkerPoolExecutor):
def test_many_tasks_should_shutdown_nicely(many_worker_pool: WorkerPoolExecutor):
sleep(3)
futures = []
for _ in range(100):
@@ -176,10 +176,11 @@ def test_many_tasks_should_shutdown_nicely(many_worker_pool : WorkerPoolExecutor
exit_codes = many_worker_pool.shutdown(wait=True, timeout=60)
assert len([code is not None for code in exit_codes]) == 4
assert all([code is not None for code in exit_codes])
assert exit_codes == [0,0,0,0]
assert exit_codes == [0, 0, 0, 0]
assert all([f.done() for f in futures])
assert all([not t.is_alive() for t in many_worker_pool._threads])


@pytest.mark.concurrency
def test_submit_many_task(many_worker_pool: WorkerPoolExecutor):
futures = []
@@ -191,6 +192,7 @@ def test_submit_many_task(many_worker_pool: WorkerPoolExecutor):
for expected, future in enumerate(futures):
assert expected + 1 == future.result()


@pytest.mark.concurrency
def test_task_already_cancelled(one_worker_pool: WorkerPoolExecutor):
for _ in range(10):
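The schedule timeout here was raised from 3 to 10 seconds, presumably to give loaded runners more headroom. For reference, a minimal sketch of the executor semantics these tests rely on (helper bodies are stand-ins, and argument names follow the test code above rather than any API documentation):

    import time

    from giskard.utils.worker_pool import WorkerPoolExecutor

    def sleep_add_one(secs, value):
        # stand-in for the helper defined in tests/test_worker_pool.py
        time.sleep(secs)
        return value + 1

    def add_one(value):
        return value + 1

    pool = WorkerPoolExecutor(nb_workers=1)

    # A task sleeping 100 s but scheduled with a 10 s timeout gets killed...
    future = pool.schedule(sleep_add_one, [100, 1], timeout=10)
    try:
        future.result()
    except TimeoutError:
        pass  # "Task took too long": the stuck worker process is replaced

    # ...and the pool must remain usable afterwards.
    assert pool.submit(add_one, 4).result() == 5

    # shutdown() returns the workers' exit codes; 0 means clean termination.
    assert pool.shutdown(wait=True, timeout=60) == [0]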
