Fix test and make it faster (Giskard-AI#1592)
* Fix test and make it faster

* Fix code

* Fix another test

* Remove outdated test

* Fixing another test

* Split tests (Giskard-AI#1594)

* Split tests for executions

* Removing xdist for tests

* Revert "Removing xdist for tests"

This reverts commit 167f7c3.

* Splitting more

* Stabilize tests for concurrency

---------

Co-authored-by: Hartorn <bazire@giskard.ai>
mattbit and Hartorn authored Nov 9, 2023
1 parent 1f1e510 commit 8d1610c
Showing 6 changed files with 62 additions and 46 deletions.
23 changes: 19 additions & 4 deletions .github/workflows/build-python.yml
@@ -161,12 +161,27 @@ jobs:
pdm run pip freeze | grep -q '^pandas==${{ matrix.pandas_v1 && '1' || '2' }}\.'
- name: Test code (concurrency)
run: pdm run test-worker
run: pdm test-worker

- name: Test code
- name: Tests integrations/
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm run test-fast
run: pdm test-integrations

- name: Tests models/
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm run test-models

- name: Tests scan/
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm test-scan

- name: Test others
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: ${{ startsWith(matrix.os,'windows-') && 1 || 2 }}
run: pdm test-fast

- name: SonarCloud Scan
if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' && !matrix.langchain_minimal && !matrix.pandas_v1 && !matrix.pydantic_v1 && (github.event.ref == 'refs/heads/main' || github.event_name == 'pull_request')}}
@@ -192,7 +207,7 @@ jobs:
if: ${{ inputs.run-integration-tests && matrix.os != 'windows-2019' }}
env:
PYTEST_XDIST_AUTO_NUM_WORKERS: 2
run: pdm run test -m 'slow'
run: pdm test-slow
install-poetry:
name: "Check if wheel can be installed with using Poetry"
runs-on: ubuntu-latest
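The worker-count setting in each new step uses the GitHub Actions ternary idiom (cond && a || b). As an illustrative mirror of the YAML expression, not code from the repository:

    def xdist_workers(runner_os: str) -> int:
        # Mirrors ${{ startsWith(matrix.os, 'windows-') && 1 || 2 }}:
        # Windows runners get a single pytest-xdist worker, all others get two.
        return 1 if runner_os.startswith("windows-") else 2

    assert xdist_workers("windows-2019") == 1
    assert xdist_workers("ubuntu-latest") == 2

pytest-xdist consults PYTEST_XDIST_AUTO_NUM_WORKERS when -n auto is passed, so each step can cap parallelism per OS without modifying the pdm scripts themselves.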
11 changes: 8 additions & 3 deletions pyproject.toml
@@ -29,10 +29,15 @@ name = "torch"
[tool.pdm.scripts]
_.env = { GSK_DISABLE_ANALYTICS = "True", GISKARD_DEV_MODE = "True" }
# add "-n auto" to the pytest command to parallelize the execution
test.cmd = "pytest -c pyproject.toml tests --cov=giskard --cov-report=xml --disable-warnings --no-header -vv --durations=0"
test.cmd = "pdm base-test tests/"
# for some reason github runners don't work when calling 'pdm test -m "not slow"'
test-fast.cmd = "pytest -n auto -m 'not slow and not concurrency' -c pyproject.toml tests --cov=giskard --cov-report=xml --disable-warnings --no-header -vv --durations=0"
test-worker.cmd = "pytest -m 'concurrency' -c pyproject.toml tests --disable-warnings --no-header -vv --durations=0"
base-test.cmd = "pytest -c pyproject.toml --cov=giskard --cov-report=xml --cov-append --disable-warnings --no-header -vv --durations=0"
test-models.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests/models"
test-integrations.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests/integrations"
test-scan.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests/scan"
test-fast.cmd = "pdm base-test -n auto -m 'not slow and not concurrency' tests --ignore=tests/models --ignore=tests/integrations --ignore=tests/scan"
test-slow.cmd = "pdm base-test -n auto -m 'slow and not concurrency' tests"
test-worker.cmd = "pdm base-test -m 'concurrency' tests/"
lint = "ruff giskard tests"
doc = "sphinx-build docs docs/_build/html"
watch-doc = "python -m sphinx_autobuild --watch giskard docs docs/_build/html"
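The new scripts are composite pdm scripts: each one invokes pdm base-test and appends its own pytest arguments, and --cov-append lets the split runs accumulate into a single coverage report instead of overwriting each other. The -m filters assume tests are tagged with the slow and concurrency markers, roughly as in this sketch (test names are illustrative, and marker registration is assumed to live in the pytest configuration):

    import time

    import pytest

    @pytest.mark.slow          # selected by pdm test-slow (-m 'slow and not concurrency')
    def test_expensive_end_to_end():
        time.sleep(5)          # stand-in for a long-running check

    @pytest.mark.concurrency   # selected by pdm test-worker (-m 'concurrency'), run without xdist
    def test_worker_pool_behaviour():
        ...

    def test_quick_unit():     # unmarked: collected by the fast, parallel suites
        assert 1 + 1 == 2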
9 changes: 7 additions & 2 deletions tests/models/langchain/test_qa_retreiver.py
@@ -1,4 +1,4 @@
from typing import Optional, Callable, Any, Iterable, Dict
from typing import Any, Callable, Dict, Iterable, Optional

import pandas as pd
import pytest
@@ -81,7 +81,12 @@ def test_vectorstore():
query = "What did the president say about Ketanji Brown Jackson"

model = FaissRetrieverModel(
qa, model_type=SupportedModelTypes.TEXT_GENERATION, retriever=docsearch, feature_names=["query"]
qa,
model_type=SupportedModelTypes.TEXT_GENERATION,
name="Test model",
description="Test model",
retriever=docsearch,
feature_names=["query"],
)
dataset = giskard.Dataset(pd.DataFrame({"query": [query]}))

35 changes: 9 additions & 26 deletions tests/scan/test_scanner.py
@@ -5,8 +5,6 @@
import numpy as np
import pandas as pd
import pytest
from langchain import LLMChain, PromptTemplate
from langchain.llms.fake import FakeListLLM

from giskard import Dataset, GiskardClient, Model
from giskard.core.core import ModelMeta, SupportedModelTypes
@@ -101,8 +99,15 @@ def test_default_dataset_is_used_with_generative_model():
def fake_model(*args, **kwargs):
return None

model = Model(model=fake_model, model_type=SupportedModelTypes.TEXT_GENERATION)
# model.is_text_generation = True
model = Model(
model=fake_model,
model_type=SupportedModelTypes.TEXT_GENERATION,
name="test",
description="test",
feature_names=["query"],
target="query",
)

model.meta = ModelMeta(
"Model name",
"Some meaningful model description",
@@ -123,28 +128,6 @@ def fake_model(*args, **kwargs):
generate_test_dataset.assert_called_once()


@pytest.mark.skip(reason="Now rely on LLM generated issues")
@pytest.mark.slow
def test_generative_model_dataset():
llm = FakeListLLM(responses=["Are you dumb or what?", "I don't know and I don't want to know."] * 100)
prompt = PromptTemplate(template="{instruct}: {question}", input_variables=["instruct", "question"])
chain = LLMChain(llm=llm, prompt=prompt)
model = Model(chain, model_type="text_generation")
dataset = Dataset(
pd.DataFrame(
{
"instruct": ["Paraphrase this", "Answer this question"],
"question": ["Who is the mayor of Rome?", "How many bridges are there in Paris?"],
}
),
column_types={"instruct": "text", "question": "text"},
)

scanner = Scanner()
result = scanner.analyze(model, dataset)
assert result.has_issues()


@pytest.mark.skip(reason="For active testing of the UI")
@pytest.mark.parametrize(
"dataset_name,model_name",
16 changes: 11 additions & 5 deletions tests/scan/test_text_perturbation_detector.py
@@ -1,12 +1,11 @@
import numpy as np
import pandas as pd
import pytest
from langchain import LLMChain, PromptTemplate
from langchain.llms.fake import FakeListLLM

import giskard
from giskard import Dataset, Model
from giskard.scanner.robustness.text_perturbation_detector import TextPerturbationDetector
from langchain import LLMChain, PromptTemplate


def test_perturbation_classification(titanic_model, titanic_dataset):
@@ -40,12 +39,17 @@ def test_text_perturbation_works_with_nan_values():
assert len(issues) == 0


@pytest.mark.slow
def test_llm_text_transformation():
llm = FakeListLLM(responses=["Are you dumb or what?", "I don't know and I don't want to know."] * 100)
prompt = PromptTemplate(template="{instruct}: {question}", input_variables=["instruct", "question"])
chain = LLMChain(llm=llm, prompt=prompt)
model = Model(chain, model_type="text_generation")
model = Model(
chain,
model_type="text_generation",
name="Test model",
description="Test model description",
feature_names=["instruct", "question"],
)

dataset = Dataset(
pd.DataFrame(
@@ -57,5 +61,7 @@ def test_llm_text_transformation():
column_types={"instruct": "text", "question": "text"},
)

analyzer = TextPerturbationDetector()
from giskard.scanner.robustness.text_transformations import TextTypoTransformation

analyzer = TextPerturbationDetector(transformations=[TextTypoTransformation])
analyzer.run(model, dataset)
14 changes: 8 additions & 6 deletions tests/test_worker_pool.py
@@ -8,7 +8,8 @@

import pytest

from giskard.utils.worker_pool import PoolState, WorkerPoolExecutor, GiskardMLWorkerException
from giskard.utils.worker_pool import GiskardMLWorkerException, PoolState, WorkerPoolExecutor


@pytest.fixture(scope="function")
def many_worker_pool():
@@ -127,7 +128,7 @@ def test_task_should_be_cancelled(one_worker_pool: WorkerPoolExecutor):
def test_after_cancel_should_work(one_worker_pool: WorkerPoolExecutor):
pid = set(one_worker_pool.processes.keys())
assert len(pid) == 1
future = one_worker_pool.schedule(sleep_add_one, [100, 1], timeout=3)
future = one_worker_pool.schedule(sleep_add_one, [100, 1], timeout=10)
with pytest.raises(TimeoutError) as exc_info:
future.result()
assert "Task took too long" in str(exc_info)
@@ -142,6 +143,7 @@ def test_after_cancel_should_work(one_worker_pool: WorkerPoolExecutor):
future = one_worker_pool.submit(add_one, 4)
assert future.result() == 5


@pytest.mark.concurrency
def test_after_cancel_should_shutdown_nicely():
one_worker_pool = WorkerPoolExecutor(nb_workers=1)
@@ -163,11 +165,9 @@ def test_after_cancel_should_shutdown_nicely():
assert exit_codes == [0]




@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not working on windows")
@pytest.mark.concurrency
def test_many_tasks_should_shutdown_nicely(many_worker_pool : WorkerPoolExecutor):
def test_many_tasks_should_shutdown_nicely(many_worker_pool: WorkerPoolExecutor):
sleep(3)
futures = []
for _ in range(100):
@@ -176,10 +176,11 @@ def test_many_tasks_should_shutdown_nicely(many_worker_pool : WorkerPoolExecutor
exit_codes = many_worker_pool.shutdown(wait=True, timeout=60)
assert len([code is not None for code in exit_codes]) == 4
assert all([code is not None for code in exit_codes])
assert exit_codes == [0,0,0,0]
assert exit_codes == [0, 0, 0, 0]
assert all([f.done() for f in futures])
assert all([not t.is_alive() for t in many_worker_pool._threads])


@pytest.mark.concurrency
def test_submit_many_task(many_worker_pool: WorkerPoolExecutor):
futures = []
@@ -191,6 +192,7 @@ def test_submit_many_task(many_worker_pool: WorkerPoolExecutor):
for expected, future in enumerate(futures):
assert expected + 1 == future.result()


@pytest.mark.concurrency
def test_task_already_cancelled(one_worker_pool: WorkerPoolExecutor):
for _ in range(10):
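The schedule timeout here was raised from 3 to 10 seconds, presumably to give loaded runners more headroom. For reference, a minimal sketch of the executor semantics these tests rely on (helper bodies are stand-ins, and argument names follow the test code above rather than any API documentation):

    import time

    from giskard.utils.worker_pool import WorkerPoolExecutor

    def sleep_add_one(secs, value):
        # stand-in for the helper defined in tests/test_worker_pool.py
        time.sleep(secs)
        return value + 1

    def add_one(value):
        return value + 1

    pool = WorkerPoolExecutor(nb_workers=1)

    # A task sleeping 100 s but scheduled with a 10 s timeout gets killed...
    future = pool.schedule(sleep_add_one, [100, 1], timeout=10)
    try:
        future.result()
    except TimeoutError:
        pass  # "Task took too long": the stuck worker process is replaced

    # ...and the pool must remain usable afterwards.
    assert pool.submit(add_one, 4).result() == 5

    # shutdown() returns the workers' exit codes; 0 means clean termination.
    assert pool.shutdown(wait=True, timeout=60) == [0]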
