From 235853f5ed738f9fd4230dd6ca8df7abf0c60510 Mon Sep 17 00:00:00 2001
From: "L. R. Couto" <57910428+lrcouto@users.noreply.github.com>
Date: Thu, 11 Apr 2024 12:17:10 -0300
Subject: [PATCH] Update spaceflights tutorial and starter requirements for
 kedro-datasets optional dependencies (#3664)

* Update spaceflights tutorial and starter requirements

Signed-off-by: lrcouto <laurarccouto@gmail.com>

* fix e2e tests

Signed-off-by: lrcouto <laurarccouto@gmail.com>

* Fix e2e tests by distinguishing `kedro-datasets` dependency for different python versions (#3802)

Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>

* Update docs/source/tutorial/tutorial_template.md

Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com>
Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com>

---------

Signed-off-by: lrcouto <laurarccouto@gmail.com>
Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com>
Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>
Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com>
Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
---
 Makefile                                      |  2 +-
 .../kedro_project_setup/dependencies.md       |  1 -
 docs/source/tutorial/tutorial_template.md     |  2 +-
 features/environment.py                       |  9 +++++-
 features/steps/cli_steps.py                   |  6 ++--
 .../requirements.txt                          |  3 +-
 features/steps/util.py                        | 29 +++++++++++++++++++
 pyproject.toml                                |  3 +-
 8 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index d76892857c..40a2126f55 100644
--- a/Makefile
+++ b/Makefile
@@ -46,7 +46,7 @@ package: clean install
 
 install-test-requirements:
 	python -m pip install -U "pip>=21.2"
-	pip install .[test]
+	pip install -U .[test]
 
 install-pre-commit:
 	pre-commit install --install-hooks
diff --git a/docs/source/kedro_project_setup/dependencies.md b/docs/source/kedro_project_setup/dependencies.md
index 576ab266da..0ff2a8aa32 100644
--- a/docs/source/kedro_project_setup/dependencies.md
+++ b/docs/source/kedro_project_setup/dependencies.md
@@ -45,7 +45,6 @@ For example, your workflow might require the `pandas.ExcelDataset`, so to instal
 From `kedro-datasets` version 3.0.0 onwards, the names of the optional dataset-level dependencies have been normalised to follow [PEP 685](https://peps.python.org/pep-0685/). The '.' character has been replaced with a '-' character and the names are in lowercase. For example, if you had `kedro-datasets[pandas.ExcelDataset]` in your requirements file, it would have to be changed to `kedro-datasets[pandas-exceldataset]`.
 ```
 
-
 ## Reproducible environments
 To ensure that the project dependencies and the transitive dependencies are pinned to specific versions, use [`pip-tools`](https://pypi.org/project/pip-tools/) to compile `requirements.txt` file into a `requirements.lock` file.
 To install `pip-tools` in your virtual environment, run the following command:
diff --git a/docs/source/tutorial/tutorial_template.md b/docs/source/tutorial/tutorial_template.md
index 2b2c45cc82..d8462f1b20 100644
--- a/docs/source/tutorial/tutorial_template.md
+++ b/docs/source/tutorial/tutorial_template.md
@@ -48,7 +48,7 @@ pytest~=7.2
 
 # Kedro dependencies and datasets to work with different data formats (including CSV, Excel, and Parquet)
 kedro~=0.19.0
-kedro-datasets[pandas.CSVDataset, pandas.ExcelDataset, pandas.ParquetDataset]>=1.1
+kedro-datasets[pandas-csvdataset, pandas-exceldataset, pandas-parquetdataset]>=3.0
 kedro-telemetry>=0.3.1
 kedro-viz~=6.0 # Visualise pipelines
 
diff --git a/features/environment.py b/features/environment.py
index 26a6090a6e..14be1445ef 100644
--- a/features/environment.py
+++ b/features/environment.py
@@ -5,6 +5,7 @@
 import os
 import shutil
 import subprocess
+import sys
 import tempfile
 import venv
 from pathlib import Path
@@ -14,6 +15,7 @@
 _PATHS_TO_REMOVE: set[Path] = set()
 
 FRESH_VENV_TAG = "fresh_venv"
+MINOR_PYTHON_38_VERSION = 8
 
 
 def call(cmd, env):
@@ -130,6 +132,11 @@ def _install_project_requirements(context):
         .splitlines()
     )
     install_reqs = [req for req in install_reqs if "{" not in req and "#" not in req]
-    install_reqs.append("kedro-datasets[pandas.CSVDataset]")
+    # For Python versions 3.9 and above we use the new dataset dependency format introduced in `kedro-datasets` 3.0.0
+    if sys.version_info.minor > MINOR_PYTHON_38_VERSION:
+        install_reqs.append("kedro-datasets[pandas-csvdataset]")
+    # For Python 3.8 we use the older `kedro-datasets` dependency format
+    else:
+        install_reqs.append("kedro-datasets[pandas.CSVDataset]")
     call([context.pip, "install", *install_reqs], env=context.env)
     return context
diff --git a/features/steps/cli_steps.py b/features/steps/cli_steps.py
index 414d366136..d3c808472e 100644
--- a/features/steps/cli_steps.py
+++ b/features/steps/cli_steps.py
@@ -554,7 +554,8 @@ def check_one_node_run(context, number):
 def check_correct_nodes_run(context, node):
     expected_log_line = f"Running node: {node}"
     stdout = context.result.stdout
-    assert expected_log_line in stdout, (
+    clean_logs = util.clean_up_log(stdout)
+    assert expected_log_line in clean_logs, (
         "Expected the following message segment to be printed on stdout: "
         f"{expected_log_line},\nbut got {stdout}"
     )
@@ -595,7 +596,8 @@ def check_message_printed(context, msg):
     else:
         stdout = context.result.stdout
 
-    assert msg in stdout, (
+    clean_logs = util.clean_up_log(stdout)
+    assert msg in clean_logs, (
         "Expected the following message segment to be printed on stdout: "
         f"{msg},\nbut got {stdout}"
     )
diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt b/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt
index fb756bd0f3..826097e88d 100644
--- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt	
+++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt	
@@ -3,7 +3,8 @@ ipython>=8.10
 jupyterlab>=3.0
 notebook
 kedro~={{ cookiecutter.kedro_version}}
-kedro-datasets[pandas.CSVDataset]
+kedro-datasets[pandas-csvdataset]; python_version >= "3.9"
+kedro-datasets[pandas.CSVDataset]<2.0.0; python_version < '3.9'
 kedro-telemetry>=0.3.1
 pytest-cov~=3.0
 pytest-mock>=1.7.1, <2.0
diff --git a/features/steps/util.py b/features/steps/util.py
index 74031232f1..f9c7b2c4e2 100644
--- a/features/steps/util.py
+++ b/features/steps/util.py
@@ -83,3 +83,32 @@ def parse_csv(text: str) -> list[str]:
         List of string tokens
     """
     return re.findall(r"\"(.+?)\"\s*,?", text)
+
+
+def clean_up_log(stdout: str) -> str:
+    """
+    Strips log levels (INFO, WARNING, ERROR) and ` <file>.py:<line>` markers
+    from log lines, then collapses all whitespace runs to single spaces.
+
+    Args:
+        stdout (str): The log output to be cleaned.
+
+    Returns:
+        str: Single-line cleaned output. NOTE: duplicates are not removed.
+    """
+    cleaned_lines = []
+    already_extracted = set()
+
+    for line in stdout.split("\n"):
+        if any(word in line for word in ["WARNING", "INFO", "ERROR"]):
+            # Remove log level words and ` name.py:123` source locations
+            cleaned_line = re.sub(r"\b(INFO|WARNING|ERROR)\b|\s+\w+\.py:\d+", "", line)
+            cleaned_lines.append(cleaned_line.strip())
+            already_extracted.add(line)
+        elif line not in already_extracted:  # NOTE(review): always true here
+            cleaned_lines.append(line)
+
+    cleaned_output = "\n".join(cleaned_lines)
+    cleaned_output = re.sub(r"\s+", " ", cleaned_output)
+
+    return cleaned_output.strip()
diff --git a/pyproject.toml b/pyproject.toml
index 79a632fea5..5c0f6ee88a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,8 @@ test = [
     "jupyterlab_server>=2.11.1",
     "jupyterlab>=3,<5",
     "jupyter~=1.0",
-    "kedro-datasets",
+    "kedro-datasets; python_version >= '3.9'",
+    "kedro-datasets<2.0.0; python_version < '3.9'",
     "mypy~=1.0",
     "pandas~=2.0",
     "pluggy>=1.0, <1.4", # pluggy 1.4 hide imports inside function and causing mocking issue