Skip to content

Commit

Permalink
fix(telemetry): Single project identifier (kedro-org#701)
Browse files Browse the repository at this point in the history
* Moved pyproject config name to constant

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Implemented _get_or_create_project_uuid

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Refactored _get_project_properties

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed tests from hanging

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed _is_known_ci_env

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed test_before_command_run

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed TestKedroTelemetryCLIHooks

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed TestKedroTelemetryProjectHooks

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed writing to pyproject.toml

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed write mock

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Removed debug output

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Refactored _add_tool_properties

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Added debug message when pyproject_path does not exist

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated release notes

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Return None as project UUID in case of not generated

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed pre-commit errors

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated the way project UUID is stored

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Renamed project_uuid -> project_id

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Check if pyproject file relates to kedro

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Changed debug message as suggested

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Added OSError handling

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Fixed unit test

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

---------

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>
Signed-off-by: tgoelles <thomas.goelles@gmail.com>
  • Loading branch information
ElenaKhaustova authored and tgoelles committed Jun 6, 2024
1 parent e193107 commit 41455ec
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 34 deletions.
3 changes: 2 additions & 1 deletion kedro-telemetry/RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Upcoming release
* Updated the plugin to generate a unique project UUID for each Kedro project and store it in `pyproject.toml`.

# Release 0.4.0
* Updated the plugin to generate an unique UUID for each user of `kedro-telemetry`.
* Updated the plugin to generate a unique UUID for each user of `kedro-telemetry`.
* Added support for Python 3.12.

# Release 0.3.2
Expand Down
95 changes: 72 additions & 23 deletions kedro-telemetry/kedro_telemetry/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
}
TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
CONFIG_FILENAME = "telemetry.toml"
PYPROJECT_CONFIG_NAME = "pyproject.toml"
UNDEFINED_PACKAGE_NAME = "undefined_package_name"

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -78,10 +80,63 @@ def _get_or_create_uuid() -> str:
return ""


def _get_or_create_project_id(pyproject_path: Path) -> str | None:
    """
    Reads a project id from a configuration file or generates and saves a new one if not present.

    Args:
        pyproject_path: Path to the project's ``pyproject.toml`` file.

    Returns:
        The project id stored under ``[tool.kedro_telemetry]``, or a newly
        generated one that has been appended to the file. None if the file
        cannot be opened, read, parsed or written, if it has no
        ``[tool.kedro]`` section, or if an existing ``[tool.kedro_telemetry]``
        table has no ``project_id`` key.
    """
    try:
        # "r+" keeps the handle writable so a new id can be appended after parsing.
        with open(pyproject_path, "r+", encoding="utf-8") as file:
            try:
                pyproject_data = toml.load(file)
            except (toml.TomlDecodeError, UnicodeDecodeError) as exc:
                # Telemetry must not break the user's command because their
                # pyproject.toml is malformed.
                logger.error(
                    "Failed to parse the file: %s.\n%s", pyproject_path, exc
                )
                return None

            tool_tables = pyproject_data.get("tool")
            # Check if pyproject related to kedro
            if not isinstance(tool_tables, dict) or "kedro" not in tool_tables:
                logger.error(
                    "Failed to retrieve project id or save project id: "
                    "%s does not contain a [tool.kedro] section",
                    pyproject_path,
                )
                return None

            if "kedro_telemetry" not in tool_tables:
                project_id = uuid.uuid4().hex
                # toml.load has read the file to its end, so this write is
                # appended after the existing content.
                file.write(
                    f'\n[tool.kedro_telemetry]\nproject_id = "{project_id}"\n'
                )
                return project_id

            telemetry_table = tool_tables["kedro_telemetry"]
            if isinstance(telemetry_table, dict) and "project_id" in telemetry_table:
                return telemetry_table["project_id"]

            # Appending a second [tool.kedro_telemetry] header would make the
            # file invalid TOML, so give up rather than corrupt it.
            logger.error(
                "Failed to save project id: %s already contains a "
                "[tool.kedro_telemetry] section without a project_id",
                pyproject_path,
            )
            return None
    except OSError as exc:
        logger.error("Failed to read the file: %s.\n%s", pyproject_path, exc)
        return None


def _add_tool_properties(
properties: dict[str, Any], pyproject_path: Path
) -> dict[str, Any]:
"""
Extends project properties with tool's properties.
"""
if pyproject_path.exists():
with open(pyproject_path) as file:
pyproject_data = toml.load(file)

try:
tool_kedro = pyproject_data["tool"]["kedro"]
if "tools" in tool_kedro:
properties["tools"] = ", ".join(tool_kedro["tools"])
if "example_pipeline" in tool_kedro:
properties["example_pipeline"] = tool_kedro["example_pipeline"]
except KeyError:
pass

return properties


def _generate_new_uuid(full_path: str) -> str:
try:
config: dict[str, dict[str, Any]] = {}
config["telemetry"] = {}
config: dict[str, dict[str, Any]] = {"telemetry": {}}
new_uuid = uuid.uuid4().hex
config["telemetry"]["uuid"] = new_uuid

Expand Down Expand Up @@ -126,7 +181,7 @@ def before_command_run(
logger.debug("You have opted into product usage analytics.")
user_uuid = _get_or_create_uuid()
project_properties = _get_project_properties(
user_uuid, project_metadata.project_path
user_uuid, project_metadata.project_path / PYPROJECT_CONFIG_NAME
)
cli_properties = _format_user_cli_data(
project_properties, masked_command_args
Expand Down Expand Up @@ -177,7 +232,9 @@ def after_catalog_created(self, catalog):
default_pipeline = pipelines.get("__default__") # __default__
user_uuid = _get_or_create_uuid()

project_properties = _get_project_properties(user_uuid, self.project_path)
project_properties = _get_project_properties(
user_uuid, self.project_path / PYPROJECT_CONFIG_NAME
)

project_statistics_properties = _format_project_statistics_data(
project_properties, catalog, default_pipeline, pipelines
Expand All @@ -189,40 +246,32 @@ def after_catalog_created(self, catalog):
)


def _is_known_ci_env(known_ci_env_var_keys=KNOWN_CI_ENV_VAR_KEYS):
def _is_known_ci_env(known_ci_env_var_keys: set[str]):
# Most CI tools will set the CI environment variable to true
if os.getenv("CI") == "true":
return True
# Not all CI tools follow this convention, we can check through those that don't
return any(os.getenv(key) for key in known_ci_env_var_keys)


def _get_project_properties(user_uuid: str, project_path: Path) -> dict:
hashed_package_name = _hash(str(PACKAGE_NAME)) if PACKAGE_NAME else "undefined"
def _get_project_properties(user_uuid: str, pyproject_path: Path) -> dict:
project_id = _get_or_create_project_id(pyproject_path)
package_name = PACKAGE_NAME or UNDEFINED_PACKAGE_NAME
hashed_project_id = (
_hash(f"{project_id}{package_name}") if project_id is not None else None
)

properties = {
"username": user_uuid,
"package_name": hashed_package_name,
"project_id": hashed_project_id,
"project_version": KEDRO_VERSION,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
"os": sys.platform,
"is_ci_env": _is_known_ci_env(),
"is_ci_env": _is_known_ci_env(KNOWN_CI_ENV_VAR_KEYS),
}
pyproject_path = Path(project_path) / "pyproject.toml"
if pyproject_path.exists():
with open(pyproject_path) as file:
pyproject_data = toml.load(file)

if "tool" in pyproject_data and "kedro" in pyproject_data["tool"]:
if "tools" in pyproject_data["tool"]["kedro"]:
# convert list of tools to comma-separated string
properties["tools"] = ", ".join(
pyproject_data["tool"]["kedro"]["tools"]
)
if "example_pipeline" in pyproject_data["tool"]["kedro"]:
properties["example_pipeline"] = pyproject_data["tool"]["kedro"][
"example_pipeline"
]
properties = _add_tool_properties(properties, pyproject_path)

return properties

Expand Down
45 changes: 35 additions & 10 deletions kedro-telemetry/tests/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,18 @@ def test_before_command_run(self, mocker, fake_metadata):
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocker.patch(
"kedro_telemetry.plugin._get_or_create_project_id",
return_value="project_id",
)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
telemetry_hook = KedroTelemetryCLIHooks()
command_args = ["--version"]
telemetry_hook.before_command_run(fake_metadata, command_args)
expected_properties = {
"username": "user_uuid",
"package_name": "digested",
"project_id": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
Expand Down Expand Up @@ -180,6 +184,10 @@ def test_before_command_run_with_tools(self, mocker, fake_metadata):
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocker.patch(
"kedro_telemetry.plugin._get_or_create_project_id",
return_value="project_id",
)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
mocker.patch("builtins.open", mocker.mock_open(read_data=MOCK_PYPROJECT_TOOLS))
Expand All @@ -189,7 +197,7 @@ def test_before_command_run_with_tools(self, mocker, fake_metadata):
telemetry_hook.before_command_run(fake_metadata, command_args)
expected_properties = {
"username": "user_uuid",
"package_name": "digested",
"project_id": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
Expand Down Expand Up @@ -230,14 +238,18 @@ def test_before_command_run_empty_args(self, mocker, fake_metadata):
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocker.patch(
"kedro_telemetry.plugin._get_or_create_project_id",
return_value="project_id",
)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
telemetry_hook = KedroTelemetryCLIHooks()
command_args = []
telemetry_hook.before_command_run(fake_metadata, command_args)
expected_properties = {
"username": "user_uuid",
"package_name": "digested",
"project_id": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
Expand Down Expand Up @@ -300,7 +312,7 @@ def test_before_command_run_anonymous(self, mocker, fake_metadata):
mocked_anon_id = mocker.patch("kedro_telemetry.plugin._hash")
mocked_anon_id.return_value = "digested"
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch("builtins.open", side_effect=Exception)
mocker.patch("builtins.open", side_effect=OSError)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
telemetry_hook = KedroTelemetryCLIHooks()
Expand All @@ -309,7 +321,7 @@ def test_before_command_run_anonymous(self, mocker, fake_metadata):
expected_properties = {
"username": "",
"command": "kedro --version",
"package_name": "digested",
"project_id": None,
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
Expand Down Expand Up @@ -481,9 +493,15 @@ def test_after_context_created_without_kedro_run( # noqa: PLR0913
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocker.patch(
"kedro_telemetry.plugin._get_or_create_project_id",
return_value="project_id",
)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
mocker.patch("kedro_telemetry.plugin.open")
mocker.patch("kedro_telemetry.plugin.toml.load")
mocker.patch("kedro_telemetry.plugin.toml.dump")

# Without CLI invoked - i.e. `session.run` in Jupyter/IPython
telemetry_hook = KedroTelemetryProjectHooks()
Expand All @@ -492,7 +510,7 @@ def test_after_context_created_without_kedro_run( # noqa: PLR0913

project_properties = {
"username": "user_uuid",
"package_name": "digested",
"project_id": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
Expand All @@ -505,7 +523,6 @@ def test_after_context_created_without_kedro_run( # noqa: PLR0913
"number_of_pipelines": 2,
}
expected_properties = {**project_properties, **project_statistics}

expected_call = mocker.call(
event_name="Kedro Project Statistics",
identity="user_uuid",
Expand Down Expand Up @@ -537,8 +554,13 @@ def test_after_context_created_with_kedro_run( # noqa: PLR0913
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocker.patch(
"kedro_telemetry.plugin._get_or_create_project_id",
return_value="project_id",
)
mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
mocker.patch("kedro_telemetry.plugin.toml.load")
mocker.patch("kedro_telemetry.plugin.toml.dump")
# CLI run first
telemetry_cli_hook = KedroTelemetryCLIHooks()
command_args = ["--version"]
Expand All @@ -551,7 +573,7 @@ def test_after_context_created_with_kedro_run( # noqa: PLR0913

project_properties = {
"username": "user_uuid",
"package_name": "digested",
"project_id": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
Expand Down Expand Up @@ -596,6 +618,10 @@ def test_after_context_created_with_kedro_run_and_tools( # noqa: PLR0913
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocker.patch(
"kedro_telemetry.plugin._get_or_create_project_id",
return_value="project_id",
)
mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
mocker.patch("builtins.open", mocker.mock_open(read_data=MOCK_PYPROJECT_TOOLS))
mocker.patch("pathlib.Path.exists", return_value=True)
Expand All @@ -612,7 +638,7 @@ def test_after_context_created_with_kedro_run_and_tools( # noqa: PLR0913

project_properties = {
"username": "user_uuid",
"package_name": "digested",
"project_id": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
"python_version": sys.version,
Expand All @@ -633,7 +659,6 @@ def test_after_context_created_with_kedro_run_and_tools( # noqa: PLR0913
identity="user_uuid",
properties=expected_properties,
)

# CLI hook makes the first 2 calls, the 3rd one is the Project hook
assert mocked_heap_call.call_args_list[2] == expected_call

Expand Down

0 comments on commit 41455ec

Please sign in to comment.