diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..75119b5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,36 @@ +# syntax=docker/dockerfile:1 + +ARG PY_VERSION=3.10 +FROM python:${PY_VERSION}-buster as builder + +WORKDIR /py4ai-data + +RUN apt-get update && apt-get upgrade -y + +COPY LICENSE MANIFEST.in versioneer.py setup.py pyproject.toml README.md Makefile ./ +COPY requirements requirements +COPY py4ai py4ai +COPY tests tests + +RUN addgroup --system tester && adduser --system --group tester +RUN chown -R tester:tester /py4ai-data +ENV PATH ${PATH}:/home/tester/.local/bin +USER tester + +# Run checks as the tester user: switching to a non-root user is a best practice. +RUN make checks + +FROM python:${PY_VERSION}-slim-buster +WORKDIR /py4ai-data +COPY --from=builder /py4ai-data/dist /py4ai-data/dist + +RUN apt-get update && apt-get upgrade -y && apt-get install gcc libc6-dev -y --no-install-recommends --fix-missing + +RUN addgroup --system runner && adduser --system --group runner +RUN chown -R runner:runner /py4ai-data +ENV PATH ${PATH}:/home/runner/.local/bin +USER runner + +RUN pip install --upgrade pip +RUN ls -t ./dist/*.tar.gz | xargs pip install +ENTRYPOINT ["python"] diff --git a/Makefile b/Makefile index dc76cd3..3a37a47 100644 --- a/Makefile +++ b/Makefile @@ -11,16 +11,22 @@ files := $(shell find . 
-name "*.py") doc_files := $(shell find sphinx -name "*.*") # Uncomment to store cache installation in the environment -# package_dir := $(shell python -c 'import site; print(site.getsitepackages()[0])') -package_dir := .make_cache +# cache_dir := $(shell python -c 'import site; print(site.getsitepackages()[0])') +cache_dir := .make_cache package_name=$(shell python -c "import tomli;from pathlib import Path;print(tomli.loads(Path('pyproject.toml').read_text(encoding='utf-8'))['project']['name'])") -$(shell mkdir -p $(package_dir)) +$(shell mkdir -p $(cache_dir)) + +pre_deps_tag := $(cache_dir)/.pre_deps +env_tag := $(cache_dir)/.env_tag +env_dev_tag := $(cache_dir)/.env_dev_tag +install_tag := $(cache_dir)/.install_tag +docker_build_tag := $(cache_dir)/.docker_build_tag + +project_name := py4ai-data +registry := ghcr.io +image_name := $(registry)/nicoladonelli/$(project_name) -pre_deps_tag := $(package_dir)/.pre_deps -env_tag := $(package_dir)/.env_tag -env_dev_tag := $(package_dir)/.env_dev_tag -install_tag := $(package_dir)/.install_tag # ====================== # Rules and Dependencies @@ -49,6 +55,8 @@ help: @echo " - docs to produce documentation in html format using sphinx as configured in pyproject.toml" @echo " - checks to run mypy, lint, bandit, licensecheck, tests and check formatting altogether" @echo " - clean to remove cache file" + @echo " - docker_build to build docker image according to Dockerfile, tagged with app version" + @echo " - docker_run to run latest built docker image" @echo "------------------------------------" $(pre_deps_tag): @@ -148,3 +156,19 @@ clean: rm -rf sphinx/source/api rm -rf $(shell find . -name "*.pyc") $(shell find . 
-name "__pycache__") rm -rf *.egg-info .mypy_cache .pytest_cache .make_cache $(env_tag) $(env_dev_tag) $(install_tag) + +$(docker_build_tag): Dockerfile requirements/requirements.txt py4ai pyproject.toml + @echo "==Building docker container==" + TAG=$$(${PYTHON} py4ai/data/_version.py); \ + PYTHON_VERSION=$$(python --version); \ + PYTHON_VERSION="$${PYTHON_VERSION#Python }"; \ + PYTHON_VERSION="$${PYTHON_VERSION%.*}"; \ + docker build -t $(image_name):"$${TAG}" --build-arg PY_VERSION=$${PYTHON_VERSION} .; \ + VERSION=$$(cat $(docker_build_tag)); \ + if [[ "$${VERSION}" != "$${TAG}" ]]; then echo "==Updating docker version tag=="; echo "$${TAG}" > $(docker_build_tag); fi + +docker_build: $(docker_build_tag) + +docker_run: $(docker_build_tag) + @echo "==Run detached docker image '$(project_name)' from '$(image_name):$$(cat $(docker_build_tag))' container==" + docker run --rm -it --name $(project_name) $(image_name):$$(cat $(docker_build_tag)) diff --git a/py4ai/data/_version.py b/py4ai/data/_version.py index 6866f4a..3a6fa6e 100644 --- a/py4ai/data/_version.py +++ b/py4ai/data/_version.py @@ -691,3 +691,7 @@ def get_versions() -> Dict[str, Any]: "error": "unable to compute version", "date": None, } + + +if __name__ == "__main__": + print(get_versions()["version"].replace("+", ".")) diff --git a/py4ai/data/model/ml.py b/py4ai/data/model/ml.py index 906c1a9..e4f87b5 100644 --- a/py4ai/data/model/ml.py +++ b/py4ai/data/model/ml.py @@ -1,7 +1,7 @@ """Module for specifying data-models to be used in modelling.""" -import sys from abc import ABC, abstractmethod +from itertools import islice from typing import ( Any, Dict, @@ -20,6 +20,7 @@ import numpy as np import pandas as pd +from numpy.typing import NDArray from pandas import DataFrame, Series from py4ai.core.types import T from py4ai.core.utils.decorators import lazyproperty as lazy @@ -36,18 +37,12 @@ RegisterLazyCachedIterables, ) -if sys.version_info[0] < 3: - from itertools import islice - from itertools import 
izip as zip -else: - from itertools import islice - TPandasDataset = TypeVar("TPandasDataset", bound="PandasDataset") # type: ignore TDatasetUtilsMixin = TypeVar("TDatasetUtilsMixin", bound="DatasetUtilsMixin") # type: ignore FeatType = TypeVar( "FeatType", - bound=Union[List[Any], Tuple[Any], np.ndarray[Any, np.dtype[Any]], Dict[str, Any]], + bound=Union[List[Any], Tuple[Any], NDArray[Any], Dict[str, Any]], ) LabType = TypeVar("LabType", int, float, None) FeaturesType = Union[ @@ -118,11 +113,11 @@ def __init__( self.name: Optional[Union[str, int, Any]] = name -class MultiFeatureSample(Sample[List[np.ndarray[Any, Any]], LabType]): +class MultiFeatureSample(Sample[List[NDArray[Any]], LabType]): """Class representing an observation defined by a nested list of arrays.""" @staticmethod - def _check_features(features: List[np.ndarray[Any, Any]]) -> None: + def _check_features(features: List[NDArray[Any]]) -> None: """ Check that features is list of lists. @@ -138,7 +133,7 @@ def _check_features(features: List[np.ndarray[Any, Any]]) -> None: def __init__( self, - features: List[np.ndarray[Any, Any]], + features: List[NDArray[Any]], label: Optional[LabType] = None, name: Optional[str] = None, ) -> None: @@ -189,7 +184,7 @@ def checkNames(x: Optional[Union[str, int, Any]]) -> Union[str, int]: return x if isinstance(x, int) else str(x) @overload - def getFeaturesAs(self, type: Literal["array"]) -> np.ndarray[Any, Any]: + def getFeaturesAs(self, type: Literal["array"]) -> NDArray[Any]: ... @overload @@ -244,7 +239,7 @@ def getFeaturesAs(self, type: AllowedTypes = "array") -> FeaturesType[FeatType]: raise ValueError(f"Type {type} not allowed") @overload - def getLabelsAs(self, type: Literal["array"]) -> np.ndarray[Any, Any]: + def getLabelsAs(self, type: Literal["array"]) -> NDArray[Any]: ... 
@overload @@ -393,7 +388,7 @@ def labels(self) -> Iterator[LabType]: return self.getLabelsAs("lazy") @overload - def getFeaturesAs(self, type: Literal["array"]) -> np.ndarray[Any, Any]: + def getFeaturesAs(self, type: Literal["array"]) -> NDArray[Any]: ... @overload @@ -422,7 +417,7 @@ def getFeaturesAs(self, type: AllowedTypes = "lazy") -> FeaturesType[FeatType]: return super(LazyDataset, self).getFeaturesAs(type) @overload - def getLabelsAs(self, type: Literal["array"]) -> np.ndarray[Any, Any]: + def getLabelsAs(self, type: Literal["array"]) -> NDArray[Any]: ... @overload @@ -666,7 +661,7 @@ def intersection(self: TPandasDataset) -> TPandasDataset: return self.loc(idx) @overload - def getFeaturesAs(self, type: Literal["array"]) -> np.ndarray[Any, Any]: + def getFeaturesAs(self, type: Literal["array"]) -> NDArray[Any]: ... @overload @@ -708,7 +703,7 @@ def getFeaturesAs(self, type: AllowedTypes = "array") -> FeaturesType[FeatType]: ) @overload - def getLabelsAs(self, type: Literal["array"]) -> np.ndarray[Any, Any]: + def getLabelsAs(self, type: Literal["array"]) -> NDArray[Any]: ... 
@overload diff --git a/pyproject.toml b/pyproject.toml index da4bacc..3d39ca0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ disallow_incomplete_defs = true disallow_any_generics = true warn_redundant_casts = true strict_equality = false -plugins = ["sqlalchemy.ext.mypy.plugin"] +plugins = ["sqlalchemy.ext.mypy.plugin", "numpy.typing.mypy_plugin"] exclude = ['_version.py'] [[tool.mypy.overrides]] diff --git a/tests/data/model/test_ml.py b/tests/data/model/test_ml.py index 47b1ea9..86aa592 100644 --- a/tests/data/model/test_ml.py +++ b/tests/data/model/test_ml.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from numpy.typing import NDArray from py4ai.core.tests.core import TestCase, logTest from py4ai.core.utils.fs import create_dir_if_not_exists @@ -157,16 +158,16 @@ def samples_gen() -> Iterator[MultiFeatureSample[float]]: lookback = 3 batch_size = 4 - lazyDat: LazyDataset[List[np.ndarray[Any, np.dtype[Any]]], float] = LazyDataset( + lazyDat: LazyDataset[List[NDArray[Any]], float] = LazyDataset( IterGenerator(samples_gen) ) - lookbackDat: LazyDataset[ - List[np.ndarray[Any, np.dtype[Any]]], float - ] = lazyDat.withLookback(lookback) + lookbackDat: LazyDataset[List[NDArray[Any]], float] = lazyDat.withLookback( + lookback + ) batch_gen = lookbackDat.batch(batch_size) - batch1: CachedDataset[List[np.ndarray[Any, Any]], float] = next(batch_gen) - batch2: CachedDataset[List[np.ndarray[Any, Any]], float] = next(batch_gen) + batch1: CachedDataset[List[NDArray[Any]], float] = next(batch_gen) + batch2: CachedDataset[List[NDArray[Any]], float] = next(batch_gen) tmp1 = batch1.getFeaturesAs("array") temp1X = np.array(list(map(lambda x: np.stack(x), tmp1[:, :, 0]))) @@ -207,7 +208,7 @@ def test_withLookback_ArrayFeatureSample(self) -> None: Sample(features=np.array([116, 117]), label=9), ] - def samples_gen() -> Iterator[Sample[np.ndarray[Any, np.dtype[Any]], int]]: + def samples_gen() -> Iterator[Sample[NDArray[Any], int]]: for sample in 
samples: if not any([np.isnan(x).any() for x in sample.features]): yield sample @@ -233,16 +234,14 @@ def samples_gen() -> Iterator[Sample[np.ndarray[Any, np.dtype[Any]], int]]: lookback = 3 batch_size = 4 - lazyDat: LazyDataset[np.ndarray[Any, np.dtype[Any]], int] = LazyDataset( + lazyDat: LazyDataset[NDArray[Any], int] = LazyDataset( IterGenerator(samples_gen) ) - lookbackDat: LazyDataset[ - np.ndarray[Any, np.dtype[Any]], int - ] = lazyDat.withLookback(lookback) + lookbackDat: LazyDataset[NDArray[Any], int] = lazyDat.withLookback(lookback) batch_gen = lookbackDat.batch(batch_size) - batch1: CachedDataset[np.ndarray[Any, Any], int] = next(batch_gen) - batch2: CachedDataset[np.ndarray[Any, Any], int] = next(batch_gen) + batch1: CachedDataset[NDArray[Any], int] = next(batch_gen) + batch2: CachedDataset[NDArray[Any], int] = next(batch_gen) tmp1 = batch1.getFeaturesAs("array") tmp1lab = batch1.getLabelsAs("array")