diff --git a/.github/workflows/build_all_os.yml b/.github/workflows/build_all_os.yml index fd0388c..1ee6b0e 100644 --- a/.github/workflows/build_all_os.yml +++ b/.github/workflows/build_all_os.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.7', '3.8', '3.9'] steps: - uses: actions/checkout@v2 - name: Set up Python diff --git a/README.md b/README.md index 5c6cf95..3b90c77 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![build](https://github.com/delftdata/valentine/actions/workflows/build.yml/badge.svg)](https://github.com/delftdata/valentine/actions/workflows/build.yml) [![codecov](https://codecov.io/gh/delftdata/valentine/branch/master/graph/badge.svg?token=4QR0X315CL)](https://codecov.io/gh/delftdata/valentine) [![PyPI version](https://badge.fury.io/py/valentine.svg)](https://badge.fury.io/py/valentine) -[![Python 3.8+](https://img.shields.io/badge/python-3.8|3.9|3.10-blue.svg)](https://www.python.org/downloads/release/python-380/) +[![Python 3.7+](https://img.shields.io/badge/python-3.7|3.8|3.9-blue.svg)](https://www.python.org/downloads/release/python-370/) A python package for capturing potential relationships among columns of different tabular datasets, which are given in the form of pandas DataFrames. 
Valentine is based on [Valentine: Evaluating Matching Techniques for Dataset Discovery](https://ieeexplore.ieee.org/abstract/document/9458921) @@ -17,7 +17,7 @@ pip install valentine ## Installation requirements -* Python>=3.8 +* Python>=3.7,<3.10 diff --git a/requirements.txt b/requirements.txt index b55e97d..1b71a43 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ # algorithms numpy==1.21.2 +scipy==1.7.1 pandas==1.3.4 nltk==3.6.5 snakecase==1.0.1 diff --git a/setup.py b/setup.py index a6fa60e..194e1ef 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name='valentine', - version='0.1.2', + version='0.1.3', description='Valentine Matcher', license_files=('LICENSE',), author='Delft Data', @@ -14,10 +14,11 @@ maintainer='Delft Data', maintainer_email='delftdatasystems@gmail.com', url='https://delftdata.github.io/valentine/', - download_url='https://github.com/delftdata/valentine/archive/refs/tags/v0.1.2.tar.gz', + download_url='https://github.com/delftdata/valentine/archive/refs/tags/v0.1.3.tar.gz', packages=setuptools.find_packages(exclude=('tests*', 'examples*')), install_requires=[ 'numpy>=1.21,<2.0', + 'scipy>=1.6,<2.0', 'pandas>=1.3,<1.4', 'nltk>=3.6,<3.7', 'snakecase>=1.0,<2.0', @@ -33,7 +34,7 @@ ], keywords=['matching', 'valentine', 'schema matching', 'dataset discovery', 'coma', 'cupid', 'similarity flooding'], include_package_data=True, - python_requires='>=3.8', + python_requires='>=3.7,<3.10', long_description=long_description, long_description_content_type='text/markdown' ) diff --git a/valentine/algorithms/distribution_based/column_model.py b/valentine/algorithms/distribution_based/column_model.py index 14eae1f..02656ff 100644 --- a/valentine/algorithms/distribution_based/column_model.py +++ b/valentine/algorithms/distribution_based/column_model.py @@ -108,6 +108,6 @@ def get_global_ranks(column: list, tmp_folder_path: str): """ with open(os.path.join(tmp_folder_path, 'ranks.pkl'), 'rb') as pkl_file: global_ranks: 
dict = pickle.load(pkl_file) - ranks = np.array(sorted([global_ranks[dt_x] for x in column - if (dt_x := convert_data_type(x)) in global_ranks])) + ranks = np.array(sorted([global_ranks[convert_data_type(x)] for x in column + if convert_data_type(x) in global_ranks])) return ranks diff --git a/valentine/algorithms/distribution_based/quantile_histogram.py b/valentine/algorithms/distribution_based/quantile_histogram.py index 1df0802..7281230 100644 --- a/valentine/algorithms/distribution_based/quantile_histogram.py +++ b/valentine/algorithms/distribution_based/quantile_histogram.py @@ -1,4 +1,4 @@ -from statistics import quantiles +import scipy.stats as ss from numpy import ndarray import numpy as np import math @@ -73,7 +73,7 @@ def __init__(self, self.dist_matrix = self.calc_dist_matrix() if reference_hist is None: self.add_buckets(ranks.min(initial=math.inf), - [round(q, 3) for q in quantiles(ranks, n=self.quantiles + 1, method='inclusive')]) + ss.mstats.mquantiles(ranks, np.array(list(range(1, self.quantiles + 1))) / self.quantiles)) self.add_values(ranks) else: self.bucket_boundaries = reference_hist.bucket_boundaries