Skip to content

Commit

Permalink
Merge pull request #32 from Faye-yufan/f-multicollinearity
Browse files Browse the repository at this point in the history
Add function `update_predictor_multicollinear`
  • Loading branch information
Faye-yufan authored Feb 9, 2023
2 parents d8da4d9 + bf33f74 commit 1c78cf3
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 26 deletions.
2 changes: 1 addition & 1 deletion conda-package/analyticsdf/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% set name = "analyticsdf" %}
{% set version = "0.0.4.3" %}
{% set version = "0.0.7" %}

package:
name: "{{ name|lower }}"
Expand Down
19 changes: 0 additions & 19 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,6 @@
requires = ["setuptools>=42.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "analyticsdf"
version = "0.0.6"
authors = [
{ name="Yufan Fei", email="yufanfei@usc.edu" },
]
description = "Analytic generation of datasets with specified statistical characteristics."
readme = "README.md"
license = { file="LICENSE" }
requires-python = ">=3.6"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]

[project.urls]
"Homepage" = "https://github.com/Faye-yufan/analytics-dataset"

[tool.pytest.ini_options]
addopts = "--cov=analyticsdf"
testpaths = [
Expand Down
7 changes: 6 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
[metadata]
name = analyticsdf
description = An example package of Analytics Dataframe
version = 0.0.7
description = Analytic generation of datasets with specified statistical characteristics.
long_description = file: README.md
long_description_content_type = text/markdown
author = Fei, Eli
author_email = yufanfei@usc.edu
license = MIT
license_files = LICENSE
url = https://github.com/Faye-yufan/analytics-dataset
platforms = unix, linux, osx, cygwin, win32
classifiers =
Programming Language :: Python :: 3
Expand Down
29 changes: 28 additions & 1 deletion src/analyticsdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,32 @@ def decorated(self, predictor_name_list, *args, **kwargs):
return decorated


def _check_columns_exist(df, target):
"""Raise an exception if the given columns does not exists in dataframe.
Args:
df:
Pandas DataFrame, the data
target:
str or list, the columns one may want to check if exists in df
Raises:
KeyError: If the column does not exists
"""
missing = []
if isinstance(target, list):
for c in target:
if c in df.columns:
continue
missing.append(c)
else:
if target not in df.columns:
missing.append(target)

if missing:
raise KeyError(f'The columns {missing} were not found in predictors.')


def check_is_numeric(col):
    """Report whether ``col`` has a numeric dtype.

    Args:
        col:
            Any object exposing a ``dtype`` attribute that
            ``np.issubdtype`` accepts.

    Returns:
        The result of ``np.issubdtype(col.dtype, np.number)``.
    """
    # A single return: the duplicated statement that followed was unreachable.
    return np.issubdtype(col.dtype, np.number)
36 changes: 34 additions & 2 deletions src/analyticsdf/analyticsdataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from sklearn.utils.extmath import safe_sparse_dot
from itertools import combinations

from analyticsdf import check_columns_exist, set_random_state, validate_random_state, check_is_numeric
from analyticsdf import check_columns_exist, set_random_state, validate_random_state, _check_columns_exist, check_is_numeric

class AnalyticsDataframe:
"""Create a AnalyticsDataframe class.
Expand Down Expand Up @@ -150,7 +150,6 @@ def update_predictor_categorical(self, predictor_name = None,
df.loc[df.index[j], predictor_name] = catg_dict[value]


@check_columns_exist
def update_predictor_uniform(self, predictor_name = None, lower_bound = 0, upper_bound = 1.0):
"""Update a predictor to uniformly distributed.
Expand All @@ -166,11 +165,44 @@ def update_predictor_uniform(self, predictor_name = None, lower_bound = 0, upper
KeyError: If the column does not exists.
"""
_check_columns_exist(self.predictor_matrix, predictor_name)

with set_random_state(validate_random_state(self.seed)):
num_row = len(self.predictor_matrix)
self.predictor_matrix[predictor_name] = np.random.uniform(lower_bound, upper_bound, num_row)


def update_predictor_multicollinear(self, target_predictor_name = None, dependent_predictors_list = None,
                                    beta: list = None,
                                    epsilon_variance: float = None):
    """Update the predictor to be multicollinear with other predictors.

    The target column is overwritten with
    ``beta[0] + X[dependent_predictors_list] . beta[1:] + eps``.

    Args:
        target_predictor_name:
            A string of target predictor name in the initial AnalyticsDataframe.
        dependent_predictors_list:
            A list of predictor names selected as dependents. When omitted
            or empty, every column of the predictor matrix is used.
        beta:
            A list, coefficients of the linear model – first coefficient is
            the intercept, the remaining ones pair with the dependents.
        epsilon_variance:
            A scalar noise specification. It multiplies the standard-normal
            draws directly.

    Raises:
        KeyError: If the column does not exist.
    """
    # Resolve the default BEFORE validating: concatenating a list with the
    # default ``None`` raised TypeError, so this fallback was unreachable.
    if not dependent_predictors_list:
        dependent_predictors_list = self.predictor_matrix.columns.values.tolist()
    dependent_predictors_list = list(dependent_predictors_list)

    check_columns = [target_predictor_name] + dependent_predictors_list
    _check_columns_exist(self.predictor_matrix, check_columns)

    with set_random_state(validate_random_state(self.seed)):
        eps = epsilon_variance * np.random.randn(self.n)
        beta = np.array(beta)
        linear_part = safe_sparse_dot(self.predictor_matrix[dependent_predictors_list],
                                      beta[1:].T, dense_output=True)
        self.predictor_matrix[target_predictor_name] = linear_part + beta[0] + eps


@check_columns_exist
def generate_response_vector_linear(self, predictor_name_list: list = None,
beta: list = None,
Expand Down
13 changes: 11 additions & 2 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,17 @@ def test_set_random_state():
ad_4 = generate_ad()
assert not ad_3.predictor_matrix.equals(ad_4.predictor_matrix)

# Test 'update_predictor_multicollinear'
def test_multicollinear():
    """Check that the first row of the multicollinear target stays within
    ``3 * eps_var`` of the noiseless linear combination of its dependents."""
    eps_var = 1
    beta = [0, 1, 1.5]

    ad = AnalyticsDataframe(5, 3, ["xx1", "xx2", "xx3"], "yy")
    for dependent in ("xx2", "xx3"):
        ad.update_predictor_uniform(dependent, 1, 3)

    ad.update_predictor_multicollinear(
        target_predictor_name='xx1',
        dependent_predictors_list=['xx2', 'xx3'],
        beta=beta,
        epsilon_variance=eps_var,
    )

    # Noiseless value for row 0: intercept 0, slopes 1 and 1.5.
    noiseless = ad.predictor_matrix['xx2'][0] + ad.predictor_matrix['xx3'][0] * 1.5
    observed = ad.predictor_matrix['xx1'][0]
    assert observed >= noiseless - 3 * eps_var
    assert observed <= noiseless + 3 * eps_var

# Test 'update_response_poly_categorical'
def test_update_response_poly_categorical():
ad = AnalyticsDataframe(1000, 6)
Expand All @@ -247,5 +258,3 @@ def test_update_response_poly_categorical():
ad.update_response_poly_categorical(predictor_name='X6', betas={'Red': -2000, 'Blue': -1700})
assert ad.predictor_matrix.loc[1, 'X6'] == 'Red'
assert ad.response_vector[1] < -1900


0 comments on commit 1c78cf3

Please sign in to comment.