Skip to content

Commit 5da24bb

Browse files
authored
Merge pull request #48 from UBC-MDS/imputer
format example in the doc of scale function
2 parents a005969 + 36b7436 commit 5da24bb

File tree

1 file changed

+45
-35
lines changed

1 file changed

+45
-35
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def imputer(df, strategy="mean", fill_value=None):
5555

5656
# Tests whether input fill_value is of type numbers or None
5757
if not isinstance(fill_value, type(None)) and not isinstance(
58-
fill_value, numbers.Number
58+
fill_value, numbers.Number
5959
):
6060
raise TypeError("fill_value must be of type None or numeric type")
6161

@@ -159,13 +159,17 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
159159

160160
plot = (
161161
alt.Chart(corr_matrix)
162-
.mark_rect()
163-
.encode(
162+
.mark_rect()
163+
.encode(
164164
x=alt.X("var1", title=None),
165165
y=alt.Y("var2", title=None),
166-
color=alt.Color("cor", title = 'Correlation', scale=alt.Scale(scheme=col_scheme, domain = (-1,1))),
166+
color=alt.Color(
167+
"cor",
168+
title="Correlation",
169+
scale=alt.Scale(scheme=col_scheme, domain=(-1, 1)),
170+
),
167171
)
168-
.properties(title="Correlation Matrix", width=400, height=400)
172+
.properties(title="Correlation Matrix", width=400, height=400)
169173
)
170174

171175
text = plot.mark_text(size=15).encode(
@@ -195,7 +199,7 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
195199
- if "trim" : we completely remove data points that are outliers.
196200
- if "median" : we replace outliers with median values
197201
- if "mean" : we replace outliers with mean values
198-
202+
199203
200204
Returns
201205
-------
@@ -206,7 +210,7 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
206210
--------
207211
>> import pandas as pd
208212
>> from eda_utils_py import outlier_identifier
209-
213+
210214
>> data = pd.DataFrame({
211215
>> 'SepalLengthCm':[5.1, 4.9, 4.7],
212216
>> 'SepalWidthCm':[1.4, 1.4, 99],
@@ -224,24 +228,30 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
224228
if columns is None:
225229
for col in dataframe.columns:
226230
if not is_numeric_dtype(dataframe[col]):
227-
raise Exception("The given dataframe contains column that is not numeric column.")
231+
raise Exception(
232+
"The given dataframe contains column that is not numeric column."
233+
)
228234

229235
if columns is not None:
230236
if not isinstance(columns, list):
231237
raise TypeError("The argument @columns must be of type list")
232238

233239
for col in columns:
234240
if col not in list(dataframe.columns):
235-
raise Exception("The given column list contains column that is not exist in the given dataframe.")
241+
raise Exception(
242+
"The given column list contains column that is not exist in the given dataframe."
243+
)
236244
if not is_numeric_dtype(dataframe[col]):
237-
raise Exception("The given column list contains column that is not numeric column.")
245+
raise Exception(
246+
"The given column list contains column that is not numeric column."
247+
)
238248

239249
if method not in ("trim", "median", "mean"):
240250
raise Exception("The method must be -trim- or -median- or -mean-")
241251

242252
df = dataframe.copy()
243253
target_columns = []
244-
if (columns is None):
254+
if columns is None:
245255
target_columns = list(df.columns.values.tolist())
246256
else:
247257
target_columns = columns
@@ -257,14 +267,14 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
257267
current_item = current_column[i]
258268
z = (current_item - mean) / std
259269
if z >= threshold:
260-
if (i not in outlier_index):
270+
if i not in outlier_index:
261271
outlier_index.append(i)
262-
if (method == "mean"):
272+
if method == "mean":
263273
df.at[i, column] = round(mean, 2)
264-
if (method == "median"):
274+
if method == "median":
265275
df.at[i, column] = np.median(current_column)
266276

267-
if (method == "trim"):
277+
if method == "trim":
268278
df = df.drop(outlier_index)
269279

270280
df.index = range(len(df))
@@ -307,12 +317,12 @@ def scale(dataframe, columns, scaler="standard"):
307317
308318
>> scale(data, numerical_columns, scaler="minmax")
309319
310-
SepalLengthCm SepalWidthCm PetalWidthCm
311-
0 0.25 1.00 1.0
312-
1 0.00 0.25 0.0
313-
2 0.00 0.25 0.0
314-
3 0.75 0.00 1.0
315-
4 1.00 0.25 0.5
320+
>> SepalLengthCm SepalWidthCm PetalWidthCm
321+
>> 0 0.25 1.00 1.0
322+
>> 1 0.00 0.25 0.0
323+
>> 2 0.00 0.25 0.0
324+
>> 3 0.75 0.00 1.0
325+
>> 4 1.00 0.25 0.5
316326
"""
317327

318328
# Check if input data is of pd.DataFrame type
@@ -379,24 +389,24 @@ def _standardize(dataframe):
379389

380390
def _minmax(dataframe):
381391
"""Transform features by rescaling each feature to the range between 0 and 1.
382-
The transformation is given by:
392+
The transformation is given by:
383393
384-
scaled_value = (feature_value - min) / (mix - min)
394+
scaled_value = (feature_value - min) / (max - min)
385395
386-
where min, max = feature_range.
396+
where min, max = feature_range.
387397
388-
This transformation is often used as an alternative to zero mean,
389-
unit variance scaling.
398+
This transformation is often used as an alternative to zero mean,
399+
unit variance scaling.
390400
391-
Parameters
392-
----------
393-
dataframe : pandas.DataFrame
394-
The data frame to be used for EDA.
395-
Returns
396-
-------
397-
res : pandas.core.frame.DataFrame
398-
Scaled dataset
399-
"""
401+
Parameters
402+
----------
403+
dataframe : pandas.DataFrame
404+
The data frame to be used for EDA.
405+
Returns
406+
-------
407+
res : pandas.core.frame.DataFrame
408+
Scaled dataset
409+
"""
400410

401411
res = dataframe.copy()
402412
for feature_name in dataframe.columns:

0 commit comments

Comments
 (0)