Skip to content

Commit 09c8e11

Browse files
committed
Finished pipeline for num and cat data, UI text and csv export
1 parent 72b023c commit 09c8e11

File tree

3 files changed

+713
-459
lines changed

3 files changed

+713
-459
lines changed

notebooks/helper_function.py

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import numpy as np
2+
import pandas as pd
3+
import scipy.stats as stats
4+
5+
def diff_df(df, features, type=None, cluster1=None, cluster2=None):
    '''
    Build a dataframe of per-feature differences between two groups of rows.

    The groups are selected through the 'Cluster' column of ``df``:

    * if both ``cluster1`` and ``cluster2`` are given, rows whose 'Cluster'
      equals ``cluster1`` are compared against rows whose 'Cluster' equals
      ``cluster2``;
    * otherwise rows with 'Cluster' == 0 are compared against all other rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a 'Cluster' column and every column in ``features``.
    features : iterable of str
        Column names to compare.
    type : str, optional
        'Numerical' computes the difference in means (group 1 minus group 2)
        per feature. Any other value (including the default ``None``) treats
        the features as categorical and computes the difference in absolute
        value counts per category.
    cluster1, cluster2 : optional
        Cluster labels to compare; both must be given to take effect.

    Returns
    -------
    pandas.DataFrame
        A single-column dataframe named 'Difference'. For numerical data the
        index holds the feature names; for categorical data the index holds
        the category values of all features, concatenated in feature order.

    Notes
    -----
    No confidence interval is computed. As a side effect the global pandas
    option 'display.float_format' is set to five decimal places.
    '''
    # Select the two groups to compare.
    if cluster1 is not None and cluster2 is not None:
        df1 = df[df['Cluster'] == cluster1]
        df2 = df[df['Cluster'] == cluster2]
    else:
        # Default: cluster 0 versus the rest of the dataset.
        df1 = df[df['Cluster'] == 0]
        df2 = df[df['Cluster'] != 0]

    if type == 'Numerical':
        # Difference in sample means, one scalar per feature.
        mean_diffs = {
            feat: np.mean(df1[feat]) - np.mean(df2[feat])
            for feat in features
        }
        pd.set_option('display.float_format', lambda x: '%.5f' % x)
        return pd.DataFrame.from_dict(
            mean_diffs, orient='index', columns=['Difference']
        )

    # Categorical data: difference in value counts per category.
    count_diffs = {}
    for feat in features:
        freq1 = df1[feat].value_counts()
        freq2 = df2[feat].value_counts()
        # A category missing from one group has a count of 0 there. Plain
        # subtraction would yield NaN for it and hide the real difference,
        # so the missing side is filled with 0 before subtracting.
        diff = freq1.sub(freq2, fill_value=0)
        # A uniform name keeps the concatenated result single-column no
        # matter how value_counts() names its output.
        count_diffs[feat] = diff.rename('Difference')

    pd.set_option('display.float_format', lambda x: '%.5f' % x)

    # Nothing to concatenate: return an empty frame with the expected column.
    if not count_diffs:
        return pd.DataFrame(columns=['Difference'])

    return pd.concat(list(count_diffs.values()), axis=0).to_frame()

0 commit comments

Comments
 (0)