1
+ import numpy as np
2
+ import pandas as pd
3
+ import scipy .stats as stats
4
+
5
def _category_count_diff(sample1, sample2):
    # Per-category count in sample1 minus count in sample2 for one feature.
    counts1 = sample1.value_counts()
    counts2 = sample2.value_counts()
    # fill_value=0 treats a category that is absent from one sample as a
    # count of 0. A plain `counts1 - counts2` yields NaN for such categories,
    # which would hide the (often largest) differences once NaN is zeroed.
    # Naming the result up front makes the output column independent of the
    # name value_counts() assigns, which differs between pandas versions.
    return counts1.sub(counts2, fill_value=0).rename('Difference')


def diff_df(df, features, type=None, cluster1=None, cluster2=None):
    '''
    Creates difference dataframe between two clusters of interest.

    Rows of `df` are split on the 'Cluster' column. If both `cluster1` and
    `cluster2` are given, those two clusters are compared; otherwise
    cluster 0 is compared against all remaining rows. (If only one of the
    two is given, the default comparison is used.)

    Parameters:
        df:        dataframe containing a 'Cluster' column and every column
                   listed in `features`.
        features:  iterable of column names to compare.
        type:      'Numerical' to compare sample means; any other value
                   (including the default None) treats the features as
                   categorical and compares per-category counts.
                   NOTE: the name shadows the builtin `type`; it is kept
                   because callers may pass it by keyword.
        cluster1:  label of the first cluster (optional).
        cluster2:  label of the second cluster (optional).

    Returns:
        Dataframe with a single 'Difference' column.
        - Numerical: one row per feature, mean(group 1) - mean(group 2).
        - Categorical: one row per category value of every feature (rows of
          all features stacked on top of each other),
          count(group 1) - count(group 2); a category missing from one
          group counts as 0 there.

    Side effects:
        Sets the global pandas option 'display.float_format' to five
        decimals.

    NOTE: confidence intervals are not computed by this function.
    '''
    # Cluster comparison (optional)
    if cluster1 is not None and cluster2 is not None:
        df1 = df[df['Cluster'] == cluster1]
        df2 = df[df['Cluster'] == cluster2]
    # Default (cluster 0 vs rest of dataset)
    else:
        df1 = df[df['Cluster'] == 0]
        df2 = df[df['Cluster'] != 0]

    # Numerical data: difference in sample means, one row per feature
    if type == 'Numerical':
        mean_diffs = {
            feat: df1[feat].mean() - df2[feat].mean() for feat in features
        }
        pd.set_option('display.float_format', lambda x: '%.5f' % x)
        return pd.DataFrame.from_dict(
            mean_diffs, orient='index', columns=['Difference']
        )

    # Categorical data: difference in category counts, stacked over features
    pieces = [_category_count_diff(df1[feat], df2[feat]) for feat in features]
    pd.set_option('display.float_format', lambda x: '%.5f' % x)

    # pd.concat cannot handle an empty list; return an empty result instead
    if not pieces:
        return pd.DataFrame(columns=['Difference'])

    # Concatenate once instead of growing a dataframe inside the loop
    return pd.concat(pieces, axis=0).to_frame(name='Difference')
0 commit comments