-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathttest_and_assumptions.py
223 lines (183 loc) · 7.25 KB
/
ttest_and_assumptions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import logging
# Configure root logging so the assumption-check warnings below are emitted by default.
logging.basicConfig(level=logging.WARNING)
def check_normality(sample1, sample2, significance_level=0.05):
    """
    Check the normality of two input samples using the Shapiro-Wilk test.

    Parameters
    ----------
    sample1, sample2 : array-like
        Input samples.
    significance_level : float, optional
        Significance level for the normality test. Default is 0.05.

    Returns
    -------
    bool
        True if neither sample rejects normality, False otherwise.
    """
    _, p1 = stats.shapiro(sample1)
    _, p2 = stats.shapiro(sample2)
    if p1 < significance_level or p2 < significance_level:
        # Lazy %-style args: the message is only formatted if the logger emits it.
        logging.warning("Data may not be normally distributed. p-values: %.4f, %.4f", p1, p2)
        logging.warning("Note: For large samples, the test may reject normality even for approximately normal data.")
        return False
    return True
def check_variance(sample1, sample2, significance_level=0.05):
    """
    Check the equality of variances of two input samples using Levene's test.

    Parameters
    ----------
    sample1, sample2 : array-like
        Input samples.
    significance_level : float, optional
        Significance level for the variance test. Default is 0.05.

    Returns
    -------
    bool
        True if the samples are consistent with equal variances, False otherwise.
    """
    _, p = stats.levene(sample1, sample2)
    if p < significance_level:
        # Lazy %-style args: the message is only formatted if the logger emits it.
        logging.warning("Variances are not equal. p-value: %.4f", p)
        return False
    return True
def validate_input(tail, direction):
    """
    Ensure that *tail* and *direction* form a coherent t-test specification.

    Parameters
    ----------
    tail : str
        Either "one" or "two".
    direction : str or None
        "greater", "less", or None. Must be None for a two-tailed test and
        non-None for a one-tailed test.

    Raises
    ------
    ValueError
        If either value, or their combination, is invalid.
    """
    allowed_tails = ["one", "two"]
    allowed_directions = ["greater", "less", None]
    if tail not in allowed_tails:
        raise ValueError(
            f"Invalid value for tail: {tail}. It should be one of {allowed_tails}."
        )
    if direction not in allowed_directions:
        raise ValueError(
            f"Invalid value for direction: {direction}. It should be one of {allowed_directions}."
        )
    # The two flags must agree: a direction only makes sense for one-tailed tests.
    if tail == "two" and direction is not None:
        raise ValueError("Direction should be None for a two-tailed test.")
    if tail == "one" and direction is None:
        raise ValueError("Direction should be specified for a one-tailed test.")
def perform_ttest(sample1, sample2, sample="independent", tail="two", direction=None, significance_level=0.05):
    """
    Wrapper function to perform an independent or dependent t-test.

    Parameters
    ----------
    sample1, sample2 : array-like
        Input samples.
    sample : str, optional
        Type of t-test, should be "dependent" or "independent". Default is "independent".
    tail : str, optional
        Specifies the type of t-test, should be "one" or "two". Default is "two".
    direction : str or None, optional
        Specifies the direction of a one-tailed test, should be "greater", "less", or None. Default is None.
    significance_level : float, optional
        Significance level for assumption tests. Default is 0.05.

    Returns
    -------
    tuple
        t-statistic and the p-value.

    Raises
    ------
    ValueError
        If `sample`, `tail`, or `direction` is invalid.
    """
    # Validate all arguments before running the (comparatively expensive)
    # Shapiro-Wilk assumption check, so bad input fails fast.
    validate_input(tail, direction)
    if sample not in ("dependent", "independent"):
        raise ValueError(f"Invalid value for sample: {sample}. It should be 'dependent' or 'independent'.")
    if not check_normality(sample1, sample2, significance_level):
        logging.warning("At least one of the samples is not normally distributed, results may be unreliable")
    if sample == "dependent":
        return dependent_ttest(sample1, sample2, tail=tail, direction=direction)
    return independent_ttest(sample1, sample2, tail=tail, direction=direction, significance_level=significance_level)
def independent_ttest(sample1, sample2, tail="two", direction=None, significance_level=0.05):
    """
    Run an independent-samples t-test.

    Uses Levene's test to decide whether a pooled-variance test is
    appropriate; when variances differ, scipy falls back to Welch's t-test.

    Parameters
    ----------
    sample1, sample2 : array-like
        Input samples.
    tail : str, optional
        "one" or "two". Default is "two".
    direction : str or None, optional
        "greater", "less", or None for a one-tailed test. Default is None.
    significance_level : float, optional
        Significance level for the variance assumption test. Default is 0.05.

    Returns
    -------
    tuple
        t-statistic and the p-value.
    """
    pooled = check_variance(sample1, sample2, significance_level)
    # equal_var=False makes scipy apply Welch's correction automatically.
    t_stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=pooled)
    if tail == "one":
        # Halve the two-sided p-value, then mirror it when the observed
        # statistic points away from the hypothesized direction.
        p_value = p_value / 2
        wrong_side = (direction == "less" and t_stat > 0) or (direction == "greater" and t_stat < 0)
        if wrong_side:
            p_value = 1 - p_value
    return t_stat, p_value
def dependent_ttest(sample1, sample2, tail="two", direction=None):
    """
    Run a paired (dependent-samples) t-test.

    Parameters
    ----------
    sample1, sample2 : array-like
        Input samples; must have equal length (paired observations).
    tail : str, optional
        "one" or "two". Default is "two".
    direction : str or None, optional
        "greater", "less", or None for a one-tailed test. Default is None.

    Returns
    -------
    tuple
        t-statistic and the p-value.

    Raises
    ------
    ValueError
        If the samples differ in length.
    """
    if len(sample1) != len(sample2):
        raise ValueError("Samples must be the same length for a dependent t-test")
    t_stat, p_value = stats.ttest_rel(sample1, sample2)
    if tail != "one":
        return t_stat, p_value
    # One-tailed: halve the two-sided p, mirroring it when the statistic
    # falls on the opposite side of the hypothesized direction.
    one_sided = p_value / 2
    if (direction == "less" and t_stat > 0) or (direction == "greater" and t_stat < 0):
        one_sided = 1 - one_sided
    return t_stat, one_sided
def plot_means(sample1, sample2, filename=None):
    """
    Draw a bar chart of the two sample means with standard-error bars.

    Parameters
    ----------
    sample1, sample2 : array-like
        Input samples.
    filename : str or None, optional
        Destination path for the saved figure. When None, the figure is
        shown interactively instead. Default is None.
    """
    sample_means = [np.mean(sample1), np.mean(sample2)]
    standard_errors = [stats.sem(sample1), stats.sem(sample2)]
    plt.figure(figsize=(10, 6))
    plt.bar(
        ['Sample 1', 'Sample 2'],
        sample_means,
        yerr=standard_errors,
        color=['blue', 'green'],
        capsize=5,
    )
    plt.ylabel('Mean')
    plt.title('Means of the two samples with error bars')
    if filename:
        plt.savefig(filename)
    else:
        plt.show()
    # Release the figure so repeated calls don't accumulate open figures.
    plt.close()
def _demo():
    """Run example t-tests on two synthetic samples and plot their means."""
    sample1 = np.random.randn(30) * 10 + 50
    sample2 = np.random.randn(30) * 10 + 51
    t_stat, p_value = perform_ttest(sample1, sample2, sample="independent", tail="two")
    print("Independent two-tailed t-test:")
    print("T-statistic:", t_stat)
    print("P-value:", p_value)
    t_stat, p_value = perform_ttest(sample1, sample2, sample="dependent", tail="one", direction="less")
    print("\nDependent one-tailed t-test (less):")
    print("T-statistic:", t_stat)
    print("P-value:", p_value)
    # Plotting the means
    plot_means(sample1, sample2)


# Guard the example so importing this module doesn't run tests or open plots.
if __name__ == "__main__":
    _demo()