-
Notifications
You must be signed in to change notification settings - Fork 0
/
code.py
200 lines (128 loc) · 6.68 KB
/
code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#Importing the modules
import pandas as pd
import numpy as np
from scipy.stats import mode
#Code for categorical variable
def categorical(df):
    """ List the names of the categorical columns

    Picks the columns of the dataframe whose dtype is 'object' and returns
    their names as a plain Python list.

    Keyword arguments:
    df - Pandas dataframe from which the column names will be extracted

    Returns:
    categorical_var - List with the names of the object-dtype columns
    """
    # Narrow the frame to object-dtype columns first, then read off the labels.
    object_frame = df.select_dtypes(include='object')
    return object_frame.columns.tolist()
#Code for numerical variable
def numerical(df):
    """ List the names of the numerical columns

    Picks the columns of the dataframe whose dtype is 'float64' or 'int64'
    and returns their names as a plain Python list. Columns of any other
    dtype are not reported.

    Keyword arguments:
    df - Pandas dataframe from which the column names will be extracted

    Returns:
    numerical_var - List with the names of the float64/int64 columns
    """
    wanted_dtypes = ['float64', 'int64']
    numeric_frame = df.select_dtypes(include=wanted_dtypes)
    return numeric_frame.columns.tolist()
#code to check distribution of variable
def clear(df, col, val):
    """ Count how often a value occurs in a column

    Keeps the entries of the given column that are equal to the given value
    and returns how many non-missing entries remain.

    Keyword arguments:
    df - Pandas dataframe
    col - Name of the column to inspect
    val - Value to look for in that column

    Returns:
    value_counts - Number of entries in the column equal to val
    """
    column = df[col]
    # Boolean-mask the column down to the matching entries, then count them.
    matching = column[column == val]
    return matching.count()
#Code to check instances based on the condition
def instances_based_condition(df, col1, val1, col2, val2):
    """ Select the rows that satisfy two conditions at once

    Returns the rows of the dataframe where the first column is strictly
    greater than val1 AND the second column is equal to val2.

    Keyword arguments:
    df - Pandas dataframe which has the data.
    col1 - Column compared with '>' against val1
    val1 - Lower bound (exclusive) for the first column
    col2 - Column compared with '==' against val2
    val2 - Value the second column must be equal to

    Returns:
    instance - Dataframe holding only the rows that meet both conditions
    """
    above_threshold = df[col1] > val1
    exact_match = df[col2] == val2
    return df[above_threshold & exact_match]
# Code to calculate different aggregated values according to month
def agg_values_ina_month(df, date_col, agg_col, agg):
    """ Aggregate values according to month

    Converts the date column to datetime, then builds a pivot table that
    aggregates agg_col for every calendar month number found in date_col.
    Only the month number is used as the index, so rows from the same month
    of different years fall into the same group.

    Side effect: the date column of the dataframe that is passed in is
    replaced by its datetime-converted version.

    Keyword arguments:
    df - Pandas dataframe which has the data.
    date_col - Date column of the dataframe; it is converted with pd.to_datetime
    agg_col - Column of the dataframe whose values will be aggregated.
    agg - Aggregation to apply. 'mean', 'max', 'min', 'sum' and 'len' are
          handled explicitly; any other value (eg. 'median' or a callable)
          is handed to pandas unchanged.

    Returns:
    aggregated_value - Pivot table indexed by month number with one column (agg_col)
    """
    # The conversion is written back to the caller's dataframe on purpose:
    # that is the original behaviour and existing callers may rely on the
    # column being datetime afterwards.
    df[date_col] = pd.to_datetime(df[date_col])

    # String aliases select pandas' own grouped reductions. Passing the numpy
    # callables (np.mean, np.sum, ...) instead is deprecated in recent pandas
    # and emits a FutureWarning. 'len' has no string alias with the same
    # meaning inside pivot_table, so the builtin is kept.
    aggregate = {'mean': 'mean', 'max': 'max', 'min': 'min', 'sum': 'sum', 'len': len}
    # Fall back to the caller's value so other aggregations are not rejected.
    aggfunc = aggregate.get(agg, agg)

    months = df[date_col].dt.month
    aggregated_value = df.pivot_table(
        values=[agg_col],
        index=months,
        aggfunc={agg_col: aggfunc},
    )
    return aggregated_value
# Code to group values based on the feature
def group_values(df, col1, agg1):
    """ Aggregate values by grouping

    Groups the dataframe by the given column and applies the aggregation
    to every group.

    Keyword arguments:
    df - Pandas dataframe which has the data.
    col1 - Column of the dataframe to group by.
    agg1 - The function to be used for aggregating the groups (eg. 'mean', 'min', 'max').

    Returns:
    grouping - Aggregated dataframe indexed by the values of col1.
    """
    grouped = df.groupby(col1)
    return grouped.agg(agg1)
# Function to convert temperatures
def convert(df, celsius):
    """ Convert temperatures from Celsius to Fahrenheit

    Applies F = C * 9/5 + 32 to every value of the given column and returns
    the converted values. The dataframe itself is not modified.

    Keyword arguments:
    df - Pandas dataframe which has the data.
    celsius - Name of the temperature column holding the Celsius values

    Returns:
    converted_temp - Column values converted to Fahrenheit
    """
    scale = 9 / 5
    offset = 32
    return df[celsius] * scale + offset
#1 Load the weather_2012 data csv file and store it in the weather variable.
# NOTE(review): `path` is not defined in the visible part of this file;
# presumably it is supplied by the environment that runs the script - confirm.
weather = pd.read_csv(path)
print(weather.head())
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
weather.info()

#2 Check the categorical and numerical variables by calling the categorical and numerical functions.
print(categorical(weather))
print(numerical(weather))

#3 Check the distribution of a specific value, here the number of times the
# 'Weather' column was exactly 'Cloudy', by calling the function clear.
print(clear(weather, 'Weather', 'Cloudy'))

#4 Check the instances that meet a specific condition: wind speed above 35 and visibility equal to 25.
# Done by calling the function instances_based_condition with the respective parameters.
wind_speed_35_vis_25 = instances_based_condition(weather, 'Wind Spd (km/h)', 35, 'Visibility (km)', 25)
print(wind_speed_35_vis_25.head())

#5 Calculate the mean dew point temperature recorded by month.
# agg_values_ina_month builds a pivot table with the aggregated values
# (like mean, max, min, sum, len) recorded by month; one call is enough,
# its result is printed directly.
print(agg_values_ina_month(weather, 'Date/Time', 'Dew Point Temp (C)', 'mean'))

#6 Group by the Weather column and aggregate the mean of each column for the different types of weather.
# group_values also accepts other aggregations such as max, min, sum.
mean_weather = group_values(weather, 'Weather', 'mean')

#7 Convert the Celsius temperatures into Fahrenheit temperatures by calling the function convert.
weather_fahrehheit = convert(weather, 'Temp (C)')