@@ -38,25 +38,24 @@ def load_data(file_path, delimiter=','):
38
38
39
39
# Task 2[10 marks]: Give back the data by removing the rows with -99 values
40
40
def filter_data (data ):
41
- filtered_data = [None ]* 1
42
- # Store the indices to drop
43
- indices_to_drop = []
44
- # Iterate over DataFrame rows
45
- for index , row in data .iterrows ():
46
- # Check if -99 is in the row values
47
- if - 99 in row .values :
48
- # Add the index to the list of indices to drop
49
- indices_to_drop .append (index )
50
- # Drop the rows
51
- filtered_data = data .drop (indices_to_drop )
52
- return filtered_data
41
+ # Find the indices to delete first
42
+ indices_to_delete = []
43
+ for i in range (len (data )):
44
+ if - 99 in data [i ]:
45
+ indices_to_delete .append (i )
46
+ # Reverse the indices to delete to avoid index shifting issues
47
+ indices_to_delete .reverse ()
48
+ # Delete rows from the end to avoid messing up the indices
49
+ for index in indices_to_delete :
50
+ data = np .delete (data , index , axis = 0 )
51
+ return data
53
52
54
53
# Task 3 [10 marks]: Data statistics, return the coefficient of variation for each feature, make sure to remove the rows with nan before doing this.
55
54
def statistics_data (data ):
56
55
coefficient_of_variation = None
57
56
data = filter_data (data )
58
57
# removes the rows with nan values
59
- data = data .dropna ()
58
+ # data = data.dropna()
60
59
# get the mean and standard deviation of the data
61
60
mean = data .mean ()
62
61
std = data .std ()
@@ -257,7 +256,7 @@ def important_feature(x_train, y_train,header_list):
257
256
258
257
'''
259
258
References:
260
- - Task 2-3 is referenced from pandas documentation at https://pandas.pydata.org/docs/
259
+ - Line 49-51 is referenced from https://numpy.org/doc/stable/reference/generated/numpy.delete.html
261
260
- Line 65, the coefficient of variation formula is referenced from https://en.wikipedia.org/wiki/Coefficient_of_variation
262
261
- Line 76-78 is inspired from https://stackoverflow.com/questions/46519539/how-to-select-all-non-nan-columns-and-non-nan-last-column-using-pandas
263
262
- Line 80-85 is inspired from https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
0 commit comments