Skip to content

Commit 7fa97e9

Browse files
Refactor data filtering logic and remove unnecessary code in CW2.py
1 parent 78f7200 commit 7fa97e9

File tree

1 file changed

+13
-14
lines changed

1 file changed

+13
-14
lines changed

CW2 (2).py renamed to CW2.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,25 +38,24 @@ def load_data(file_path, delimiter=','):
3838

3939
# Task 2 [10 marks]: Return the data with the rows containing -99 values removed
4040
def filter_data(data):
41-
filtered_data=[None]*1
42-
# Store the indices to drop
43-
indices_to_drop = []
44-
# Iterate over DataFrame rows
45-
for index, row in data.iterrows():
46-
# Check if -99 is in the row values
47-
if -99 in row.values:
48-
# Add the index to the list of indices to drop
49-
indices_to_drop.append(index)
50-
# Drop the rows
51-
filtered_data = data.drop(indices_to_drop)
52-
return filtered_data
41+
# Find the indices to delete first
42+
indices_to_delete = []
43+
for i in range(len(data)):
44+
if -99 in data[i]:
45+
indices_to_delete.append(i)
46+
# Reverse the indices to delete to avoid index shifting issues
47+
indices_to_delete.reverse()
48+
# Delete rows from the end to avoid messing up the indices
49+
for index in indices_to_delete:
50+
data = np.delete(data, index, axis=0)
51+
return data
5352

5453
# Task 3 [10 marks]: Data statistics. Return the coefficient of variation for each feature; make sure to remove the rows with NaN values before doing this.
5554
def statistics_data(data):
5655
coefficient_of_variation=None
5756
data=filter_data(data)
5857
# removes the rows with nan values
59-
data = data.dropna()
58+
#data = data.dropna()
6059
# get the mean and standard deviation of the data
6160
mean = data.mean()
6261
std = data.std()
@@ -257,7 +256,7 @@ def important_feature(x_train, y_train,header_list):
257256

258257
'''
259258
References:
260-
- Task 2-3 is referenced from pandas documentation at https://pandas.pydata.org/docs/
259+
- Lines 49-51 are referenced from https://numpy.org/doc/stable/reference/generated/numpy.delete.html
261260
- Line 65: the coefficient of variation formula is referenced from https://en.wikipedia.org/wiki/Coefficient_of_variation
262261
- Lines 76-78 are inspired by https://stackoverflow.com/questions/46519539/how-to-select-all-non-nan-columns-and-non-nan-last-column-using-pandas
263262
- Lines 80-85 are inspired by https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

0 commit comments

Comments
 (0)