-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The final version for assignment 1.
- Loading branch information
Showing
1 changed file
with
184 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas \n", | ||
"import numpy\n", | ||
"from sklearn.naive_bayes import GaussianNB\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn.metrics import accuracy_score\n", | ||
"from sklearn.metrics import confusion_matrix\n", | ||
"from sklearn.metrics import classification_report\n", | ||
"import math\n", | ||
"\n", | ||
"# Read .csv data from local\n", | ||
"# The original data is downloaded from: https://drive.google.com/file/d/1hQjDotqV6ZrGOTnQQ8ANor5zp3L04D6M/view?usp=sharing\n", | ||
"# To test in another computer, please adjust the path below.\n", | ||
"\n", | ||
"path = r'D:\\Uni\\SAKI\\SAKI Exercise 1 - Transaction Classification - Data Set.csv'\n", | ||
"df = pandas.read_csv(path,delimiter = \";\", index_col = 0)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# The features of one type are then converted into numbers, e.g. for the type weather, [sunny, rainning, sunny] will be converted into [0, 1, 0]\n", | ||
"\n", | ||
"def convert (mylist):\n", | ||
" temp = mylist.copy()\n", | ||
" for i in range(len(temp)):\n", | ||
" temp[i] = i\n", | ||
" return temp" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# After analysing the data, it is to know that only in df.iloc[:,i], where i equals 0, 6, and 7, remain emplty value.\n", | ||
"# In the place where i equals 0, it is a math. nan, and in the place where i equals 6 or 7, it is an empty object.\n", | ||
"# Therefore, I reset all of the empty value into 0.\n", | ||
"\n", | ||
"for i in range(len(df.dtypes)-1):\n", | ||
" if numpy.any(df.iloc[:,i].isnull())==True:\n", | ||
"\n", | ||
" for j in range(len(df.iloc[:,0])):\n", | ||
" if i == 0 and math.isnan(df.iloc[j,0]):\n", | ||
" df.iloc[j,0] = 0\n", | ||
" if i == 6 and df.iloc[j,6] is numpy.nan:\n", | ||
" df.iloc[j,6] = 0\n", | ||
" if i == 7 and df.iloc[j,7] is numpy.nan:\n", | ||
" df.iloc[j,7] = 0" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": { | ||
"scrolled": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# In features_list, dinstict features of every type (ecept \"label\") will be stored into one list, which then will be appended together into the features_list array.\n", | ||
"# In features_index, the distinct features of every type will be converted into numbers.\n", | ||
"\n", | ||
"features_list = []\n", | ||
"features_index = []\n", | ||
"for i in range(len(df.dtypes)-1):\n", | ||
" features_list.append(list(set(df.iloc[:,i])))\n", | ||
" features_index.append(convert(features_list[i]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# The distinct features of \"label\" will be stored into labels_list, and the converted value will be stored into labels_index\n", | ||
"\n", | ||
"labels_list = list(set(df.iloc[:,len(df.dtypes)-1]))\n", | ||
"labels_index = convert(labels_list)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# According to the different features in different types, and their corresponding number, the whole dataframe is \"converted\" into numbers\n", | ||
"\n", | ||
"features = numpy.array([[0]*(len(df.dtypes)-1) for i in range(len(df.iloc[:,0]))])\n", | ||
"labels = []\n", | ||
"for i in range(len(df.dtypes)):\n", | ||
" for j in range(len(df.iloc[:,0])):\n", | ||
" if i != len(df.dtypes)-1:\n", | ||
" place = features_list[i].index(df.iloc[j,i])\n", | ||
" features[j][i] = features_index[i][place]\n", | ||
" else:\n", | ||
" place = labels_list.index(df.iloc[j,i])\n", | ||
" labels.append(labels_index[place])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"classification_report: \n", | ||
" precision recall f1-score support\n", | ||
"\n", | ||
" private 0.20 1.00 0.33 1\n", | ||
"standardOfLiving 0.92 0.69 0.79 16\n", | ||
" income 1.00 1.00 1.00 3\n", | ||
" finance 1.00 0.89 0.94 9\n", | ||
" leisure 0.73 0.89 0.80 27\n", | ||
" living 1.00 0.29 0.44 7\n", | ||
"\n", | ||
" accuracy 0.78 63\n", | ||
" macro avg 0.81 0.79 0.72 63\n", | ||
" weighted avg 0.85 0.78 0.78 63\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Split the dataset into a 70%-30% train-test-process. The trained model will be used to predicte and compared with the test data.\n", | ||
"# Result will be present in classification report.\n", | ||
"\n", | ||
"features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3,random_state=0)\n", | ||
"\n", | ||
"data = {\"data\":features_train,\"target\":labels_train}\n", | ||
"gnb = GaussianNB()\n", | ||
"gnb = gnb.fit(data[\"data\"],data[\"target\"])\n", | ||
"\n", | ||
"\n", | ||
"prob_pred = gnb.predict_proba(features_test)\n", | ||
"pred = gnb.predict(features_test)\n", | ||
"print(\"classification_report: \\n\", classification_report(labels_test, pred, target_names = labels_list))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |