Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
The final version for assignment 1.
  • Loading branch information
Emma-An94 authored May 11, 2021
1 parent f8ff066 commit 13b7242
Showing 1 changed file with 184 additions and 0 deletions.
184 changes: 184 additions & 0 deletions assignment 1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas \n",
"import numpy\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.metrics import classification_report\n",
"import math\n",
"\n",
"# Read .csv data from local\n",
"# The original data is downloaded from: https://drive.google.com/file/d/1hQjDotqV6ZrGOTnQQ8ANor5zp3L04D6M/view?usp=sharing\n",
"# To test in another computer, please adjust the path below.\n",
"\n",
"path = r'D:\\Uni\\SAKI\\SAKI Exercise 1 - Transaction Classification - Data Set.csv'\n",
"df = pandas.read_csv(path,delimiter = \";\", index_col = 0)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# The features of one type are then converted into numbers, e.g. for the type weather, [sunny, rainning, sunny] will be converted into [0, 1, 0]\n",
"\n",
"def convert (mylist):\n",
" temp = mylist.copy()\n",
" for i in range(len(temp)):\n",
" temp[i] = i\n",
" return temp"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# After analysing the data, it is to know that only in df.iloc[:,i], where i equals 0, 6, and 7, remain emplty value.\n",
"# In the place where i equals 0, it is a math. nan, and in the place where i equals 6 or 7, it is an empty object.\n",
"# Therefore, I reset all of the empty value into 0.\n",
"\n",
"for i in range(len(df.dtypes)-1):\n",
" if numpy.any(df.iloc[:,i].isnull())==True:\n",
"\n",
" for j in range(len(df.iloc[:,0])):\n",
" if i == 0 and math.isnan(df.iloc[j,0]):\n",
" df.iloc[j,0] = 0\n",
" if i == 6 and df.iloc[j,6] is numpy.nan:\n",
" df.iloc[j,6] = 0\n",
" if i == 7 and df.iloc[j,7] is numpy.nan:\n",
" df.iloc[j,7] = 0"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# In features_list, dinstict features of every type (ecept \"label\") will be stored into one list, which then will be appended together into the features_list array.\n",
"# In features_index, the distinct features of every type will be converted into numbers.\n",
"\n",
"features_list = []\n",
"features_index = []\n",
"for i in range(len(df.dtypes)-1):\n",
" features_list.append(list(set(df.iloc[:,i])))\n",
" features_index.append(convert(features_list[i]))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# The distinct features of \"label\" will be stored into labels_list, and the converted value will be stored into labels_index\n",
"\n",
"labels_list = list(set(df.iloc[:,len(df.dtypes)-1]))\n",
"labels_index = convert(labels_list)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# According to the different features in different types, and their corresponding number, the whole dataframe is \"converted\" into numbers\n",
"\n",
"features = numpy.array([[0]*(len(df.dtypes)-1) for i in range(len(df.iloc[:,0]))])\n",
"labels = []\n",
"for i in range(len(df.dtypes)):\n",
" for j in range(len(df.iloc[:,0])):\n",
" if i != len(df.dtypes)-1:\n",
" place = features_list[i].index(df.iloc[j,i])\n",
" features[j][i] = features_index[i][place]\n",
" else:\n",
" place = labels_list.index(df.iloc[j,i])\n",
" labels.append(labels_index[place])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"classification_report: \n",
" precision recall f1-score support\n",
"\n",
" private 0.20 1.00 0.33 1\n",
"standardOfLiving 0.92 0.69 0.79 16\n",
" income 1.00 1.00 1.00 3\n",
" finance 1.00 0.89 0.94 9\n",
" leisure 0.73 0.89 0.80 27\n",
" living 1.00 0.29 0.44 7\n",
"\n",
" accuracy 0.78 63\n",
" macro avg 0.81 0.79 0.72 63\n",
" weighted avg 0.85 0.78 0.78 63\n",
"\n"
]
}
],
"source": [
"# Split the dataset into a 70%-30% train-test-process. The trained model will be used to predicte and compared with the test data.\n",
"# Result will be present in classification report.\n",
"\n",
"features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3,random_state=0)\n",
"\n",
"data = {\"data\":features_train,\"target\":labels_train}\n",
"gnb = GaussianNB()\n",
"gnb = gnb.fit(data[\"data\"],data[\"target\"])\n",
"\n",
"\n",
"prob_pred = gnb.predict_proba(features_test)\n",
"pred = gnb.predict(features_test)\n",
"print(\"classification_report: \\n\", classification_report(labels_test, pred, target_names = labels_list))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

0 comments on commit 13b7242

Please sign in to comment.