Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct typos in knn notebook #1

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions 初级算法梳理/Task6_knn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,14 @@
},
{
"cell_type": "code",
"execution_count": 298,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = pd.read_csv('datingTestSet2.txt',sep = '\\t',header = None)\n",
"X = np.array(data.iloc[:,:-1]) \n",
"y = np.array(data.iloc[:,-1])"
Expand All @@ -131,7 +133,7 @@
},
{
"cell_type": "code",
"execution_count": 229,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -150,7 +152,7 @@
},
{
"cell_type": "code",
"execution_count": 231,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -165,7 +167,7 @@
" (m,d) = np.shape(X_test) #测试集的数量和特征数\n",
" y_pred = np.zeros((m)) #将预测的标签初始化为0\n",
" for i in range(m): \n",
" distance = np.sum(np.abs(self.Xtrain - X_test[i,:]),axis = 1) #求距离的绝对之和\n",
" distance = np.sum(np.abs(self.X_train - X_test[i,:]),axis = 1) #求距离的绝对之和\n",
" min_index = np.argmin(distance) #找到最近点的索引\n",
" y_pred[i] = self.y_train[min_index] #将最近点的分类给新数据标记\n",
" return y_pred"
Expand Down Expand Up @@ -243,9 +245,9 @@
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from collections import Counter\n",
"class KNN:\n",
" def __init__(self,k=1,metric ='euclidean'): #默认距离算法为欧式距离,默认最近邻\n",
" def __init__(self,k=1,metric='euclidean'): #默认距离算法为欧式距离,默认最近邻\n",
" self.metric = metric\n",
" self.k = k\n",
" def train(self,X_train,y_train):\n",
Expand All @@ -258,7 +260,7 @@
" \n",
" \n",
" #============================= show me your code =======================\n",
" return ypred"
" return y_pred"
]
},
{
Expand All @@ -281,7 +283,7 @@
"source": [
"那么到底如何选择K值呢?我们可以选择在测试集中表现最好的K值。 \n",
" \n",
"本任务中我们直接调用sklearn中的kFold函数,将数据集进行k折验证,取每次验证的评分平均值作为此K值的误差评分。(这两个k表示的意思不一样,请留意)"
"本任务中我们直接调用sklearn中的KFold函数,将数据集进行k折验证,取每次验证的评分平均值作为此K值的误差评分。(这两个k表示的意思不一样,请留意)"
]
},
{
Expand Down Expand Up @@ -316,7 +318,7 @@
"source": [
"from collections import Counter\n",
"class KNN:\n",
" def __init__(self,k,metric ='euclidean'):\n",
" def __init__(self,k,metric='euclidean'):\n",
" pass\n",
" self.metric = metric\n",
" self.k = k\n",
Expand Down Expand Up @@ -386,7 +388,7 @@
"#循环,取k=1到k=31,查看误差效果\n",
"for k in k_range:\n",
" knn = KNeighborsClassifier(n_neighbors=k)\n",
" #cv参数决定数据集划分比例,这里是按照5:1划分训练集和测试集\n",
" #cv参数决定数据集划分比例,这里是按照4:1划分训练集和测试集\n",
" scores = cross_val_score(knn, X_std, y, cv=5, scoring='accuracy')\n",
" k_error.append(1 - scores.mean())\n",
"\n",
Expand Down Expand Up @@ -611,4 +613,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}