From f226a7c5354c9c7d899f9ab0fe40af90ab542f3b Mon Sep 17 00:00:00 2001 From: Israel Abebe Date: Thu, 14 Nov 2019 19:28:52 +0000 Subject: [PATCH] wrapper backwords done --- ...hod- with cross validation-backwords.ipynb | 249 ++++++++++++++++++ ...e selection methods - wrapper method.ipynb | 99 +++---- 2 files changed, 282 insertions(+), 66 deletions(-) create mode 100644 algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb diff --git a/algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb b/algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb new file mode 100644 index 0000000..9215f3a --- /dev/null +++ b/algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb @@ -0,0 +1,249 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "from sklearn.datasets import load_iris\n", + "from itertools import combinations\n", + "import numpy as np\n", + "import pandas as pd\n", + "import ast" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "class logisticregression():\n", + " def __init__(self,train_data,train_labels,lr=0.01,batch_size=None,epoch=10,print_every = 10):\n", + " dummy_once = np.ones((len(train_data),1))\n", + " self.train_data = np.hstack((dummy_once,train_data))\n", + " self.train_labels = train_labels\n", + " \n", + " self.params = np.zeros((len(self.train_data[0]),1))\n", + " \n", + " self.lr = lr\n", + " self.epoch = epoch\n", + " self.batch_size = batch_size\n", + " self.print_every = print_every\n", + " \n", + " def sigmoid(self,x):\n", + " return 1/(1+np.exp(-x))\n", + " \n", + " def cost(self,y,y_pred):\n", + " return -np.mean(y*np.log(y_pred)+(1-y)*np.log(1-y_pred))\n", + " \n", + " def gradient(self,y,y_pred,x):\n", + " return np.dot(x.T,(y_pred-y))\n", + " \n", + " def train(self):\n", + " for i in range(self.epoch):\n", + " y_pred = self.sigmoid(np.dot(self.train_data,self.params))\n", + " loss = self.cost(self.train_labels,y_pred)\n", + " \n", + " gra = self.gradient(self.train_labels,y_pred,self.train_data)\n", + " \n", + " self.params -= self.lr*gra\n", + " \n", + " if self.print_every:\n", + " if i%self.print_every == 0 or i == self.epoch-1:\n", + " print('Epoch : {} Loss: {}'.format(i,loss))\n", + " def predict(self,test_data):\n", + " result = self.sigmoid(np.dot(test_data,self.params[1:])+self.params[0])\n", + " result[result >= 0.5 ] = 1\n", + " result[result < 0.5 ] = 0\n", + " return result\n", + " \n", + " def evaluate(self,test_data,labels):\n", + " accuracy = accuracy_score(self.predict(test_data),labels)\n", + " return accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "def cross_validate(data,k=5):\n", + " if len(data)%k != 0:\n", + " print('cant vsplit',len(data),' by ',k)\n", + " return\n", + " \n", + " data_splitted = np.vsplit(data,k)\n", + " aggrigate_result = []\n", + " for i in range(len(data_splitted)):\n", + " train = []\n", + " test = []\n", + " items = [j for j in range(len(data_splitted)) if j !=i ]\n", + " test = data_splitted[i]\n", + " for item in items:\n", + " if len(train) == 0:\n", + " train = data_splitted[item]\n", + " else:\n", + " train = np.concatenate((train,data_splitted[item]), axis=0)\n", + " \n", + " logistic = logisticregression(train[:,:-1],train[:,-1:],epoch=10,print_every=None)\n", + " logistic.train()\n", + " \n", + " result = logistic.evaluate(test[:,:-1],test[:,-1:])\n", + " aggrigate_result.append(result)\n", + " \n", + " return aggrigate_result" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "def get_combinations(items,number):\n", + " comb_list = []\n", + " for c in combinations(items, number):\n", + " c = list(c)\n", + " c.sort()\n", + " comb_list.append(c)\n", + " \n", + " return comb_list" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "def wrapper_back_method(data,names,target_name,feature_count=2,cross_val_k = 5,visualize=True):\n", + " num = len(names)-1\n", + " classes = []\n", + " if num == len(names):\n", + " classes = [names]\n", + " else:\n", + " classes = get_combinations(names,num)\n", + " \n", + " \n", + " wrapper_output = {}\n", + " for i in classes: \n", + " cols = np.concatenate((i,target_name),axis=None)\n", + " sub_data = data[cols].values\n", + " result = cross_validate(sub_data,cross_val_k)\n", + " avarage = sum(result)/len(result)\n", + "\n", + " wrapper_output[str(i)]=avarage\n", + " \n", + " if visualize: \n", + " print(\"{} Classess\".format(len(classes)),i,'\\n',wrapper_output)\n", + " \n", + " wrapper_output_final = wrapper_output\n", + " wrapper_output = sorted(wrapper_output,key=wrapper_output.get)\n", + " \n", + " new_list = [wrapper_output[-1]]\n", + " \n", + " if visualize:\n", + " print('\\nSelected -- >\\n',new_list,wrapper_output_final[new_list[0]],'\\n')\n", + " \n", + " new_list = ast.literal_eval(new_list[0])\n", + " \n", + " if len(new_list)==feature_count:\n", + " return new_list\n", + " else:\n", + " new_list = wrapper_back_method(data,new_list,target_name=target_name,feature_count=feature_count,\n", + " cross_val_k=cross_val_k,visualize=visualize)\n", + " return new_list\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "data = load_iris()\n", + "\n", + "data = pd.concat((pd.DataFrame(data['data']),pd.DataFrame(data['target'])),axis=1)\n", + "data.columns= ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)','target']\n", + "data = data[(data['target']==0) | (data['target']==1)]\n", + "data_np = data.values\n", + "np.random.shuffle(data_np)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [], + "source": [ + "columns = list(data.columns)\n", + "columns.remove('target')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 Classess ['petal length (cm)', 'petal width (cm)', 'sepal width (cm)'] \n", + " {\"['petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.7, \"['petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.5, \"['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']\": 0.6399999999999999, \"['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']\": 1.0}\n", + "\n", + "Selected -- >\n", + " [\"['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']\"] 1.0 \n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wrapper_back_method(data,columns,['target'],3,10,visualize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb b/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb index 7c15d3a..11ca85d 100644 --- a/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb +++ b/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -24,14 +24,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "['sepal length (cm)',\n", + " 'sepal width (cm)',\n", + " 'petal length (cm)',\n", + " 'petal width (cm)']" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['feature_names']" + ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -89,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -103,35 +119,6 @@ " return comb_list" ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[1, 2],\n", - " [1, 3],\n", - " [1, 4],\n", - " [1, 5],\n", - " [2, 3],\n", - " [2, 4],\n", - " [2, 5],\n", - " [3, 4],\n", - " [3, 5],\n", - " [4, 5]]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_combinations([1,2,3,4,5],2)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -142,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 229, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -163,7 +150,9 @@ " final_data = pd_data[col_names].values\n", "\n", " train_data = final_data[:int(len(final_data)*train_size)]\n", - " test_data = final_data[:int(len(final_data)*test_size)]\n", + " test_data = final_data[int(len(final_data)*train_size):]\n", + " \n", + " \n", "\n", " train_data.shape,test_data.shape \n", " \n", @@ -180,14 +169,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 230, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -212,34 +194,19 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 232, + "execution_count": 88, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3 µs, sys: 1 µs, total: 4 µs\n", - "Wall time: 5.72 µs\n" - ] - }, { "data": { "text/plain": [ - "{\"['petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.975,\n", - " \"['petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.45,\n", - " \"['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']\": 0.525,\n", + "{\"['petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.55,\n", + " \"['petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.475,\n", + " \"['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']\": 1.0,\n", " \"['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']\": 1.0}" ] }, - "execution_count": 232, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" }