From f226a7c5354c9c7d899f9ab0fe40af90ab542f3b Mon Sep 17 00:00:00 2001
From: Israel Abebe <se.israel.abebe@gmail.com>
Date: Thu, 14 Nov 2019 19:28:52 +0000
Subject: [PATCH] wrapper backwards done

---
 ...hod- with cross validation-backwords.ipynb | 249 ++++++++++++++++++
 ...e selection methods - wrapper method.ipynb |  99 +++----
 2 files changed, 282 insertions(+), 66 deletions(-)
 create mode 100644 algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb

diff --git a/algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb b/algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb
new file mode 100644
index 0000000..9215f3a
--- /dev/null
+++ b/algorithms/wrapper_methods/Feature selection methods - wrapper method- with cross validation-backwords.ipynb	
@@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import  accuracy_score\n",
+    "from sklearn.datasets import load_iris\n",
+    "from itertools import combinations\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import ast"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class logisticregression():\n",
+    "    def __init__(self,train_data,train_labels,lr=0.01,batch_size=None,epoch=10,print_every = 10):\n",
+    "        dummy_once = np.ones((len(train_data),1))\n",
+    "        self.train_data = np.hstack((dummy_once,train_data))\n",
+    "        self.train_labels = train_labels\n",
+    "        \n",
+    "        self.params = np.zeros((len(self.train_data[0]),1))\n",
+    "        \n",
+    "        self.lr = lr\n",
+    "        self.epoch = epoch\n",
+    "        self.batch_size = batch_size\n",
+    "        self.print_every = print_every\n",
+    "        \n",
+    "    def sigmoid(self,x):\n",
+    "        return 1/(1+np.exp(-x))\n",
+    "    \n",
+    "    def cost(self,y,y_pred):\n",
+    "        return -np.mean(y*np.log(y_pred)+(1-y)*np.log(1-y_pred))\n",
+    "    \n",
+    "    def gradient(self,y,y_pred,x):\n",
+    "        return np.dot(x.T,(y_pred-y))\n",
+    "    \n",
+    "    def train(self):\n",
+    "        for i in range(self.epoch):\n",
+    "            y_pred = self.sigmoid(np.dot(self.train_data,self.params))\n",
+    "            loss = self.cost(self.train_labels,y_pred)\n",
+    "            \n",
+    "            gra = self.gradient(self.train_labels,y_pred,self.train_data)\n",
+    "            \n",
+    "            self.params -= self.lr*gra\n",
+    "            \n",
+    "            if self.print_every:\n",
+    "                if i%self.print_every == 0 or i == self.epoch-1:\n",
+    "                    print('Epoch : {}  Loss: {}'.format(i,loss))\n",
+    "    def predict(self,test_data):\n",
+    "        result = self.sigmoid(np.dot(test_data,self.params[1:])+self.params[0])\n",
+    "        result[result >= 0.5 ] = 1\n",
+    "        result[result < 0.5 ] = 0\n",
+    "        return result\n",
+    "    \n",
+    "    def evaluate(self,test_data,labels):\n",
+    "        accuracy = accuracy_score(self.predict(test_data),labels)\n",
+    "        return accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cross_validate(data,k=5):\n",
+    "    if len(data)%k != 0:\n",
+    "        print('cant vsplit',len(data),' by ',k)\n",
+    "        return\n",
+    "    \n",
+    "    data_splitted = np.vsplit(data,k)\n",
+    "    aggrigate_result = []\n",
+    "    for i in range(len(data_splitted)):\n",
+    "        train = []\n",
+    "        test = []\n",
+    "        items = [j for j in range(len(data_splitted)) if j !=i ]\n",
+    "        test = data_splitted[i]\n",
+    "        for item in items:\n",
+    "            if len(train) == 0:\n",
+    "                train = data_splitted[item]\n",
+    "            else:\n",
+    "                train = np.concatenate((train,data_splitted[item]), axis=0)\n",
+    "            \n",
+    "        logistic = logisticregression(train[:,:-1],train[:,-1:],epoch=10,print_every=None)\n",
+    "        logistic.train()\n",
+    "        \n",
+    "        result = logistic.evaluate(test[:,:-1],test[:,-1:])\n",
+    "        aggrigate_result.append(result)\n",
+    "        \n",
+    "    return aggrigate_result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_combinations(items,number):\n",
+    "    comb_list = []\n",
+    "    for c in combinations(items, number):\n",
+    "        c = list(c)\n",
+    "        c.sort()\n",
+    "        comb_list.append(c)\n",
+    "        \n",
+    "    return comb_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def wrapper_back_method(data,names,target_name,feature_count=2,cross_val_k = 5,visualize=True):\n",
+    "    num = len(names)-1\n",
+    "    classes = []\n",
+    "    if num == len(names):\n",
+    "        classes = [names]\n",
+    "    else:\n",
+    "        classes = get_combinations(names,num)\n",
+    "          \n",
+    "        \n",
+    "    wrapper_output = {}\n",
+    "    for i in classes: \n",
+    "        cols = np.concatenate((i,target_name),axis=None)\n",
+    "        sub_data = data[cols].values\n",
+    "        result = cross_validate(sub_data,cross_val_k)\n",
+    "        avarage = sum(result)/len(result)\n",
+    "\n",
+    "        wrapper_output[str(i)]=avarage\n",
+    "    \n",
+    "    if visualize:       \n",
+    "        print(\"{} Classess\".format(len(classes)),i,'\\n',wrapper_output)\n",
+    "    \n",
+    "    wrapper_output_final = wrapper_output\n",
+    "    wrapper_output = sorted(wrapper_output,key=wrapper_output.get)\n",
+    "    \n",
+    "    new_list = [wrapper_output[-1]]\n",
+    "    \n",
+    "    if visualize:\n",
+    "        print('\\nSelected -- >\\n',new_list,wrapper_output_final[new_list[0]],'\\n')\n",
+    "        \n",
+    "    new_list = ast.literal_eval(new_list[0])\n",
+    "    \n",
+    "    if len(new_list)==feature_count:\n",
+    "        return new_list\n",
+    "    else:\n",
+    "        new_list = wrapper_back_method(data,new_list,target_name=target_name,feature_count=feature_count,\n",
+    "                                       cross_val_k=cross_val_k,visualize=visualize)\n",
+    "        return new_list\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = load_iris()\n",
+    "\n",
+    "data = pd.concat((pd.DataFrame(data['data']),pd.DataFrame(data['target'])),axis=1)\n",
+    "data.columns= ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)','target']\n",
+    "data = data[(data['target']==0) | (data['target']==1)]\n",
+    "data_np = data.values\n",
+    "np.random.shuffle(data_np)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "columns = list(data.columns)\n",
+    "columns.remove('target')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4 Classess ['petal length (cm)', 'petal width (cm)', 'sepal width (cm)'] \n",
+      " {\"['petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.7, \"['petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.5, \"['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']\": 0.6399999999999999, \"['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']\": 1.0}\n",
+      "\n",
+      "Selected -- >\n",
+      " [\"['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']\"] 1.0 \n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']"
+      ]
+     },
+     "execution_count": 125,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wrapper_back_method(data,columns,['target'],3,10,visualize=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb b/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb
index 7c15d3a..11ca85d 100644
--- a/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb	
+++ b/algorithms/wrapper_methods/Feature selection methods - wrapper method.ipynb	
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 80,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 81,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,14 +24,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 82,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['sepal length (cm)',\n",
+       " 'sepal width (cm)',\n",
+       " 'petal length (cm)',\n",
+       " 'petal width (cm)']"
+      ]
+     },
+     "execution_count": 82,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['feature_names']"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 83,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -89,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 84,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -103,35 +119,6 @@
     "    return comb_list"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[[1, 2],\n",
-       " [1, 3],\n",
-       " [1, 4],\n",
-       " [1, 5],\n",
-       " [2, 3],\n",
-       " [2, 4],\n",
-       " [2, 5],\n",
-       " [3, 4],\n",
-       " [3, 5],\n",
-       " [4, 5]]"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "get_combinations([1,2,3,4,5],2)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -142,7 +129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 229,
+   "execution_count": 86,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -163,7 +150,9 @@
     "    final_data = pd_data[col_names].values\n",
     "\n",
     "    train_data = final_data[:int(len(final_data)*train_size)]\n",
-    "    test_data = final_data[:int(len(final_data)*test_size)]\n",
+    "    test_data = final_data[int(len(final_data)*train_size):]\n",
+    "    \n",
+    "    \n",
     "\n",
     "    train_data.shape,test_data.shape \n",
     "    \n",
@@ -180,14 +169,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 230,
+   "execution_count": 87,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -212,34 +194,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 232,
+   "execution_count": 88,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 3 µs, sys: 1 µs, total: 4 µs\n",
-      "Wall time: 5.72 µs\n"
-     ]
-    },
     {
      "data": {
       "text/plain": [
-       "{\"['petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.975,\n",
-       " \"['petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.45,\n",
-       " \"['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']\": 0.525,\n",
+       "{\"['petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.55,\n",
+       " \"['petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']\": 0.475,\n",
+       " \"['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']\": 1.0,\n",
        " \"['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']\": 1.0}"
       ]
      },
-     "execution_count": 232,
+     "execution_count": 88,
      "metadata": {},
      "output_type": "execute_result"
     }