Skip to content

Latest commit

 

History

History
1688 lines (1534 loc) · 38.5 KB

README.md

File metadata and controls

1688 lines (1534 loc) · 38.5 KB

Previsão de classes de Ransomware

Nome da Família ID

-----------------------------------------

Goodware            0
Critroni            1
CryptLocker         2
CryptoWall          3
KOLLAH              4
Kovter              5
Locker              6
MATSNU              7
PGPCODER            8
Reveton             9
TeslaCrypt         10
Trojan-Ransom      11
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from keras.models import Sequential
from sklearn import preprocessing
from keras.layers import Dense
from sklearn import metrics
import pandasql as ps
from utilsIA import *
import pandas as pd
import numpy as np
import itertools

Leitura do dataset
# Load the whole Excel workbook into a DataFrame.
path = 'Ransoware.xlsx'
dataset = pd.read_excel(path)
# Bare expression: displays the DataFrame when run as a notebook cell.
dataset
Ransomware Classe 4 6 8 9 10 11 12 14 ... 29079 29218 29287 29758 29769 29770 29796 29903 30200 30285
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 1 1
1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
2 1 4 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
3 0 0 1 1 1 1 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 1 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1519 1 4 0 1 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 1 1
1520 0 0 1 1 1 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1521 0 0 1 1 1 0 0 0 0 0 ... 0 0 0 0 1 0 1 0 0 0
1522 1 4 0 0 0 0 0 0 0 0 ... 1 0 0 0 1 0 0 0 0 0
1523 0 0 1 1 1 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

1524 rows × 931 columns

Redução da dimensão do dataset usando PCA

  • Reduzindo as 929 colunas de modo a manter alta representatividade dos dados originais
    # Layout of the raw table: the first `train_row` rows feed the PCA fit,
    # the label sits in column `column_label`, and `columns` feature columns
    # start at `column_init`.
    train_row, columns = 900, 929
    column_init, column_label = 2, 1

    feature_cols = slice(column_init, columns + column_init)
    train_part = dataset.iloc[:train_row]
    test_part = dataset.iloc[train_row:]

    train_label = train_part.iloc[:, column_label]
    train_data = train_part.iloc[:, feature_cols]

    # The remaining rows are re-indexed from 0.
    test_label = test_part.iloc[:, column_label].reset_index(drop=True)
    test_data = test_part.iloc[:, feature_cols].reset_index(drop=True)

    # Fit the projection on the first part only, then apply it to both parts.
    n_comp = 165
    pca = PCA(n_components=n_comp)
    X_train_pca = pca.fit_transform(train_data.values)
    X_test_pca = pca.transform(test_data.values)
    # Fraction of the total variance retained by the kept components.
    print(f'Representação: {pca.explained_variance_ratio_.sum()}')
    Representação: 0.9715453985161124
    
    # Wrap the PCA outputs in DataFrames so they can be concatenated.
    X_train_pca = pd.DataFrame(X_train_pca)
    X_test_pca = pd.DataFrame(X_test_pca)
    # Stack the two parts; reset_index() without drop=True keeps each part's
    # old row labels in a new 'index' column.
    df = pd.concat([X_train_pca, X_test_pca]).reset_index()
    # Assignment aligns on the fresh 0..N-1 index, so row i receives the class of dataset row i.
    df['Classe']  = dataset['Classe']
    # Bare expression: displays the DataFrame when run as a notebook cell.
    df
    index 0 1 2 3 4 5 6 7 8 ... 156 157 158 159 160 161 162 163 164 Classe
    0 0 -4.821125 -0.487440 -2.201017 1.152447 -0.033623 1.352974 -0.578600 0.059564 0.038959 ... -0.053499 0.311921 -0.050478 -0.117299 0.066474 -0.203876 0.246825 -0.010021 -0.282411 0
    1 1 -3.116892 -0.320086 -0.468679 0.077390 -0.733340 -0.059672 0.296803 -0.416835 -0.452211 ... -0.268512 0.186507 0.044608 0.237524 0.141487 -0.163366 -0.133308 -0.139559 0.031824 0
    2 2 -4.817464 -0.726339 -2.223047 0.763878 0.408207 0.848962 -0.412907 0.157133 0.103711 ... 0.115069 -0.030130 0.102367 0.148219 -0.342858 0.129300 -0.067159 -0.014187 0.124248 4
    3 3 5.331648 -2.676628 1.265138 0.893076 1.153713 -0.109961 0.198922 -0.561980 -0.057770 ... 0.036511 -0.175244 -0.323986 -0.024850 0.179646 -0.095332 -0.081275 -0.192800 0.104840 0
    4 4 4.054361 1.525786 -0.874177 -0.684646 -0.014614 -0.708375 0.977450 0.126336 -0.524389 ... -0.058353 -0.232046 -0.426571 0.095166 -0.060333 -0.154573 -0.096402 -0.048692 0.468370 0
    ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
    1519 619 -3.667744 -0.287221 -0.377622 -0.285508 -0.173065 -0.097861 -0.031989 -0.025919 0.053390 ... -0.033192 0.723736 -0.231849 -0.136856 -0.090843 0.277584 -0.120122 -0.174936 -0.394245 4
    1520 620 1.207262 4.874456 3.782525 1.782034 0.379592 -0.096894 -0.562963 1.003049 -3.265090 ... -0.010782 -0.362701 -0.555329 0.299275 0.414574 0.270008 -0.037364 -0.070711 0.199022 0
    1521 621 3.961603 1.436517 -1.136199 -0.921292 -0.807843 0.733954 1.622033 -2.877845 0.001690 ... -0.072145 -0.226455 0.478052 0.026402 0.172154 0.158334 -0.144294 -0.308613 -0.207756 0
    1522 622 -4.470373 -0.806404 -1.547585 0.073598 0.697804 0.348658 -0.494289 0.044260 0.074465 ... -0.149596 -0.241636 0.059691 0.157608 0.282329 0.006757 -0.122828 -0.280303 -0.022284 4
    1523 623 5.259862 -3.403548 0.995541 1.227109 0.961682 0.304587 -0.072982 -0.855472 0.098563 ... 0.031373 0.019168 -0.094154 0.007101 -0.000693 0.014536 -0.059940 -0.031743 0.070975 0

    1524 rows × 167 columns

    Verificando a quantidade de cada Classe dentro do dataset
    # Row count per 'Classe' value in the raw table, most frequent first.
    q1 = """
    SELECT distinct count(*) as Quantidade,  Classe
    FROM dataset 
    group by Classe
    order by Quantidade desc"""
    
    # The query names the DataFrame `dataset` as its table; pandasql runs it and returns a DataFrame.
    ps.sqldf(q1)
    Quantidade Classe
    0 942 0
    1 107 2
    2 97 6
    3 90 9
    4 64 5
    5 59 7
    6 50 1
    7 46 3
    8 34 11
    9 25 4
    10 6 10
    11 4 8

    Exclusão das classes com poucas ocorrências

  • Treinamento insuficiente do modelo
  • Teste insuficiente do modelo
  • Evitar alta acurácia por aleatoriedade
  • df.drop(df.loc[df['Classe']==8].index, inplace=True)
    df.drop(df.loc[df['Classe']==10].index, inplace=True)
    df.drop(df.loc[df['Classe']==4].index, inplace=True)
    df.drop('index', 1, inplace=True)
    df
    C:\Users\Edno\AppData\Local\Temp/ipykernel_5244/4213288135.py:4: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
      df.drop('index', 1, inplace=True)
    
    0 1 2 3 4 5 6 7 8 9 ... 156 157 158 159 160 161 162 163 164 Classe
    0 -4.821125 -0.487440 -2.201017 1.152447 -0.033623 1.352974 -0.578600 0.059564 0.038959 0.154728 ... -0.053499 0.311921 -0.050478 -0.117299 0.066474 -0.203876 0.246825 -0.010021 -0.282411 0
    1 -3.116892 -0.320086 -0.468679 0.077390 -0.733340 -0.059672 0.296803 -0.416835 -0.452211 0.099741 ... -0.268512 0.186507 0.044608 0.237524 0.141487 -0.163366 -0.133308 -0.139559 0.031824 0
    3 5.331648 -2.676628 1.265138 0.893076 1.153713 -0.109961 0.198922 -0.561980 -0.057770 1.013530 ... 0.036511 -0.175244 -0.323986 -0.024850 0.179646 -0.095332 -0.081275 -0.192800 0.104840 0
    4 4.054361 1.525786 -0.874177 -0.684646 -0.014614 -0.708375 0.977450 0.126336 -0.524389 -0.025725 ... -0.058353 -0.232046 -0.426571 0.095166 -0.060333 -0.154573 -0.096402 -0.048692 0.468370 0
    5 1.761522 1.777536 1.389631 -0.734398 -1.769632 -1.707925 0.010637 1.006003 -1.032105 0.677926 ... -0.066147 -0.053367 -0.184165 -0.156341 0.059520 -0.043059 -0.013471 -0.092569 0.110692 0
    ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
    1516 5.471322 1.039605 -4.132083 0.309695 -2.036437 0.087284 0.457468 -0.841350 -0.999483 -1.548910 ... 0.175220 -0.015662 -0.196201 -0.293100 -0.004059 -0.188156 0.085008 0.043024 0.112271 0
    1517 5.177134 -3.400939 1.034620 1.240165 0.937027 0.308304 -0.070532 -0.924902 0.166870 1.175099 ... 0.038146 -0.034900 -0.006361 -0.103692 -0.046950 0.016334 -0.074340 -0.039927 -0.009890 0
    1520 1.207262 4.874456 3.782525 1.782034 0.379592 -0.096894 -0.562963 1.003049 -3.265090 4.992138 ... -0.010782 -0.362701 -0.555329 0.299275 0.414574 0.270008 -0.037364 -0.070711 0.199022 0
    1521 3.961603 1.436517 -1.136199 -0.921292 -0.807843 0.733954 1.622033 -2.877845 0.001690 2.131293 ... -0.072145 -0.226455 0.478052 0.026402 0.172154 0.158334 -0.144294 -0.308613 -0.207756 0
    1523 5.259862 -3.403548 0.995541 1.227109 0.961682 0.304587 -0.072982 -0.855472 0.098563 1.115860 ... 0.031373 0.019168 -0.094154 0.007101 -0.000693 0.014536 -0.059940 -0.031743 0.070975 0

    1489 rows × 166 columns

    # Row count per 'Classe' value in the filtered table `df`, most frequent first.
    q1 = """
    SELECT distinct count(*) as Quantidade,  Classe
    FROM df 
    group by Classe
    order by Quantidade desc"""
    
    # The query names the DataFrame `df` as its table; pandasql runs it and returns a DataFrame.
    ps.sqldf(q1)
    Quantidade Classe
    0 942 0
    1 107 2
    2 97 6
    3 90 9
    4 64 5
    5 59 7
    6 50 1
    7 46 3
    8 34 11

    Redução da quantidade de registros da categoria '0' para redução da chance de
    enviesamento do modelo para esta classe
    # Project helper; presumably caps each 'Classe' group at 107 rows — TODO confirm against utilsIA.
    df_balanced = ProcessingData.balancedData(df, 'Classe', 107)
    # Unbalanced alternative, left disabled:
    #df_balanced = df

    png

    # Project helper; reports how many rows each 'Classe' value has in the balanced frame.
    ProcessingData.showLabelsQtd(df_balanced, 'Classe')
    Quantidade de itens nas 9 categorias:
    0     107
    2     107
    6      97
    9      90
    5      64
    7      59
    1      50
    3      46
    11     34
    Name: Classe, dtype: int64
    
    # Put the 'Classe' column first, followed by the PCA components 0..n_comp-1.
    col = ['Classe', *range(n_comp)]
    df_balanced = df_balanced[col]
    df_balanced.head()
    Classe 0 1 2 3 4 5 6 7 8 ... 155 156 157 158 159 160 161 162 163 164
    1112 0 5.050851 -3.437326 0.880903 1.284616 1.039539 0.332499 -0.112654 -0.929544 0.185957 ... -0.003749 0.007185 -0.091705 0.030639 -0.184883 -0.014107 0.023129 -0.065869 -0.118227 0.052917
    904 0 -4.744892 -1.090630 -1.894263 0.623459 0.556025 0.847031 -0.458654 0.282898 0.140678 ... -0.006809 0.061832 0.089955 -0.009707 0.050636 -0.008242 0.017653 0.002273 0.099682 -0.081503
    1018 0 -0.894416 1.380315 0.817547 0.487481 -3.652643 -0.308384 0.959425 0.111086 -0.468956 ... 0.020760 -0.028250 0.168435 -0.041256 -0.425442 0.265058 0.060733 0.246020 0.026194 -0.033660
    294 0 5.585781 -3.418598 1.011767 1.184437 1.192173 0.112080 -0.042108 -0.783191 -0.261409 ... 0.015675 -0.016180 -0.023752 0.092484 -0.102187 -0.064161 0.023080 0.000156 -0.029936 -0.090316
    1355 0 4.773488 0.741391 -3.426230 0.476568 -1.845094 -0.454420 1.943682 -1.763280 -0.361983 ... -0.238522 -0.164905 0.178833 0.062664 0.089606 -0.098259 0.022611 0.108016 -0.008297 0.028628

    5 rows × 166 columns

    Aplicação do One-Hot Encoding
    # Number of distinct classes present in the balanced frame.
    n_classes = df_balanced['Classe'].astype(int).nunique()

    # Feature matrix: every column after the first one.
    datas = df_balanced.iloc[:, 1:].values

    # The encoder takes 2-D input, so each label becomes a single-element row.
    labels = [[la] for la in df_balanced.Classe.to_list()]
    enc = preprocessing.OneHotEncoder()
    labels_hot = enc.fit_transform(labels).toarray()

    Separando os dados para teste e treino
    # Hold out 30% of the samples; the fixed seed makes the split repeatable.
    # NOTE(review): the split is not stratified (a stratify argument was left disabled here).
    X_train, X_test, y_train, y_test = train_test_split(
        datas, labels_hot, test_size=.3, random_state=0
    )
    y_train, y_test = np.array(y_train), np.array(y_test)

    Modelo da rede neural utilizada
    # Dense classifier: four ReLU hidden layers followed by a softmax over the classes.
    model = Sequential([
        Dense(380, activation='relu', input_dim=n_comp),
        Dense(160, activation='relu'),
        Dense(90, activation='relu'),
        Dense(45, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Stop after 25 epochs without a val_accuracy improvement, and keep the
    # best-scoring weights on disk.
    # NOTE(review): the test split is passed as validation data, so early
    # stopping and checkpoint selection both see the test set.
    early_stop = EarlyStopping(monitor='val_accuracy', patience=25)
    checkpoint = ModelCheckpoint(filepath='best_model.h5', monitor='val_accuracy', save_best_only=True)
    callbacks = [early_stop, checkpoint]

    history = model.fit(
        X_train,
        y_train,
        epochs=200,
        callbacks=callbacks,
        validation_data=(X_test, y_test),
        verbose=1,
    )
    # Reload the checkpointed weights instead of keeping the last epoch's.
    model.load_weights('best_model.h5')
    Epoch 1/200
    15/15 [==============================] - 1s 23ms/step - loss: 1.9844 - accuracy: 0.2910 - val_loss: 1.7298 - val_accuracy: 0.4162
    Epoch 2/200
    15/15 [==============================] - 0s 9ms/step - loss: 1.4077 - accuracy: 0.6105 - val_loss: 1.2256 - val_accuracy: 0.5888
    Epoch 3/200
    15/15 [==============================] - 0s 10ms/step - loss: 0.9652 - accuracy: 0.7068 - val_loss: 1.0074 - val_accuracy: 0.6853
    Epoch 4/200
    15/15 [==============================] - 0s 10ms/step - loss: 0.7094 - accuracy: 0.7746 - val_loss: 0.8882 - val_accuracy: 0.7157
    Epoch 5/200
    15/15 [==============================] - 0s 9ms/step - loss: 0.5448 - accuracy: 0.8184 - val_loss: 0.7992 - val_accuracy: 0.7665
    Epoch 6/200
    15/15 [==============================] - 0s 9ms/step - loss: 0.3985 - accuracy: 0.8884 - val_loss: 0.7419 - val_accuracy: 0.7716
    Epoch 7/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.3295 - accuracy: 0.9015 - val_loss: 0.7838 - val_accuracy: 0.7513
    Epoch 8/200
    15/15 [==============================] - 0s 9ms/step - loss: 0.2932 - accuracy: 0.9103 - val_loss: 0.7610 - val_accuracy: 0.8122
    Epoch 9/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.2505 - accuracy: 0.9212 - val_loss: 0.7309 - val_accuracy: 0.7766
    Epoch 10/200
    15/15 [==============================] - 0s 9ms/step - loss: 0.2223 - accuracy: 0.9344 - val_loss: 0.7284 - val_accuracy: 0.8173
    Epoch 11/200
    15/15 [==============================] - 0s 9ms/step - loss: 0.1786 - accuracy: 0.9475 - val_loss: 0.7725 - val_accuracy: 0.8223
    Epoch 12/200
    15/15 [==============================] - 0s 6ms/step - loss: 0.1751 - accuracy: 0.9453 - val_loss: 0.7342 - val_accuracy: 0.8020
    Epoch 13/200
    15/15 [==============================] - 0s 9ms/step - loss: 0.1385 - accuracy: 0.9562 - val_loss: 0.7827 - val_accuracy: 0.8325
    Epoch 14/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.1194 - accuracy: 0.9606 - val_loss: 0.7634 - val_accuracy: 0.8274
    Epoch 15/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.1201 - accuracy: 0.9628 - val_loss: 0.7973 - val_accuracy: 0.8173
    Epoch 16/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.1045 - accuracy: 0.9606 - val_loss: 0.8194 - val_accuracy: 0.8223
    Epoch 17/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0962 - accuracy: 0.9781 - val_loss: 0.8739 - val_accuracy: 0.8223
    Epoch 18/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.1099 - accuracy: 0.9628 - val_loss: 0.8371 - val_accuracy: 0.8325
    Epoch 19/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0886 - accuracy: 0.9759 - val_loss: 0.8321 - val_accuracy: 0.8223
    Epoch 20/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0760 - accuracy: 0.9759 - val_loss: 0.8440 - val_accuracy: 0.8325
    Epoch 21/200
    15/15 [==============================] - 0s 9ms/step - loss: 0.0788 - accuracy: 0.9803 - val_loss: 0.8539 - val_accuracy: 0.8376
    Epoch 22/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0632 - accuracy: 0.9825 - val_loss: 0.8903 - val_accuracy: 0.8173
    Epoch 23/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0817 - accuracy: 0.9716 - val_loss: 0.8795 - val_accuracy: 0.8274
    Epoch 24/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0654 - accuracy: 0.9825 - val_loss: 0.9905 - val_accuracy: 0.8274
    Epoch 25/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0876 - accuracy: 0.9716 - val_loss: 0.8827 - val_accuracy: 0.8325
    Epoch 26/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0677 - accuracy: 0.9825 - val_loss: 0.9454 - val_accuracy: 0.8173
    Epoch 27/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0775 - accuracy: 0.9759 - val_loss: 0.9085 - val_accuracy: 0.8274
    Epoch 28/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0712 - accuracy: 0.9781 - val_loss: 0.9352 - val_accuracy: 0.8325
    Epoch 29/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0671 - accuracy: 0.9825 - val_loss: 0.9524 - val_accuracy: 0.8122
    Epoch 30/200
    15/15 [==============================] - 0s 6ms/step - loss: 0.0605 - accuracy: 0.9825 - val_loss: 0.9559 - val_accuracy: 0.8223
    Epoch 31/200
    15/15 [==============================] - 0s 7ms/step - loss: 0.0775 - accuracy: 0.9781 - val_loss: 0.9911 - val_accuracy: 0.8274
    Epoch 32/200
    15/15 [==============================] - 0s 6ms/step - loss: 0.0886 - accuracy: 0.9716 - val_loss: 0.9290 - val_accuracy: 0.8325
    Epoch 33/200
    15/15 [==============================] - 0s 7ms/step - loss: 0.0729 - accuracy: 0.9803 - val_loss: 0.9835 - val_accuracy: 0.8274
    Epoch 34/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0556 - accuracy: 0.9847 - val_loss: 0.9801 - val_accuracy: 0.8325
    Epoch 35/200
    15/15 [==============================] - 0s 6ms/step - loss: 0.0586 - accuracy: 0.9825 - val_loss: 0.9715 - val_accuracy: 0.8173
    Epoch 36/200
    15/15 [==============================] - 0s 6ms/step - loss: 0.0566 - accuracy: 0.9803 - val_loss: 0.9987 - val_accuracy: 0.8325
    Epoch 37/200
    15/15 [==============================] - 0s 7ms/step - loss: 0.0683 - accuracy: 0.9781 - val_loss: 0.9747 - val_accuracy: 0.8173
    Epoch 38/200
    15/15 [==============================] - 0s 8ms/step - loss: 0.0949 - accuracy: 0.9716 - val_loss: 1.0825 - val_accuracy: 0.8274
    Epoch 39/200
    15/15 [==============================] - 0s 7ms/step - loss: 0.0746 - accuracy: 0.9672 - val_loss: 0.9609 - val_accuracy: 0.8325
    Epoch 40/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0505 - accuracy: 0.9847 - val_loss: 1.0257 - val_accuracy: 0.8223
    Epoch 41/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0544 - accuracy: 0.9847 - val_loss: 0.9983 - val_accuracy: 0.8376
    Epoch 42/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0465 - accuracy: 0.9847 - val_loss: 1.0335 - val_accuracy: 0.8223
    Epoch 43/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0485 - accuracy: 0.9847 - val_loss: 1.0488 - val_accuracy: 0.8223
    Epoch 44/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0454 - accuracy: 0.9847 - val_loss: 1.0534 - val_accuracy: 0.8376
    Epoch 45/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0509 - accuracy: 0.9803 - val_loss: 1.0727 - val_accuracy: 0.8274
    Epoch 46/200
    15/15 [==============================] - 0s 5ms/step - loss: 0.0494 - accuracy: 0.9847 - val_loss: 1.0572 - val_accuracy: 0.8274
    

    Avaliação do modelo

  • Histórico da evolução do modelo e sua Loss function
  • Acurácia do modelo nos dados de teste
  • Matriz de confusão
  • def plot_confusion_matrix(y_test, y_pred, labels, normalize=False):
        cm = confusion_matrix(y_test, y_pred)
    
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
    
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.gcf().set_size_inches(17, 11)
        plt.title('Confusion matrix')
        plt.colorbar()
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=45)
        plt.yticks(tick_marks, labels)
    
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()
    def evaluate_model(model, history, X_test, y_test):
        """Print the model's accuracy on the given data and plot its training curves.

        Args:
            model: Compiled model whose ``evaluate`` result holds the accuracy
                at position 1.
            history: Object returned by ``model.fit``; its ``history`` dict
                supplies the accuracy and loss curves.
            X_test: Evaluation inputs.
            y_test: Evaluation targets.
        """
        accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
        banner = '========================================='
        print(banner)
        print("|| Accuracy: %.2f%%" % (accuracy*100))
        print(banner)

        # One panel per metric, each showing the training and validation curves.
        fig, axs = plt.subplots(1, 2, figsize=(12,6))
        panels = (('accuracy', "Accuracy"), ('loss', "Model- Loss"))
        for ax, (metric, title) in zip(axs, panels):
            ax.plot(history.history[metric])
            ax.plot(history.history['val_' + metric])
            ax.set_title(title)
            ax.legend(['Training', 'Validation'])
        fig.tight_layout()
    # Report accuracy on the held-out split and plot the training history.
    evaluate_model(model, history, X_test, y_test)
    =========================================
    || Accuracy: 83.76%
    =========================================
    

    png

    OBS: Os valores numéricos das classes não correspondem mais aos apresentados no início do arquivo
    def find_class(probl):
        """Return the position of the largest score and that score as a percentage.

        Args:
            probl: 1-D NumPy array of per-class scores.

        Returns:
            A ``(label, percent)`` tuple: the index of the largest entry (the
            first one when several entries tie) and that entry times 100,
            rounded to one decimal place.
        """
        probl_max = probl.max()
        # argmax always yields exactly one index; looking the maximum up with
        # np.where returned several indices on ties, which int() rejects.
        label = int(np.argmax(probl))

        return label, round(probl_max*100, 1)
    
    # Per-class scores predicted for every held-out sample.
    prediction_proba = model.predict(X_test)

    # Collapse the score rows and the one-hot target rows to class positions.
    prediction_cat = [label for label, _ in map(find_class, prediction_proba)]
    y_test_label = [label for label, _ in map(find_class, y_test)]

    # Row-normalised confusion matrix, with the encoder's categories as tick labels.
    plot_confusion_matrix(y_test_label, prediction_cat, enc.categories_[0], True)
    Normalized confusion matrix
    

    png

    Área Sob a Curva ROC

    Geralmente, a sensibilidade e a especificidade são características difíceis de conciliar, isto é, é complicado aumentar
    a sensibilidade e a especificidade de um teste ao mesmo tempo. As curvas ROC (receiver operator characteristic curve)
    são uma forma de representar a relação, normalmente antagónica, entre a sensibilidade e a especificidade de um teste
    diagnóstico quantitativo, ao longo de um contínuo de valores de "cutoff point".

    # Macro-average ROC curve plus a plot of every ROC curve.
    # `fpr`, `tpr` and `roc_auc` (keyed by class position and by "micro") and
    # the line width `lw` are read here but not defined in this cell; they are
    # expected to exist already in the notebook namespace.

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(figsize=(12,8))
    # The legend names the two averages "micro" and "macro"; the earlier
    # "mínima"/"máxima" wording described them as a minimum and a maximum.
    plt.plot(
        fpr["micro"],
        tpr["micro"],
        label="Média micro da curva ROC (area = {0:0.2f})".format(roc_auc["micro"]),
        color="deeppink",
        linestyle=":",
        linewidth=4,
    )

    plt.plot(
        fpr["macro"],
        tpr["macro"],
        label="Média macro da curva ROC (area = {0:0.2f})".format(roc_auc["macro"]),
        color="navy",
        linestyle=":",
        linewidth=4,
    )

    # One curve per class, labelled by the class position used as key in `fpr`.
    for i in range(n_classes):
        plt.plot(
            fpr[i],
            tpr[i],
            lw=lw,
            label="Curva ROC da classe {0} (area = {1:0.2f})".format(i, roc_auc[i]),
        )

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("Taxa de Falso Positivo")
    plt.ylabel("Taxa de Verdadeiro Positivo")
    plt.title("Curva ROC por classes")
    plt.legend(loc="lower right")
    plt.show()

    png