|
| 1 | +#!/usr/bin/env python |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | +# In[1]: |
| 5 | + |
| 6 | + |
| 7 | +# Importing the necessary packages |
| 8 | +import numpy as np |
| 9 | +import pandas as pd |
| 10 | +import re |
| 11 | +import nltk |
| 12 | +import matplotlib.pyplot as plt |
# Enable inline figure rendering when the script runs inside IPython/Jupyter.
# `get_ipython` is only injected by an IPython shell, so under a plain
# `python` interpreter the name does not exist; skip the magic in that case
# instead of aborting the whole script with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
| 14 | + |
| 15 | + |
| 16 | +# In[2]: |
| 17 | + |
| 18 | + |
# The tweets are read straight from a CSV hosted on GitHub:
# https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv
# pandas fetches the URL itself, so this cell needs network access.
import_data_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
sentiment_tweets = pd.read_csv(filepath_or_buffer=import_data_url)
| 22 | + |
| 23 | + |
| 24 | +# In[3]: |
| 25 | + |
| 26 | + |
# Share of tweets per airline, drawn as a pie chart.
# An empty `label` is passed through to the plot call (presumably to keep the
# default axis label off the figure).
(
    sentiment_tweets['airline']
    .value_counts()
    .plot(kind='pie', label='')
)
| 29 | + |
| 30 | + |
| 31 | +# In[4]: |
| 32 | + |
| 33 | + |
# Share of each sentiment class across all tweets, as a pie chart annotated
# with whole-number percentages. `value_counts` orders the classes by
# descending frequency and the three colours are applied in that same order.
(
    sentiment_tweets['airline_sentiment']
    .value_counts()
    .plot(
        kind='pie',
        autopct='%1.0f%%',
        colors=["brown", "orange", "blue"],
    )
)
| 36 | + |
| 37 | + |
| 38 | +# In[5]: |
| 39 | + |
| 40 | + |
# Count tweets for every (airline, sentiment) pair, then pivot the sentiment
# level into columns so that each airline becomes one row with one count per
# sentiment class.
airline_grouped_sentiment = (
    sentiment_tweets
    .groupby(['airline', 'airline_sentiment'])['airline_sentiment']
    .count()
    .unstack()
)

# One group of bars per airline, one bar per sentiment class.
airline_grouped_sentiment.plot(
    kind='bar',
    figsize=(8, 5),
    title='Individual Sentiments for Airlines',
    xlabel='Airline Company',
    ylabel='Sentiment Count',
)
| 43 | + |
| 44 | + |
| 45 | +# In[6]: |
| 46 | + |
| 47 | + |
# Compare the labelling confidence across sentiment classes: one bar per
# sentiment, with the bar height computed by seaborn's default estimator over
# the `airline_sentiment_confidence` column.
import seaborn as sns
sns.barplot(
    data=sentiment_tweets,
    x='airline_sentiment',
    y='airline_sentiment_confidence',
)
| 50 | + |
| 51 | + |
| 52 | +# In[7]: |
| 53 | + |
| 54 | + |
# Cleaning of data: tweets contain punctuation and other characters that carry
# no signal for the model, so they are stripped before vectorisation.
# The feature (tweet text) and label (sentiment) columns are split off first.
# NOTE(review): both columns are selected by position; presumably column 10 is
# the tweet text and column 1 is `airline_sentiment` -- confirm against the
# CSV header before relying on this with a different export of the data.
feature_set = sentiment_tweets.iloc[:, 10].values
label_set = sentiment_tweets.iloc[:, 1].values

# Patterns are compiled once instead of being re-resolved for every tweet.
_NON_WORD_CHAR = re.compile(r'\W')
_INNER_SINGLE_LETTER = re.compile(r'\s+[a-zA-Z]\s+')
# `^` must be the start-of-string anchor here. The previous pattern escaped it
# (`\^`), which looks for a literal caret that the `\W` pass has already
# removed, so a single letter at the start of a tweet was never stripped.
_LEADING_SINGLE_LETTER = re.compile(r'^[a-zA-Z]\s+')


def _clean_tweet(raw_text):
    """Return *raw_text* lower-cased with punctuation and stray letters removed.

    Steps, in order:
      1. every non-word character (punctuation, symbols, ...) becomes a space;
      2. a single letter surrounded by whitespace is dropped;
      3. a single letter at the very start of the text is dropped;
      4. the result is lower-cased.

    The value is passed through ``str`` first so that non-string cells
    (for example NaN) do not raise.
    """
    text = _NON_WORD_CHAR.sub(' ', str(raw_text))
    text = _INNER_SINGLE_LETTER.sub(' ', text)
    text = _LEADING_SINGLE_LETTER.sub(' ', text)
    return text.lower()


cleaned_feature_set = [_clean_tweet(tweet) for tweet in feature_set]

# Changing the text to a numerical form: the models below operate on numbers,
# so the cleaned tweets are converted to TF-IDF weighted term vectors.
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# The stop-word list ships as a separate NLTK corpus; fetch it on first use so
# a fresh environment does not fail with a LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Keep at most 3000 terms; ignore terms that occur in fewer than 6 tweets or
# in more than 80% of them, as well as English stop words.
input_vector = TfidfVectorizer(
    max_features=3000,
    min_df=6,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)
# `fit_transform` returns a sparse matrix; it is converted to a dense array
# and replaces the list of cleaned strings under the same name.
cleaned_feature_set = input_vector.fit_transform(cleaned_feature_set).toarray()
| 75 | + |
| 76 | + |
| 77 | +# In[8]: |
| 78 | + |
| 79 | + |
# Split the TF-IDF features into a training part (67%) and a held-out test
# part (33%). The training part is used both to compare the candidate models
# and to fit them; the test part is touched only once, for the final score.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    cleaned_feature_set, label_set, test_size=0.33, random_state=42)

# Candidate models. `Used_ML_Models` and `candidate_classifiers` are kept in
# the same order so that a position in one identifies the entry in the other.
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
svc_classifier = SVC(kernel='linear')
lr_classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
knn_classifier = KNeighborsClassifier(n_neighbors=5)
Used_ML_Models = ['Random Forest Classification', 'Support Vector Machine Classification',
                  'Logistic Regression', 'KNN Classification']
candidate_classifiers = [rf_classifier, svc_classifier, lr_classifier, knn_classifier]

# Comparison of individual accuracy scores.
# Scoring a model on the very rows it was fitted on rewards memorisation (a
# random forest scores close to 100% that way) and says little about unseen
# tweets. Each candidate is therefore scored with k-fold cross-validation on
# the training split: every fold is predicted by a clone that never saw it.
CV_FOLDS = 5
accuracy_scores = [
    cross_val_score(classifier, X_train, y_train, cv=CV_FOLDS, scoring='accuracy').mean()
    for classifier in candidate_classifiers
]
(rf_classifier_score, svc_classifier_score,
 lr_classifier_score, knn_classifier_score) = accuracy_scores

# `cross_val_score` fits clones only, so fit the named estimators themselves
# on the full training split; they remain usable for prediction afterwards.
for classifier in candidate_classifiers:
    classifier.fit(X_train, y_train)

score_comparisons = pd.DataFrame(Used_ML_Models, columns=['Classifiers'])
score_comparisons['Cross-Validated Accuracy on Training Data'] = accuracy_scores
score_comparisons


# In[9]:


# Final prediction with whichever model ranked first in the table above,
# rather than a hard-coded choice.
best_model_index = int(np.argmax(accuracy_scores))
best_classifier = candidate_classifiers[best_model_index]
print(Used_ML_Models[best_model_index])
final_pred = best_classifier.predict(X_test)
# Accuracy of the selected model on the held-out test split.
print(accuracy_score(y_test, final_pred))
| 125 | + |
0 commit comments