Skip to content

Commit 7dad5bf

Browse files
authored
Adding Python files via upload
1 parent d2b91a0 commit 7dad5bf

File tree

4 files changed

+226
-0
lines changed

4 files changed

+226
-0
lines changed
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# In[1]:
5+
6+
7+
# We will use the CountVectorizer from scikit-learn to convert the text into numeric vectors
8+
9+
10+
# In[2]:
11+
12+
13+
from sklearn.feature_extraction.text import CountVectorizer
14+
15+
16+
# In[3]:
17+
18+
19+
# Two short example sentences to turn into a bag-of-words representation.
input_sent = [
    'Demonstration of the BoW NLTK model',
    'This model builds numerical features for text input',
]
input_cv = CountVectorizer()
# fit_transform returns a SciPy sparse matrix. toarray() converts it to a
# plain numpy.ndarray, whereas todense() would produce the legacy
# numpy.matrix type.
features_text = input_cv.fit_transform(input_sent).toarray()
# vocabulary_ maps each learned token to its column index in features_text.
print(input_cv.vocabulary_)
23+
24+
25+
# In[4]:
26+
27+
28+
# This allows us to build feature vectors that can be used as input to machine learning algorithms.
29+
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# In[1]:
5+
6+
7+
from sklearn.datasets import fetch_20newsgroups
8+
from sklearn.naive_bayes import MultinomialNB
9+
from sklearn.feature_extraction.text import TfidfTransformer
10+
from sklearn.feature_extraction.text import CountVectorizer
11+
12+
13+
# In[2]:
14+
15+
16+
# Maps each newsgroup category name requested from the dataset to the
# human-readable label printed for a prediction.
dict_cat = {
    'talk.religion.misc': 'Religious Content',
    'rec.autos': 'Automobile and Transport',
    'rec.sport.hockey': 'Sport: Hockey',
    'sci.electronics': 'Content: Electronics',
    'sci.space': 'Content: Space',
}
17+
18+
19+
# In[3]:
20+
21+
22+
# Load the training split, restricted to the categories named in dict_cat.
# The keys are materialised into a list because `categories` is documented
# as an array-like of names and a dict view is not a sequence.
data_train = fetch_20newsgroups(
    subset='train',
    categories=list(dict_cat),
    shuffle=True,
    random_state=3,
)
23+
24+
25+
# In[4]:
26+
27+
28+
# Learn the vocabulary from the training documents and count how often each
# term occurs in each document.
cv_vector = CountVectorizer()
data_train_fit = cv_vector.fit_transform(data_train.data)
print("\nTraining Data Dimensions:", data_train_fit.shape)


# In[5]:


# Re-weight the raw term counts with TF-IDF. The fitted transformer is kept
# so the same weighting can be applied to new documents later.
tfidf_transformer = TfidfTransformer()
train_tfidf_transformer = tfidf_transformer.fit_transform(data_train_fit)
38+
39+
40+
# In[6]:
41+
42+
43+
# Unlabelled example sentences that are classified by the trained model.
sample_input_data = [
    'The Apollo Series were a bunch of space shuttles',
    'Islamism, Hinduism, Christianity, Sikhism are all major religions of the world',
    'It is a necessity to drive safely',
    'Gloves are made of rubber',
    'Gadgets like TV, Refrigerator and Grinders, all use electricity',
]
50+
51+
52+
# In[7]:
53+
54+
55+
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
input_classifier = MultinomialNB().fit(train_tfidf_transformer, data_train.target)
# Reuse the already-fitted vectorizer and transformer (transform, not
# fit_transform) so the samples are projected into the training feature space.
input_cv = cv_vector.transform(sample_input_data)
tfidf_input = tfidf_transformer.transform(input_cv)
predictions_sample = input_classifier.predict(tfidf_input)


# In[8]:


# Each prediction is an index into data_train.target_names; resolve it to the
# category name and then to its display label.
for inp, cat in zip(sample_input_data, predictions_sample):
    print('\nInput Data:', inp, '\n Category:', dict_cat[data_train.target_names[cat]])
66+
67+
68+
# In[ ]:
69+
70+
71+
72+
Loading
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# In[1]:
5+
6+
7+
# Importing the necessary packages
8+
import numpy as np
9+
import pandas as pd
10+
import re
11+
import nltk
12+
import matplotlib.pyplot as plt
13+
# Enable inline matplotlib figures when running under IPython/Jupyter.
# get_ipython() only exists there, so skip the magic in a plain interpreter
# instead of aborting the script with NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
14+
15+
16+
# In[2]:
17+
18+
19+
# Data source: the Tweets.csv file of the Twitter US Airline Sentiment data,
# read directly from its raw GitHub URL (requires network access).
import_data_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
sentiment_tweets = pd.read_csv(import_data_url)
22+
23+
24+
# In[3]:
25+
26+
27+
# Distribution of tweets across the airlines
sentiment_tweets['airline'].value_counts().plot(kind='pie', label='')


# In[4]:


# Share of each sentiment class among all tweets
sentiment_tweets['airline_sentiment'].value_counts().plot(
    kind='pie',
    autopct='%1.0f%%',
    colors=["brown", "orange", "blue"],
)


# In[5]:


# Number of tweets per airline, with one column per sentiment class
airline_grouped_sentiment = (
    sentiment_tweets
    .groupby(['airline', 'airline_sentiment'])['airline_sentiment']
    .count()
    .unstack()
)
airline_grouped_sentiment.plot(
    figsize=(8, 5),
    kind='bar',
    title='Individual Sentiments for Airlines',
    xlabel='Airline Company',
    ylabel='Sentiment Count',
)


# In[6]:


import seaborn as sns

# Sentiment confidence aggregated per sentiment class (barplot's default
# estimator is the mean).
sns.barplot(x='airline_sentiment', y='airline_sentiment_confidence', data=sentiment_tweets)
50+
51+
52+
# In[7]:
53+
54+
55+
# Cleaning of data: tweets may contain punctuation marks and other
# non-relevant characters, so those are stripped from the text before it is
# vectorized. The data is also split into a feature column and a label column.
# NOTE(review): columns are selected by position; presumably 10 is the tweet
# text and 1 is airline_sentiment -- confirm against the CSV header.
feature_set = sentiment_tweets.iloc[:, 10].values
label_set = sentiment_tweets.iloc[:, 1].values

# Compiled once, outside the loop.
_NON_WORD = re.compile(r'\W')
_INNER_SINGLE_LETTER = re.compile(r'\s+[a-zA-Z]\s+')
# Anchored with ^ (start of string). The previous pattern escaped the caret
# (\^) and so looked for a literal '^', which can never be present once all
# non-word characters have been replaced by spaces.
_LEADING_SINGLE_LETTER = re.compile(r'^[a-zA-Z]\s+')

cleaned_feature_set = []
for raw_phrase in feature_set:
    # 1. Replace special characters (*, etc.) with spaces, then drop
    #    single-letter words (e.g. "a") inside and at the start of the text.
    clean_feature = _NON_WORD.sub(' ', str(raw_phrase))
    clean_feature = _INNER_SINGLE_LETTER.sub(' ', clean_feature)
    clean_feature = _LEADING_SINGLE_LETTER.sub(' ', clean_feature)
    # 2. Convert the entire phrase to lower case
    cleaned_feature_set.append(clean_feature.lower())
68+
69+
# Changing the text to a numerical form: machine learning and statistical
# models operate on numbers, and the input here is textual, so the TF-IDF
# scheme is used to turn the cleaned tweets into numeric feature vectors.
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# The English stop-word list is a separate NLTK corpus. Fetch it on first use
# so a fresh environment does not stop with LookupError.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

# max_features caps the vocabulary size; min_df / max_df drop terms that occur
# in fewer than 6 documents or in more than 80% of them.
input_vector = TfidfVectorizer(
    max_features=3000,
    min_df=6,
    max_df=0.8,
    stop_words=english_stopwords,
)
# From here on cleaned_feature_set is a dense array of TF-IDF features rather
# than a list of strings.
cleaned_feature_set = input_vector.fit_transform(cleaned_feature_set).toarray()
75+
76+
77+
# In[8]:
78+
79+
80+
# Split the data into training and testing sets. Several models are fitted on
# the training set and compared; the chosen model is then run on the test set
# to obtain the final prediction score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(
    cleaned_feature_set, label_set, test_size=0.33, random_state=42
)

# NOTE(review): every *_score below is accuracy on the training split itself,
# which favours models that memorise the data. Consider cross-validation on
# X_train for model selection.

# Random Forest Classification
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(X_train, y_train)
rf_classifier_score = rf_classifier.score(X_train, y_train)

# Support Vector Machine Linear Classification
svc_classifier = SVC(kernel='linear')
svc_classifier.fit(X_train, y_train)
svc_classifier_score = svc_classifier.score(X_train, y_train)

# Logistic Regression, one-vs-rest. The multi_class='ovr' argument is
# deprecated in recent scikit-learn releases; wrapping the estimator in
# OneVsRestClassifier is the documented replacement for the same scheme.
lr_classifier = OneVsRestClassifier(
    LogisticRegression(random_state=0, solver='lbfgs')
).fit(X_train, y_train)
lr_classifier_score = lr_classifier.score(X_train, y_train)

# K-Nearest Neighbors Classification
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)
knn_classifier_score = knn_classifier.score(X_train, y_train)
103+
104+
# Comparison of individual accuracy scores (measured on the training data)
Used_ML_Models = [
    'Random Forest Classification',
    'Support Vector Machine Classification',
    'Logistic Regression',
    'KNN Classification',
]
# Kept in the same order as Used_ML_Models so rows line up.
accuracy_scores = [
    rf_classifier_score,
    svc_classifier_score,
    lr_classifier_score,
    knn_classifier_score,
]
score_comparisons = pd.DataFrame({
    'Classifiers': Used_ML_Models,
    'Accuracy on Training Data': accuracy_scores,
})
# Bare expression: rendered as a table when run as a notebook cell.
score_comparisons
115+
116+
117+
# In[9]:
118+
119+
120+
# The Random Forest classifier is taken as the best model from the comparison
# table (which is based on training accuracy), so it is used for the final
# prediction on the held-out test split.
final_pred = rf_classifier.predict(X_test)
# Accuracy score of the final prediction
print(accuracy_score(y_test, final_pred))
125+

0 commit comments

Comments
 (0)