Skip to content

Commit b239e94

Browse files
committed
Cleaning and splitting data into test and training
1 parent 3cd27b5 commit b239e94

File tree

1 file changed

+44
-0
lines changed

1 file changed

+44
-0
lines changed

CleaningDataCode/CleanAndSplitData.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/env python2
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Fri Oct 6 19:48:16 2017
5+
6+
@author: akashsingh
7+
"""
8+
9+
10+
#importing the libraries
11+
import numpy as np
12+
import pandas as pd
13+
import matplotlib.pyplot as plt
14+
15+
#importing the dataset
16+
# Load the training rows; the relative path is resolved against the working directory.
dataset = pd.read_csv(filepath_or_buffer='TrainingData.csv')
17+
# Load the answers file; the relative path is resolved against the working directory.
answer = pd.read_csv(filepath_or_buffer='Answer.csv')
18+
# Join the training rows with their answers on the shared 'Id' column.
final = dataset.merge(answer, on='Id')
19+
# Drop rows with no text. Boolean indexing returns a new frame, so the result
# must be assigned back or the filter has no effect. read_csv loads blank
# fields as NaN rather than '', so both cases are excluded here.
final = final[final.Text.notnull() & (final.Text != '')]
20+
21+
# Rows whose Category code is 0 (Slovak).
Slovak = final.loc[final['Category'] == 0]
22+
# Lower-case every column (cast to str first). apply() returns a new frame,
# so the result has to be assigned back to take effect.
Slovak = Slovak.apply(lambda x: x.astype(str).str.lower())
23+
# Rows whose Category code is 1 (French).
French = final.loc[final['Category'] == 1]
24+
# Lower-case every column (cast to str first). apply() returns a new frame,
# so the result has to be assigned back to take effect.
French = French.apply(lambda x: x.astype(str).str.lower())
25+
# Rows whose Category code is 2 (Spanish).
Spanish = final.loc[final['Category'] == 2]
26+
# Lower-case every column (cast to str first). apply() returns a new frame,
# so the result has to be assigned back to take effect.
Spanish = Spanish.apply(lambda x: x.astype(str).str.lower())
27+
# Rows whose Category code is 3 (German).
German = final.loc[final['Category'] == 3]
28+
# Lower-case every column (cast to str first). apply() returns a new frame,
# so the result has to be assigned back to take effect.
German = German.apply(lambda x: x.astype(str).str.lower())
29+
# Rows whose Category code is 4 (Polish).
Polish = final.loc[final['Category'] == 4]
30+
# Lower-case every column (cast to str first). apply() returns a new frame,
# so the result has to be assigned back to take effect.
Polish = Polish.apply(lambda x: x.astype(str).str.lower())
31+
# Write the Slovak rows to their own CSV. The first argument of to_csv is the
# output path; the original passed the DataFrame itself, which is not a path.
Slovak.to_csv('Slovak.csv', index=False, sep=',', encoding='utf-8')
32+
# Write the French rows to their own CSV. The first argument of to_csv is the
# output path; the original passed the DataFrame itself, which is not a path.
French.to_csv('French.csv', index=False, sep=',', encoding='utf-8')
33+
# Write the Spanish rows to their own CSV. The first argument of to_csv is the
# output path; the original passed the DataFrame itself, which is not a path.
Spanish.to_csv('Spanish.csv', index=False, sep=',', encoding='utf-8')
34+
# Write the German rows to their own CSV. The first argument of to_csv is the
# output path; the original passed the DataFrame itself, which is not a path.
German.to_csv('German.csv', index=False, sep=',', encoding='utf-8')
35+
# Write the Polish rows to their own CSV. The first argument of to_csv is the
# output path; the original passed the DataFrame itself, which is not a path.
Polish.to_csv('Polish.csv', index=False, sep=',', encoding='utf-8')
36+
37+
38+
39+
40+
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer model_selection and fall back only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
41+
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): this splits `dataset` and `answer` row-by-row rather than the
# Id-merged `final` -- confirm both files share the same row order and length.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    answer,
    test_size=0.20,
    random_state=1
)
42+
43+
44+

0 commit comments

Comments
 (0)