forked from luxzia/fraud_nlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscratch.py
110 lines (60 loc) · 1.43 KB
/
scratch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
import urllib2
import unicodedata
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
import numpy as np
# <codecell>
# Load the training JSON into a DataFrame. The path is machine-specific.
X = pd.read_json("/Users/Zipfian/Desktop/train.json")
# <codecell>
X.columns
# <codecell>
X.body_length.head(5)
# <codecell>
# First description in the frame, taken by position (`.ix` was removed from
# pandas; `.iloc` makes the positional intent explicit).
foo1 = X.description.iloc[0]
# <codecell>
len(foo1)
# <codecell>
X.acct_type.value_counts()
# <codecell>
# Binary target: 1 for 'premium' accounts, 0 for every other acct_type.
# Vectorized comparison instead of a per-row lambda; the result is the same.
X['binary_acct_type'] = (X['acct_type'] == 'premium').astype(int)
# <codecell>
X.acct_type.head()
# <codecell>
# <codecell>
X.binary_acct_type.value_counts()
# <codecell>
X.body_length.value_counts()
# <codecell>
# True when at least one row has a non-zero body_length. The whole expression
# is wrapped in print(...) so that it prints the boolean on both Python 2 and
# Python 3 (the unparenthesized form calls .any on None under Python 3).
print((X.body_length != 0).any())
# <codecell>
# Drop the rows whose body_length is zero.
X = X[X.body_length != 0]
# <codecell>
X.binary_acct_type.value_counts()
# <codecell>
# Split by class. NOTE: X_fraud holds every non-premium row, whatever its
# original acct_type value was.
X_premium = X[X.binary_acct_type == 1]
# <codecell>
X_fraud = X[X.binary_acct_type == 0]
# <codecell>
X_premium.shape
# <codecell>
# Undersample the premium rows so that both classes end up the same size.
#
# The sample size and the population size are derived from the frames instead
# of being hard-coded (the original used 1622 and 11887, presumably the row
# counts at the time the notebook was written -- confirm if an exact size
# matters). min() keeps the draw valid if the non-premium class is the larger.
n_keep = min(len(X_premium), len(X_fraud))
# Draw row *positions* without replacement. X_premium still carries the index
# labels of the unfiltered frame, so random integers are not valid labels and
# must be used with .iloc; drawing without replacement avoids duplicated rows.
to_choose = np.random.permutation(len(X_premium))[:n_keep]
print(to_choose)
X_premium_sample = X_premium.iloc[to_choose]
# <codecell>
# Stack the sampled premium rows on top of all non-premium rows.
# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
X_final_pent = pd.concat([X_premium_sample, X_fraud])
# <codecell>
# Keep only the target, the row identifier and the text column.
X_final = X_final_pent[['binary_acct_type', 'uid', 'description']]
# <codecell>
# Print the class label of every row in X_final, one per line.
# Iterating the column values directly avoids label lookups, which return a
# Series instead of a scalar whenever an index label occurs more than once.
for label in X_final['binary_acct_type']:
    print(label)
# <codecell>
# <codecell>