-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdata_process.py
78 lines (61 loc) · 2.3 KB
/
data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow import set_random_seed
import os
import nltk, re, string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Fix the NumPy and TensorFlow random seeds at import time so that runs are repeatable.
np.random.seed(1)
set_random_seed(1)
# Input data files are read from the "./data" directory (relative to the current working directory).
# Listing it here shows which files are available; this raises if the directory does not exist.
print(os.listdir("./data"))
# Ordered (pattern, replacement) rules applied by clean_text.
# Order matters: contractions must be expanded before the bare apostrophe is
# dropped, and the whitespace collapse must run last.
_CLEAN_TEXT_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Replace every character outside the whitelist with a space.
        # The original class ended in the range "+-=", which silently also
        # kept ":", ";" and "<"; the same set is spelled out explicitly here.
        (r"[^A-Za-z0-9^,!.\/'+\-=:;<]", " "),
        # Expand common English contractions.
        (r"what's", "what is "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
        # Drop or pad the remaining punctuation.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"-", " - "),
        (r"=", " = "),
        (r"'", " "),
        # "5k" -> "5000".
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        # Re-join abbreviations that the punctuation rules split apart.
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # "1990s" -> "1990". The original pattern r"\0s" was an octal escape
        # for NUL + "s" and could never match.
        (r"0s\b", "0"),
        # Keep the surrounding spaces so neighbouring words are not glued
        # together (the original produced e.g. "after911attack").
        (r" 9 11 ", " 911 "),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Collapse runs of whitespace into a single space.
        (r"\s{2,}", " "),
    )
)


def clean_text(text):
    """Normalize a raw English text string for downstream tokenization.

    The text is lower-cased, characters outside a small whitelist are
    replaced by spaces, common contractions are expanded, selected
    punctuation is removed or padded with spaces, and runs of whitespace
    are collapsed. Leading/trailing single spaces are not stripped.

    Stop-word removal and stemming were disabled in the original
    implementation and are not applied here.

    Args:
        text: The input string.

    Returns:
        The cleaned string.

    Side effects:
        Prints the raw input, the cleaned output and an empty line to
        stdout (debug trace kept from the original implementation).
    """
    print(text)

    # NOTE: the original called text.translate(string.punctuation) here.
    # A plain string is not a valid translation table: it left printable
    # characters untouched and mapped control characters to punctuation
    # ("\n" -> "+", "\t" -> "*"). The whitelist rule below already turns
    # such characters into spaces, so the call is dropped.
    text = text.lower()

    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = pattern.sub(replacement, text)

    print(text)
    print("")
    return text