Sentiment-Analysis-Chinese-pytorch/Sentiment_Analysis_DataProcess.py at master · SoulDGXu/Sentiment-Analysis-Chinese-pytorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
# !usr/bin/env python
# -*- coding:utf-8 -*-

"""
@Author: HsuDan
@Date: 2022-02-18 19:12:58
@Version: 1.0
@LastEditors: HsuDan
@LastEditTime: 2022-02-25 11:18:01
@Description: DataProcess
@FilePath: /Sentiment-Analysis-Chinese-pytorch/Sentiment_Analysis_DataProcess.py
"""

from __future__ import unicode_literals, print_function, division
from io import open
import torch
import re
import pandas as pd
import numpy as np
import gensim
from torch.utils.data import Dataset
from Sentiment_Analysis_Config import Config


class Data_set(Dataset):
    def __init__(self, Data, Label):
        self.Data = Data
        if Label is not None:  # 考虑对测试集的使用
            self.Label = Label

    def __len__(self):
        return len(self.Data)

    def __getitem__(self, index):
        if self.Label is not None:
            data = torch.from_numpy(self.Data[index])
            label = torch.from_numpy(self.Label[index])
            return data, label
        else:
            data = torch.from_numpy(self.Data[index])
            return data


def data_preview(file_path):
    """
    @description: preview the original data set: size, description info, etc.
    @param {*}
    - file_path: str, given the data file path.
    @return {*}
    - df: DataFrame, data in DataFrame format
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = []
        for line in f.readlines():
            sall = line.strip().split()
            data.append([int(sall[0]), sall[1:]])
    df = pd.DataFrame(data, columns=["label", "text"])
    print(
        "\n"
        + "*" * 20
        + " Original Data Description ("
        + file_path.split("/")[-1]
        + ")"
        + "*" * 20
        + "\n"
    )
    print("data length: ", len(data))
    print("data preview:\n", df.head())
    print("count label:\n", df["label"].value_counts())
    return df


def stopwordslist():  # 创建停用词表
    stopwords = [
        line.strip()
        for line in open(Config.stopword_path, encoding="UTF-8").readlines()
    ]
    return stopwords


def build_word2id(file):
    """
    @description: build the dict of 'word2id'
    @param {*}
    - file: str, word2id保存地址
    @return {*}
    返回word2id的字典
    """
    stopwords = stopwordslist()
    word2id = {"_PAD_": 0}
    path = [Config.train_path, Config.val_path]

    for _path in path:
        with open(_path, encoding="utf-8") as f:
            for line in f.readlines():
                out_list = []
                # 去停用词
                sp = line.strip().split()
                for word in sp[1:]:
                    if word not in stopwords:
                        rt = re.findall("[a-zA-Z]+", word)
                        if word != "\t":
                            # 去除长度为1的word
                            if len(rt) == 1:
                                continue
                            else:
                                out_list.append(word)
                for word in out_list:
                    if word not in word2id.keys():
                        word2id[word] = len(word2id)
    with open(file, "w", encoding="utf-8") as f:
        for w in word2id:
            f.write(w + "\t")
            f.write(str(word2id[w]))
            f.write("\n")
    return word2id


def build_id2word(word2id):
    """
    @description: 得到id2word
    @param {*}
    - word2id: dict, 字典word2id
    @return {*}
    - id2word: dict， 字典id2word
    """
    id2word = {}
    for key, val in word2id.items():
        id2word[val] = key
    return id2word


def build_word2vec(fname, word2id, save_to_path=None):
    """
    @description: 返回语料文本中词汇集对应的word2vec向量
    @param {*}
    - fname: str, 预训练的word2vec.
    - word2id: dict, 语料文本中包含的词汇集.
    - save_to_path: str, 保存训练语料库中的词组对应的word2vec到本地.
    @return {*}
    语料文本中词汇集对应的word2vec向量{id: word2vec}.
    """

    n_words = max(word2id.values()) + 1
    model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
    word_vecs = np.array(np.random.uniform(-1.0, 1.0, [n_words, model.vector_size]))
    for word in word2id.keys():
        try:
            word_vecs[word2id[word]] = model[word]
        except KeyError:
            pass
    if save_to_path:
        with open(save_to_path, "w", encoding="utf-8") as f:
            for vec in word_vecs:
                vec = [str(w) for w in vec]
                f.write(" ".join(vec))
                f.write("\n")
    return word_vecs


def text_to_array(word2id, seq_lenth, path):
    """
    @description: 文本转为索引数字模式-将原始文本（包括标签和文本）里的每个词转为word2id对应的索引数字，并以数组返回.
    @param {*}
    - word2id: dict, 语料文本中包含的词汇集.
    - seq_length: int, 序列的限定长度.
    - path: str, 待处理的原始文本数据集.
    @return {*}
    返回原始文本转化索引数字数组后的数据集(array), 标签集(list).
    """

    label_array = []
    i = 0
    sa = []
    # 获取句子个数
    with open(path, encoding="utf-8") as f1:
        for l1 in f1.readlines():
            s = l1.strip().split()
            s1 = s[1:]
            new_s = [word2id.get(word, 0) for word in s1]  # 单词转索引数字
            sa.append(new_s)
        # print(len(sa))

    with open(path, encoding="utf-8") as f:
        sentences_array = np.zeros(shape=(len(sa), seq_lenth))  # 行：句子个数 列：句子长度
        for line in f.readlines():
            sl1 = line.strip().split()
            sen = sl1[1:]
            new_sen = [word2id.get(word, 0) for word in sen]  # 单词转索引数字,不存在则为0
            new_sen_np = np.array(new_sen).reshape(1, -1)

            # 补齐每个句子长度，多余补零，少了就直接赋值,0填在前面。
            if np.size(new_sen_np, 1) < seq_lenth:
                sentences_array[i, seq_lenth - np.size(new_sen_np, 1) :] = new_sen_np[
                    0, :
                ]
            else:
                sentences_array[i, 0:seq_lenth] = new_sen_np[0, 0:seq_lenth]

            i = i + 1
            label = int(sl1[0])  # 标签
            label_array.append(label)
    return np.array(sentences_array), label_array


def text_to_array_nolabel(word2id, seq_lenth, path):
    """
    @description: 文本转为索引数字模式-将原始文本（仅包括文本）里的每个词转为word2id对应的索引数字，并以数组返回.
    @param {*}
    - word2id: dict, 语料文本中包含的词汇集.
    - seq_length: int, 序列的限定长度.
    - path: str, 待处理的原始文本数据集.
    @return {*}
    原始文本转化索引数字数组后的数据集(array).
    """

    i = 0
    sa = []
    # 获取句子个数
    with open(path, encoding="utf-8") as f1:
        for l1 in f1.readlines():
            s = l1.strip().split()
            s1 = s[0:]  # s1=s[1:]
            new_s = [word2id.get(word, 0) for word in s1]  # 单词转索引数字
            sa.append(new_s)
    with open(path, encoding="utf-8") as f:
        sentences_array = np.zeros(shape=(len(sa), seq_lenth))  # 行：句子个数 列：句子长度
        for line in f.readlines():
            sl1 = line.strip().split()
            sen = sl1[0:]  # sen=sl1[1:]
            new_sen = [word2id.get(word, 0) for word in sen]  # 单词转索引数字,不存在则为0
            new_sen_np = np.array(new_sen).reshape(1, -1)
            if np.size(new_sen_np, 1) < seq_lenth:
                sentences_array[i, seq_lenth - np.size(new_sen_np, 1) :] = new_sen_np[
                    0, :
                ]
            else:
                sentences_array[i, 0:seq_lenth] = new_sen_np[0, 0:seq_lenth]
            i = i + 1
    return np.array(sentences_array)


def to_categorical(y, num_classes=None):
    """
    @description: 将类别转化为one-hot编码
    @param {*}
    - y: list, 类别特征的列表
    - num_class: int, 类别个数
    @return {*}
    返回one-hot编码数组,shape:（len(y), num_classes）
    """
    y = np.array(y, dtype="int")
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes))
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    return categorical


def prepare_data(w2id, train_path, val_path, test_path, seq_lenth):
    """
    @description: 得到数字索引表示的句子和标签
    @param {*}
    - w2id: dict, 语料文本中包含的词汇集.
    - train_path: str, 训练数据集所在路径
    - val_path: str, 验证数据集所在路径
    - test_path: str, 测试数据集所在路径
    - seq_lenth: int, 序列的固定长度
    @return {*}
    - train_array: array, 训练集文本数组, shape:(len(train), seq_len)
    - train_label: array, 训练集标签数组, shape:(len(train), 1)
    - val_array: array, 验证集文本数组, shape:(len(val), seq_len)
    - val_label: array, 验证集标签数组, shape:(len(val), 1)
    - test_array: array, 测试集文本数组, shape:(len(test), seq_len)
    - test_label: array, 测试集标签数组, shape:(len(test), 1)
    """

    train_array, train_label = text_to_array(w2id, seq_lenth=seq_lenth, path=train_path)
    val_array, val_label = text_to_array(w2id, seq_lenth=seq_lenth, path=val_path)
    test_array, test_label = text_to_array(w2id, seq_lenth=seq_lenth, path=test_path)

    # 标签为[1, 1, 1, 1, 1, 1, 1, 1, 0, 0...]将标签转为onehot
    # train_label=to_categorical(train_label,num_classes=2)
    # val_label=to_categorical(val_label,num_classes=2)
    # print(train_lab,"\nval\n",val_lab)
    # """转换后标签
    #         [[0. 1.]
    #         [0. 1.]
    #         [0. 1.]
    #         ...
    #         [1. 0.]
    #         [1. 0.]
    #         [1. 0.]]"""

    train_label = np.array([train_label]).T
    val_label = np.array([val_label]).T
    test_label = np.array([test_label]).T

    return train_array, train_label, val_array, val_label, test_array, test_label


if __name__ == "__main__":
    # preview data
    train_df = data_preview(Config.train_path)
    test_df = data_preview(Config.test_path)
    val_df = data_preview(Config.val_path)

    # 建立word2id
    word2id = build_word2id(Config.word2id_path)

    # 建立id2word
    id2word = build_id2word(word2id)

    # 建立word2vec
    w2vec = build_word2vec(
        Config.pre_word2vec_path, word2id, Config.corpus_word2vec_path
    )

    # 得到句子id表示和标签
    (
        train_array,
        train_label,
        val_array,
        val_label,
        test_array,
        test_label,
    ) = prepare_data(
        word2id,
        train_path=Config.train_path,
        val_path=Config.val_path,
        test_path=Config.test_path,
        seq_lenth=Config.max_sen_len,
    )

    np.savetxt("./word2vec/train_data.txt", train_array, fmt="%d")
    np.savetxt("./word2vec/val_data.txt", val_array, fmt="%d")
    np.savetxt("./word2vec/test_data.txt", test_array, fmt="%d")
    np.savetxt("./word2vec/train_label.txt", train_label, fmt="%d")
    np.savetxt("./word2vec/val_label.txt", val_label, fmt="%d")
    np.savetxt("./word2vec/test_label.txt", test_label, fmt="%d")
    print("Data processing is complete.")