Skip to content

Commit 3ac9e24

Browse files
committed
add preprocess
1 parent 89ecd63 commit 3ac9e24

File tree

1 file changed

+305
-15
lines changed

1 file changed

+305
-15
lines changed

6.CHATBOT/6.3.preprocess.ipynb

Lines changed: 305 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": null,
5+
"execution_count": 10,
66
"metadata": {},
77
"outputs": [],
88
"source": [
@@ -11,17 +11,17 @@
1111
},
1212
{
1313
"cell_type": "code",
14-
"execution_count": null,
14+
"execution_count": 11,
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
18-
"PATH = 'data_in/ChatBotData.csv'\n",
18+
"PATH = 'data_in/ChatBotData.csv_short'\n",
1919
"VOCAB_PATH = 'data_in/vocabulary.txt'"
2020
]
2121
},
2222
{
2323
"cell_type": "code",
24-
"execution_count": null,
24+
"execution_count": 12,
2525
"metadata": {},
2626
"outputs": [],
2727
"source": [
@@ -30,18 +30,37 @@
3030
},
3131
{
3232
"cell_type": "code",
33-
"execution_count": null,
33+
"execution_count": 13,
3434
"metadata": {},
35-
"outputs": [],
35+
"outputs": [
36+
{
37+
"name": "stderr",
38+
"output_type": "stream",
39+
"text": [
40+
"100%|██████████| 20/20 [00:00<00:00, 457.99it/s]\n",
41+
"100%|██████████| 20/20 [00:00<00:00, 256.02it/s]\n"
42+
]
43+
}
44+
],
3645
"source": [
3746
"char2idx, idx2char, vocab_size = load_vocabulary(PATH, VOCAB_PATH, tokenize_as_morph=True)"
3847
]
3948
},
4049
{
4150
"cell_type": "code",
42-
"execution_count": null,
51+
"execution_count": 14,
4352
"metadata": {},
44-
"outputs": [],
53+
"outputs": [
54+
{
55+
"name": "stderr",
56+
"output_type": "stream",
57+
"text": [
58+
"100%|██████████| 20/20 [00:00<00:00, 462.61it/s]\n",
59+
"100%|██████████| 20/20 [00:00<00:00, 247.84it/s]\n",
60+
"100%|██████████| 20/20 [00:00<00:00, 386.92it/s]\n"
61+
]
62+
}
63+
],
4564
"source": [
4665
"index_inputs, input_seq_len = enc_processing(inputs, char2idx, tokenize_as_morph=True)\n",
4766
"index_outputs, output_seq_len = dec_output_processing(outputs, char2idx, tokenize_as_morph=True)\n",
@@ -50,7 +69,7 @@
5069
},
5170
{
5271
"cell_type": "code",
53-
"execution_count": null,
72+
"execution_count": 15,
5473
"metadata": {},
5574
"outputs": [],
5675
"source": [
@@ -66,7 +85,7 @@
6685
},
6786
{
6887
"cell_type": "code",
69-
"execution_count": null,
88+
"execution_count": 16,
7089
"metadata": {},
7190
"outputs": [],
7291
"source": [
@@ -85,21 +104,292 @@
85104
},
86105
{
87106
"cell_type": "code",
88-
"execution_count": null,
107+
"execution_count": 17,
89108
"metadata": {},
90-
"outputs": [],
109+
"outputs": [
110+
{
111+
"data": {
112+
"text/plain": [
113+
"{'<PAD>': 0,\n",
114+
" '<SOS>': 1,\n",
115+
" '<END>': 2,\n",
116+
" '<UNK>': 3,\n",
117+
" '승진': 4,\n",
118+
" '불': 5,\n",
119+
" '설득': 6,\n",
120+
" '절약': 7,\n",
121+
" '좋을까': 8,\n",
122+
" '마음': 9,\n",
123+
" '가상': 10,\n",
124+
" '출발': 11,\n",
125+
" '에는': 12,\n",
126+
" '선물': 13,\n",
127+
" '따라': 14,\n",
128+
" '사세요': 15,\n",
129+
" '그럴거예요': 16,\n",
130+
" '줄까': 17,\n",
131+
" '따뜻하게': 18,\n",
132+
" '다음': 19,\n",
133+
" '남자친구': 20,\n",
134+
" '달': 21,\n",
135+
" '구': 22,\n",
136+
" '훈훈해': 23,\n",
137+
" '믿어줘': 24,\n",
138+
" '가스': 25,\n",
139+
" '까지': 26,\n",
140+
" '딱': 27,\n",
141+
" '인데': 28,\n",
142+
" '빨리': 29,\n",
143+
" '들어올거예요': 30,\n",
144+
" '해보세요': 31,\n",
145+
" '가만있어도': 32,\n",
146+
" '자의': 33,\n",
147+
" '운동': 34,\n",
148+
" '거짓말': 35,\n",
149+
" '열': 36,\n",
150+
" '갔어': 37,\n",
151+
" '보인다': 38,\n",
152+
" '또': 39,\n",
153+
" '생일': 40,\n",
154+
" '땀': 41,\n",
155+
" '데려가고싶어': 42,\n",
156+
" '비비': 43,\n",
157+
" '좋을것': 44,\n",
158+
" '착하지': 45,\n",
159+
" '비': 46,\n",
160+
" '더': 47,\n",
161+
" '로': 48,\n",
162+
" '필요했던게': 49,\n",
163+
" '궁금해': 50,\n",
164+
" '돈': 51,\n",
165+
" '가끔': 52,\n",
166+
" '즐기세요': 53,\n",
167+
" '오늘': 54,\n",
168+
" '가': 55,\n",
169+
" '감기': 56,\n",
170+
" '하셨나요': 57,\n",
171+
" '어서': 58,\n",
172+
" '너': 59,\n",
173+
" '돌아가서': 60,\n",
174+
" '나갔어': 61,\n",
175+
" '좋아요': 62,\n",
176+
" '하는지': 63,\n",
177+
" '같아요': 64,\n",
178+
" '식혀주세요': 65,\n",
179+
" '가난한': 66,\n",
180+
" '적당히하세요': 67,\n",
181+
" '많이': 68,\n",
182+
" '켜놓고': 69,\n",
183+
" '나온거': 70,\n",
184+
" '집': 71,\n",
185+
" '너무': 72,\n",
186+
" '나왔다': 73,\n",
187+
" '바빠': 74,\n",
188+
" '를': 75,\n",
189+
" '뭐': 76,\n",
190+
" '함께': 77,\n",
191+
" '은': 78,\n",
192+
" '평소': 79,\n",
193+
" '동': 80,\n",
194+
" '무운': 81,\n",
195+
" '다': 82,\n",
196+
" '난': 83,\n",
197+
" '해봐요': 84,\n",
198+
" '때': 85,\n",
199+
" '좀질': 86,\n",
200+
" '사람': 87,\n",
201+
" '하세요': 88,\n",
202+
" '나라': 89,\n",
203+
" '잊고': 90,\n",
204+
" '잘생겼어': 91,\n",
205+
" '가나안': 92,\n",
206+
" '그': 93,\n",
207+
" '같아': 94,\n",
208+
" '켜고': 95,\n",
209+
" '필요한것': 96,\n",
210+
" '빠를수록': 97,\n",
211+
" '설움': 98,\n",
212+
" '도': 99,\n",
213+
" '결단': 100,\n",
214+
" '뭘': 101,\n",
215+
" '을': 102,\n",
216+
" '화폐': 103,\n",
217+
" '생각': 104,\n",
218+
" '다시': 105,\n",
219+
" '새': 106,\n",
220+
" '전생': 107,\n",
221+
" '만해': 108,\n",
222+
" '좋다': 109,\n",
223+
" '인게': 110,\n",
224+
" '무집': 111,\n",
225+
" '에': 112,\n",
226+
" '쫄': 113,\n",
227+
" '교회': 114,\n",
228+
" '마세요': 115,\n",
229+
" '싼데': 116,\n",
230+
" '린다': 117,\n",
231+
" '혼자': 118,\n",
232+
" '걸리겠어': 119,\n",
233+
" '망함': 120,\n",
234+
" '끄고나오세요': 121}"
235+
]
236+
},
237+
"execution_count": 17,
238+
"metadata": {},
239+
"output_type": "execute_result"
240+
}
241+
],
91242
"source": [
92243
"char2idx"
93244
]
94245
},
95246
{
96247
"cell_type": "code",
97-
"execution_count": null,
248+
"execution_count": 18,
98249
"metadata": {},
99-
"outputs": [],
250+
"outputs": [
251+
{
252+
"data": {
253+
"text/plain": [
254+
"{0: '<PAD>',\n",
255+
" 1: '<SOS>',\n",
256+
" 2: '<END>',\n",
257+
" 3: '<UNK>',\n",
258+
" 4: '승진',\n",
259+
" 5: '불',\n",
260+
" 6: '설득',\n",
261+
" 7: '절약',\n",
262+
" 8: '좋을까',\n",
263+
" 9: '마음',\n",
264+
" 10: '가상',\n",
265+
" 11: '출발',\n",
266+
" 12: '에는',\n",
267+
" 13: '선물',\n",
268+
" 14: '따라',\n",
269+
" 15: '사세요',\n",
270+
" 16: '그럴거예요',\n",
271+
" 17: '줄까',\n",
272+
" 18: '따뜻하게',\n",
273+
" 19: '다음',\n",
274+
" 20: '남자친구',\n",
275+
" 21: '달',\n",
276+
" 22: '구',\n",
277+
" 23: '훈훈해',\n",
278+
" 24: '믿어줘',\n",
279+
" 25: '가스',\n",
280+
" 26: '까지',\n",
281+
" 27: '딱',\n",
282+
" 28: '인데',\n",
283+
" 29: '빨리',\n",
284+
" 30: '들어올거예요',\n",
285+
" 31: '해보세요',\n",
286+
" 32: '가만있어도',\n",
287+
" 33: '자의',\n",
288+
" 34: '운동',\n",
289+
" 35: '거짓말',\n",
290+
" 36: '열',\n",
291+
" 37: '갔어',\n",
292+
" 38: '보인다',\n",
293+
" 39: '또',\n",
294+
" 40: '생일',\n",
295+
" 41: '땀',\n",
296+
" 42: '데려가고싶어',\n",
297+
" 43: '비비',\n",
298+
" 44: '좋을것',\n",
299+
" 45: '착하지',\n",
300+
" 46: '비',\n",
301+
" 47: '더',\n",
302+
" 48: '로',\n",
303+
" 49: '필요했던게',\n",
304+
" 50: '궁금해',\n",
305+
" 51: '돈',\n",
306+
" 52: '가끔',\n",
307+
" 53: '즐기세요',\n",
308+
" 54: '오늘',\n",
309+
" 55: '가',\n",
310+
" 56: '감기',\n",
311+
" 57: '하셨나요',\n",
312+
" 58: '어서',\n",
313+
" 59: '너',\n",
314+
" 60: '돌아가서',\n",
315+
" 61: '나갔어',\n",
316+
" 62: '좋아요',\n",
317+
" 63: '하는지',\n",
318+
" 64: '같아요',\n",
319+
" 65: '식혀주세요',\n",
320+
" 66: '가난한',\n",
321+
" 67: '적당히하세요',\n",
322+
" 68: '많이',\n",
323+
" 69: '켜놓고',\n",
324+
" 70: '나온거',\n",
325+
" 71: '집',\n",
326+
" 72: '너무',\n",
327+
" 73: '나왔다',\n",
328+
" 74: '바빠',\n",
329+
" 75: '를',\n",
330+
" 76: '뭐',\n",
331+
" 77: '함께',\n",
332+
" 78: '은',\n",
333+
" 79: '평소',\n",
334+
" 80: '동',\n",
335+
" 81: '무운',\n",
336+
" 82: '다',\n",
337+
" 83: '난',\n",
338+
" 84: '해봐요',\n",
339+
" 85: '때',\n",
340+
" 86: '좀질',\n",
341+
" 87: '사람',\n",
342+
" 88: '하세요',\n",
343+
" 89: '나라',\n",
344+
" 90: '잊고',\n",
345+
" 91: '잘생겼어',\n",
346+
" 92: '가나안',\n",
347+
" 93: '그',\n",
348+
" 94: '같아',\n",
349+
" 95: '켜고',\n",
350+
" 96: '필요한것',\n",
351+
" 97: '빠를수록',\n",
352+
" 98: '설움',\n",
353+
" 99: '도',\n",
354+
" 100: '결단',\n",
355+
" 101: '뭘',\n",
356+
" 102: '을',\n",
357+
" 103: '화폐',\n",
358+
" 104: '생각',\n",
359+
" 105: '다시',\n",
360+
" 106: '새',\n",
361+
" 107: '전생',\n",
362+
" 108: '만해',\n",
363+
" 109: '좋다',\n",
364+
" 110: '인게',\n",
365+
" 111: '무집',\n",
366+
" 112: '에',\n",
367+
" 113: '쫄',\n",
368+
" 114: '교회',\n",
369+
" 115: '마세요',\n",
370+
" 116: '싼데',\n",
371+
" 117: '린다',\n",
372+
" 118: '혼자',\n",
373+
" 119: '걸리겠어',\n",
374+
" 120: '망함',\n",
375+
" 121: '끄고나오세요'}"
376+
]
377+
},
378+
"execution_count": 18,
379+
"metadata": {},
380+
"output_type": "execute_result"
381+
}
382+
],
100383
"source": [
101384
"idx2char"
102385
]
386+
},
387+
{
388+
"cell_type": "code",
389+
"execution_count": null,
390+
"metadata": {},
391+
"outputs": [],
392+
"source": []
103393
}
104394
],
105395
"metadata": {
@@ -118,7 +408,7 @@
118408
"name": "python",
119409
"nbconvert_exporter": "python",
120410
"pygments_lexer": "ipython3",
121-
"version": "3.6.9"
411+
"version": "3.7.6"
122412
}
123413
},
124414
"nbformat": 4,

0 commit comments

Comments
 (0)