
Commit e18dda3

Add simple-nmt detok
1 parent 9429425 commit e18dda3

File tree

18 files changed (+1802, -0 lines)


src/11_seq2seq/modules/seq2seq2.py

Lines changed: 407 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
#-*- coding:utf-8 -*-
import sys
sys.stdin.reconfigure(encoding='utf-8')


if __name__ == "__main__":
    for line in sys.stdin:
        if line.strip() != "":
            if '▁▁' in line:
                # Drop the tokenizer's spaces, turn each '▁▁' into a real space,
                # and remove any remaining single '▁'.
                line = line.strip().replace(' ', '').replace('▁▁', ' ').replace('▁', '').strip()
            else:
                # Drop the tokenizer's spaces and turn each '▁' back into a space.
                line = line.strip().replace(' ', '').replace('▁', ' ').strip()

            sys.stdout.write(line + '\n')
        else:
            sys.stdout.write('\n')
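A quick usage sketch for this detok script (the excerpt does not show its file name; detokenizer.py is only an assumed name here):

    echo "▁나 는 ▁학교 에 ▁간 다" | python3 detokenizer.py
    # expected output: 나는 학교에 간다

It drops the spaces introduced by tokenization and turns each '▁' word-boundary mark back into a real space.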
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
import sys

STR = '▁'

if __name__ == "__main__":
    # Reference file: the original (untokenized) text, one sentence per line.
    ref_fn = sys.argv[1]

    f = open(ref_fn, 'r')

    for ref in f:
        ref = ref.strip()
        input_line = sys.stdin.readline().strip()

        if input_line != "":
            buf = [STR]

            ref_index = 0
            input_index = 0
            while input_index < len(input_line):
                c = input_line[input_index]
                input_index += 1

                if c != ' ':
                    # Advance through the reference; every space skipped there
                    # means the current character starts a new original word.
                    while ref_index < len(ref):
                        c_ = ref[ref_index]
                        ref_index += 1

                        if c_ == ' ':
                            c = STR + c
                        else:
                            break
                buf += [c]

            # # We assume that stdin has more tokens than reference input.
            # for ref_token in ref_tokens:
            #     tmp_buf = []
            #
            #     while idx < len(tokens):
            #         if tokens[idx].strip() == '':
            #             idx += 1
            #             continue
            #
            #         tmp_buf += [tokens[idx]]
            #         idx += 1
            #
            #         if ''.join(tmp_buf) == ref_token:
            #             break
            #
            #     if len(tmp_buf) > 0:
            #         buf += [STR + tmp_buf[0].strip()] + tmp_buf[1:]

            sys.stdout.write(''.join(buf) + '\n')
        else:
            sys.stdout.write('\n')

    f.close()
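A worked sketch of what this script does, assuming it is the post_tokenize.py invoked by the preprocessing script below (ref.ko is an illustrative file name):

    printf '나는 학교에 간다\n' > ref.ko
    printf '나 는 학교 에 간 다\n' | python3 post_tokenize.py ref.ko
    # expected output: ▁나 는 ▁학교 에 ▁간 다

Each '▁' marks a position where the reference (untokenized) sentence had a space, which is exactly what the detok script above undoes.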
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
# Shuffle the corpus
shuf ./data/corpus.tsv > ./data/corpus.shuf.tsv

# Split the corpus into train/valid/test sets
head -n 1200000 ./data/corpus.shuf.tsv > ./data/corpus.shuf.train.tsv
tail -n 402409 ./data/corpus.shuf.tsv > ./data/temp.tsv
head -n 200000 ./data/temp.tsv > ./data/corpus.shuf.valid.tsv
rm ./data/temp.tsv
tail -n 202409 ./data/corpus.shuf.tsv > ./data/corpus.shuf.test.tsv

# Split each set into its Korean and English sides
cut -f1 ./data/corpus.shuf.train.tsv > ./data/corpus.shuf.train.ko
cut -f2 ./data/corpus.shuf.train.tsv > ./data/corpus.shuf.train.en
cut -f1 ./data/corpus.shuf.valid.tsv > ./data/corpus.shuf.valid.ko
cut -f2 ./data/corpus.shuf.valid.tsv > ./data/corpus.shuf.valid.en
cut -f1 ./data/corpus.shuf.test.tsv > ./data/corpus.shuf.test.ko
cut -f2 ./data/corpus.shuf.test.tsv > ./data/corpus.shuf.test.en
# head -n 3 ./data/corpus.shuf.*.ko
# head -n 3 ./data/corpus.shuf.*.en
# wc -l ./data/corpus.shuf.*.*

# Tokenize the corpus
cat ./data/corpus.shuf.train.ko | mecab -O wakati -b 99999 | python3 post_tokenize.py ./data/corpus.shuf.train.ko > ./data/corpus.shuf.train.tok.ko
cat ./data/corpus.shuf.train.en | python3 tokenizer.py | python3 post_tokenize.py ./data/corpus.shuf.train.en > ./data/corpus.shuf.train.tok.en
cat ./data/corpus.shuf.valid.ko | mecab -O wakati -b 99999 | python3 post_tokenize.py ./data/corpus.shuf.valid.ko > ./data/corpus.shuf.valid.tok.ko
cat ./data/corpus.shuf.valid.en | python3 tokenizer.py | python3 post_tokenize.py ./data/corpus.shuf.valid.en > ./data/corpus.shuf.valid.tok.en
cat ./data/corpus.shuf.test.ko | mecab -O wakati -b 99999 | python3 post_tokenize.py ./data/corpus.shuf.test.ko > ./data/corpus.shuf.test.tok.ko
cat ./data/corpus.shuf.test.en | python3 tokenizer.py | python3 post_tokenize.py ./data/corpus.shuf.test.en > ./data/corpus.shuf.test.tok.en
# head -n 3 ./data/corpus.shuf.*.tok.*
# wc -l ./data/corpus.shuf.*.tok.*

# Subword segmentation of the corpus
# BPE is usually learned on the training set only and then applied consistently to valid and test.
# --symbols sets how many merge operations the BPE algorithm will attempt:
# lower it if the output looks too merged, raise it if the output stays split into too many pieces.
python3 ./subword-nmt/learn_bpe.py --input ./data/corpus.shuf.train.tok.ko --output bpe.ko.model --symbols 30000 --verbose
python3 ./subword-nmt/learn_bpe.py --input ./data/corpus.shuf.train.tok.en --output bpe.en.model --symbols 50000 --verbose
cat ./data/corpus.shuf.train.tok.ko | python3 ./subword-nmt/apply_bpe.py --c bpe.ko.model > ./data/corpus.shuf.train.tok.bpe.ko
cat ./data/corpus.shuf.train.tok.en | python3 ./subword-nmt/apply_bpe.py --c bpe.en.model > ./data/corpus.shuf.train.tok.bpe.en
cat ./data/corpus.shuf.valid.tok.ko | python3 ./subword-nmt/apply_bpe.py --c bpe.ko.model > ./data/corpus.shuf.valid.tok.bpe.ko
cat ./data/corpus.shuf.valid.tok.en | python3 ./subword-nmt/apply_bpe.py --c bpe.en.model > ./data/corpus.shuf.valid.tok.bpe.en
cat ./data/corpus.shuf.test.tok.ko | python3 ./subword-nmt/apply_bpe.py --c bpe.ko.model > ./data/corpus.shuf.test.tok.bpe.ko
cat ./data/corpus.shuf.test.tok.en | python3 ./subword-nmt/apply_bpe.py --c bpe.en.model > ./data/corpus.shuf.test.tok.bpe.en
# head -n 3 ./data/corpus.shuf.*.tok.bpe.*
# wc -l ./data/corpus.shuf.*.tok.bpe.*
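Since this commit is about detokenization, a sketch of the reverse direction may be useful (hypothetical file names, and assuming the detok script above is saved as detokenizer.py): strip the '@@ ' BPE separators first (see the subword-nmt README below), then undo the '▁' marking.

    cat ./data/corpus.shuf.test.tok.bpe.ko | sed -r 's/(@@ )|(@@ ?$)//g' | python3 detokenizer.py > ./data/corpus.shuf.test.detok.ko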
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
__pycache__/
*.pyc
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
CHANGELOG
---------

v0.2:
 - different, more consistent handling of end-of-word token (commit a749a7) (https://github.com/rsennrich/subword-nmt/issues/19)
 - allow passing of vocabulary and frequency threshold to apply_bpe.py, preventing the production of OOV (or rare) subword units (commit a00db)
 - made learn_bpe.py deterministic (commit 4c54e)
 - various changes to make handling of UTF more consistent between Python versions
 - new command line arguments for apply_bpe.py:
   - '--glossaries' to prevent given strings from being affected by BPE
   - '--merges' to apply a subset of learned BPE operations
 - new command line arguments for learn_bpe.py:
   - '--dict-input': rather than raw text file, interpret input as a frequency dictionary (as created by get_vocab.py).


v0.1:
 - consistent cross-version unicode handling
 - all scripts are now deterministic
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2015 University of Edinburgh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
Subword Neural Machine Translation
==================================

This repository contains preprocessing scripts to segment text into subword
units. The primary purpose is to facilitate the reproduction of our experiments
on Neural Machine Translation with subword units (see below for reference).

INSTALLATION
------------

Clone or copy this repository and follow the usage instructions below.

For an installable package, see https://github.com/rsennrich/subword-nmt/tree/package


USAGE INSTRUCTIONS
------------------

Check the individual files for usage instructions.

To apply byte pair encoding to word segmentation, invoke these commands:

    ./learn_bpe.py -s {num_operations} < {train_file} > {codes_file}
    ./apply_bpe.py -c {codes_file} < {test_file}

To segment rare words into character n-grams, do the following:

    ./get_vocab.py < {train_file} > {vocab_file}
    ./segment-char-ngrams.py --vocab {vocab_file} -n {order} --shortlist {size} < {test_file}

The original segmentation can be restored with a simple replacement:

    sed -r 's/(@@ )|(@@ ?$)//g'
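For instance, with an illustrative BPE-segmented line:

    echo "the in@@ ter@@ net" | sed -r 's/(@@ )|(@@ ?$)//g'
    # prints: the internet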

BEST PRACTICE ADVICE FOR BYTE PAIR ENCODING IN NMT
--------------------------------------------------

We found that for languages that share an alphabet, learning BPE on the
concatenation of the (two or more) involved languages increases the consistency
of segmentation, and reduces the problem of inserting/deleting characters when
copying/transliterating names.

However, this introduces undesirable edge cases in that a word may be segmented
in a way that has only been observed in the other language, and is thus unknown
at test time. To prevent this, `apply_bpe.py` accepts a `--vocabulary` and a
`--vocabulary-threshold` option so that the script will only produce symbols
which also appear in the vocabulary (with at least some frequency).

To use this functionality, we recommend the following recipe (assuming L1 and L2
are the two languages):

Learn byte pair encoding on the concatenation of the training text, and get the resulting vocabulary for each:

    cat {train_file}.L1 {train_file}.L2 | ./learn_bpe.py -s {num_operations} -o {codes_file}
    ./apply_bpe.py -c {codes_file} < {train_file}.L1 | ./get_vocab.py > {vocab_file}.L1
    ./apply_bpe.py -c {codes_file} < {train_file}.L2 | ./get_vocab.py > {vocab_file}.L2

More conveniently, you can do the same with this command:

    ./learn_joint_bpe_and_vocab.py --input {train_file}.L1 {train_file}.L2 -s {num_operations} -o {codes_file} --write-vocabulary {vocab_file}.L1 {vocab_file}.L2

Re-apply byte pair encoding with the vocabulary filter:

    ./apply_bpe.py -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {train_file}.L1 > {train_file}.BPE.L1
    ./apply_bpe.py -c {codes_file} --vocabulary {vocab_file}.L2 --vocabulary-threshold 50 < {train_file}.L2 > {train_file}.BPE.L2

As a last step, extract the vocabulary to be used by the neural network. Example with Nematus:

    nematus/data/build_dictionary.py {train_file}.BPE.L1 {train_file}.BPE.L2

[you may want to take the union of all vocabularies to support multilingual systems]

For test/dev data, re-use the same options for consistency:

    ./apply_bpe.py -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {test_file}.L1 > {test_file}.BPE.L1


PUBLICATIONS
------------

The segmentation methods are described in:

Rico Sennrich, Barry Haddow and Alexandra Birch (2016):
    Neural Machine Translation of Rare Words with Subword Units.
    Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.

ACKNOWLEDGMENTS
---------------

This project has received funding from Samsung Electronics Polska sp. z o.o. - Samsung R&D Institute Poland, and from the European Union’s Horizon 2020 research and innovation programme under grant agreement 645452 (QT21).
