
Commit e18dda3

Add simple-nmt detok
1 parent 9429425 commit e18dda3

File tree

18 files changed (+1802, -0 lines)


src/11_seq2seq/modules/seq2seq2.py

Lines changed: 407 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
#-*- coding:utf-8 -*-
import sys
sys.stdin.reconfigure(encoding='utf-8')


if __name__ == "__main__":
    for line in sys.stdin:
        if line.strip() != "":
            if '▁▁' in line:
                # Drop the tokenizer's spaces, turn each '▁▁' into a real space,
                # and remove any remaining single '▁'.
                line = line.strip().replace(' ', '').replace('▁▁', ' ').replace('▁', '').strip()
            else:
                # Drop the tokenizer's spaces and turn each '▁' back into a space.
                line = line.strip().replace(' ', '').replace('▁', ' ').strip()

            sys.stdout.write(line + '\n')
        else:
            sys.stdout.write('\n')
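A quick usage sketch for this detok script (the excerpt does not show its file name; detokenizer.py is only an assumed name here):

    echo "▁나 는 ▁학교 에 ▁간 다" | python3 detokenizer.py
    # expected output: 나는 학교에 간다

It drops the spaces introduced by tokenization and turns each '▁' word-boundary mark back into a real space.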
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
import sys

STR = '▁'

if __name__ == "__main__":
    # Reference file: the original (untokenized) text, one sentence per line.
    ref_fn = sys.argv[1]

    f = open(ref_fn, 'r')

    for ref in f:
        ref = ref.strip()
        input_line = sys.stdin.readline().strip()

        if input_line != "":
            buf = [STR]

            ref_index = 0
            input_index = 0
            while input_index < len(input_line):
                c = input_line[input_index]
                input_index += 1

                if c != ' ':
                    # Advance through the reference; every space skipped there
                    # means the current character starts a new original word.
                    while ref_index < len(ref):
                        c_ = ref[ref_index]
                        ref_index += 1

                        if c_ == ' ':
                            c = STR + c
                        else:
                            break
                buf += [c]

            # # We assume that stdin has more tokens than reference input.
            # for ref_token in ref_tokens:
            #     tmp_buf = []
            #
            #     while idx < len(tokens):
            #         if tokens[idx].strip() == '':
            #             idx += 1
            #             continue
            #
            #         tmp_buf += [tokens[idx]]
            #         idx += 1
            #
            #         if ''.join(tmp_buf) == ref_token:
            #             break
            #
            #     if len(tmp_buf) > 0:
            #         buf += [STR + tmp_buf[0].strip()] + tmp_buf[1:]

            sys.stdout.write(''.join(buf) + '\n')
        else:
            sys.stdout.write('\n')

    f.close()
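A worked sketch of what this script does, assuming it is the post_tokenize.py invoked by the preprocessing script below (ref.ko is an illustrative file name):

    printf '나는 학교에 간다\n' > ref.ko
    printf '나 는 학교 에 간 다\n' | python3 post_tokenize.py ref.ko
    # expected output: ▁나 는 ▁학교 에 ▁간 다

Each '▁' marks a position where the reference (untokenized) sentence had a space, which is exactly what the detok script above undoes.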
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
# Shuffle the corpus
shuf ./data/corpus.tsv > ./data/corpus.shuf.tsv

# Split the corpus into train/valid/test sets
head -n 1200000 ./data/corpus.shuf.tsv > ./data/corpus.shuf.train.tsv
tail -n 402409 ./data/corpus.shuf.tsv > ./data/temp.tsv
head -n 200000 ./data/temp.tsv > ./data/corpus.shuf.valid.tsv
rm ./data/temp.tsv
tail -n 202409 ./data/corpus.shuf.tsv > ./data/corpus.shuf.test.tsv

# Split each set into its Korean and English sides
cut -f1 ./data/corpus.shuf.train.tsv > ./data/corpus.shuf.train.ko
cut -f2 ./data/corpus.shuf.train.tsv > ./data/corpus.shuf.train.en
cut -f1 ./data/corpus.shuf.valid.tsv > ./data/corpus.shuf.valid.ko
cut -f2 ./data/corpus.shuf.valid.tsv > ./data/corpus.shuf.valid.en
cut -f1 ./data/corpus.shuf.test.tsv > ./data/corpus.shuf.test.ko
cut -f2 ./data/corpus.shuf.test.tsv > ./data/corpus.shuf.test.en
# head -n 3 ./data/corpus.shuf.*.ko
# head -n 3 ./data/corpus.shuf.*.en
# wc -l ./data/corpus.shuf.*.*

# Tokenize the corpus
cat ./data/corpus.shuf.train.ko | mecab -O wakati -b 99999 | python3 post_tokenize.py ./data/corpus.shuf.train.ko > ./data/corpus.shuf.train.tok.ko
cat ./data/corpus.shuf.train.en | python3 tokenizer.py | python3 post_tokenize.py ./data/corpus.shuf.train.en > ./data/corpus.shuf.train.tok.en
cat ./data/corpus.shuf.valid.ko | mecab -O wakati -b 99999 | python3 post_tokenize.py ./data/corpus.shuf.valid.ko > ./data/corpus.shuf.valid.tok.ko
cat ./data/corpus.shuf.valid.en | python3 tokenizer.py | python3 post_tokenize.py ./data/corpus.shuf.valid.en > ./data/corpus.shuf.valid.tok.en
cat ./data/corpus.shuf.test.ko | mecab -O wakati -b 99999 | python3 post_tokenize.py ./data/corpus.shuf.test.ko > ./data/corpus.shuf.test.tok.ko
cat ./data/corpus.shuf.test.en | python3 tokenizer.py | python3 post_tokenize.py ./data/corpus.shuf.test.en > ./data/corpus.shuf.test.tok.en
# head -n 3 ./data/corpus.shuf.*.tok.*
# wc -l ./data/corpus.shuf.*.tok.*

# Subword segmentation of the corpus
# BPE is usually learned on the training set only and then applied consistently to valid and test.
# --symbols sets how many merge operations the BPE algorithm will attempt:
# lower it if the output looks too merged, raise it if the output stays split into too many pieces.
python3 ./subword-nmt/learn_bpe.py --input ./data/corpus.shuf.train.tok.ko --output bpe.ko.model --symbols 30000 --verbose
python3 ./subword-nmt/learn_bpe.py --input ./data/corpus.shuf.train.tok.en --output bpe.en.model --symbols 50000 --verbose
cat ./data/corpus.shuf.train.tok.ko | python3 ./subword-nmt/apply_bpe.py --c bpe.ko.model > ./data/corpus.shuf.train.tok.bpe.ko
cat ./data/corpus.shuf.train.tok.en | python3 ./subword-nmt/apply_bpe.py --c bpe.en.model > ./data/corpus.shuf.train.tok.bpe.en
cat ./data/corpus.shuf.valid.tok.ko | python3 ./subword-nmt/apply_bpe.py --c bpe.ko.model > ./data/corpus.shuf.valid.tok.bpe.ko
cat ./data/corpus.shuf.valid.tok.en | python3 ./subword-nmt/apply_bpe.py --c bpe.en.model > ./data/corpus.shuf.valid.tok.bpe.en
cat ./data/corpus.shuf.test.tok.ko | python3 ./subword-nmt/apply_bpe.py --c bpe.ko.model > ./data/corpus.shuf.test.tok.bpe.ko
cat ./data/corpus.shuf.test.tok.en | python3 ./subword-nmt/apply_bpe.py --c bpe.en.model > ./data/corpus.shuf.test.tok.bpe.en
# head -n 3 ./data/corpus.shuf.*.tok.bpe.*
# wc -l ./data/corpus.shuf.*.tok.bpe.*
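Since this commit is about detokenization, a sketch of the reverse direction may be useful (hypothetical file names, and assuming the detok script above is saved as detokenizer.py): strip the '@@ ' BPE separators first (see the subword-nmt README below), then undo the '▁' marking.

    cat ./data/corpus.shuf.test.tok.bpe.ko | sed -r 's/(@@ )|(@@ ?$)//g' | python3 detokenizer.py > ./data/corpus.shuf.test.detok.ko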
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
__pycache__/
*.pyc
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
CHANGELOG
---------

v0.2:
 - different, more consistent handling of end-of-word token (commit a749a7) (https://github.com/rsennrich/subword-nmt/issues/19)
 - allow passing of vocabulary and frequency threshold to apply_bpe.py, preventing the production of OOV (or rare) subword units (commit a00db)
 - made learn_bpe.py deterministic (commit 4c54e)
 - various changes to make handling of UTF more consistent between Python versions
 - new command line arguments for apply_bpe.py:
   - '--glossaries' to prevent given strings from being affected by BPE
   - '--merges' to apply a subset of learned BPE operations
 - new command line arguments for learn_bpe.py:
   - '--dict-input': rather than raw text file, interpret input as a frequency dictionary (as created by get_vocab.py).


v0.1:
 - consistent cross-version unicode handling
 - all scripts are now deterministic
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2015 University of Edinburgh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
Subword Neural Machine Translation
==================================

This repository contains preprocessing scripts to segment text into subword
units. The primary purpose is to facilitate the reproduction of our experiments
on Neural Machine Translation with subword units (see below for reference).

INSTALLATION
------------

Clone or copy this repository and follow the usage instructions below.

For an installable package, see https://github.com/rsennrich/subword-nmt/tree/package


USAGE INSTRUCTIONS
------------------

Check the individual files for usage instructions.

To apply byte pair encoding to word segmentation, invoke these commands:

    ./learn_bpe.py -s {num_operations} < {train_file} > {codes_file}
    ./apply_bpe.py -c {codes_file} < {test_file}

To segment rare words into character n-grams, do the following:

    ./get_vocab.py < {train_file} > {vocab_file}
    ./segment-char-ngrams.py --vocab {vocab_file} -n {order} --shortlist {size} < {test_file}

The original segmentation can be restored with a simple replacement:

    sed -r 's/(@@ )|(@@ ?$)//g'
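For instance, with an illustrative BPE-segmented line:

    echo "the in@@ ter@@ net" | sed -r 's/(@@ )|(@@ ?$)//g'
    # prints: the internet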

BEST PRACTICE ADVICE FOR BYTE PAIR ENCODING IN NMT
--------------------------------------------------

We found that for languages that share an alphabet, learning BPE on the
concatenation of the (two or more) involved languages increases the consistency
of segmentation, and reduces the problem of inserting/deleting characters when
copying/transliterating names.

However, this introduces undesirable edge cases in that a word may be segmented
in a way that has only been observed in the other language, and is thus unknown
at test time. To prevent this, `apply_bpe.py` accepts a `--vocabulary` and a
`--vocabulary-threshold` option so that the script will only produce symbols
which also appear in the vocabulary (with at least some frequency).

To use this functionality, we recommend the following recipe (assuming L1 and L2
are the two languages):

Learn byte pair encoding on the concatenation of the training text, and get the resulting vocabulary for each:

    cat {train_file}.L1 {train_file}.L2 | ./learn_bpe.py -s {num_operations} -o {codes_file}
    ./apply_bpe.py -c {codes_file} < {train_file}.L1 | ./get_vocab.py > {vocab_file}.L1
    ./apply_bpe.py -c {codes_file} < {train_file}.L2 | ./get_vocab.py > {vocab_file}.L2

More conveniently, you can do the same with this command:

    ./learn_joint_bpe_and_vocab.py --input {train_file}.L1 {train_file}.L2 -s {num_operations} -o {codes_file} --write-vocabulary {vocab_file}.L1 {vocab_file}.L2

Re-apply byte pair encoding with the vocabulary filter:

    ./apply_bpe.py -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {train_file}.L1 > {train_file}.BPE.L1
    ./apply_bpe.py -c {codes_file} --vocabulary {vocab_file}.L2 --vocabulary-threshold 50 < {train_file}.L2 > {train_file}.BPE.L2

As a last step, extract the vocabulary to be used by the neural network. Example with Nematus:

    nematus/data/build_dictionary.py {train_file}.BPE.L1 {train_file}.BPE.L2

[you may want to take the union of all vocabularies to support multilingual systems]

For test/dev data, re-use the same options for consistency:

    ./apply_bpe.py -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {test_file}.L1 > {test_file}.BPE.L1


PUBLICATIONS
------------

The segmentation methods are described in:

Rico Sennrich, Barry Haddow and Alexandra Birch (2016):
    Neural Machine Translation of Rare Words with Subword Units.
    Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.

ACKNOWLEDGMENTS
---------------

This project has received funding from Samsung Electronics Polska sp. z o.o. - Samsung R&D Institute Poland, and from the European Union’s Horizon 2020 research and innovation programme under grant agreement 645452 (QT21).
