forked from getalp/WCE-LIG
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig_end_user.yml
152 lines (123 loc) · 7.2 KB
/
config_end_user.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
## YAML Template.
---
#####################################################################################################
# Groupe d'Étude pour la Traduction/le Traitement Automatique des Langues et de la Parole (GETALP)
# Homepage: http://getalp.imag.fr
#
# Author: Tien LE (ngoc-tien.le@imag.fr)
# Advisors: Laurent Besacier & Benjamin Lecouteux
# URL: tienhuong.weebly.com
#####################################################################################################
# Purpose: Users should adapt this configuration of our Confidence Estimation System (CE System) before installing it.
## Input data settings.
input_data:
## Language pair: source language, target language, and the combined pair identifier.
source_language: fr
target_language: en
# The value here is the two codes above joined by "_"; presumably the three
# values must stay consistent with each other - verify against the consumer.
language_pair: fr_en
###################################################################################################
## Corpus names: if they are not configured, the default configuration is used.
## Pre-processing switches (lowercasing and tokenizing).
# NOTE(review): the original note at this position read
# "no need tokenizer & lowercasing", yet both switches below are set to True.
# Confirm whether True means "apply the step" or "the step is not needed".
lowercase: True
tokenizer: True
###################################################################################################
## WCE system - corpus files
# Source-language corpus (.fr) and machine-translated target-language corpus (.en).
raw_corpus_source_language: /input_data/src-ref-all.fr
raw_corpus_target_language: /input_data/tgt-mt-all.en
# Post-editions of the machine-translated sentences, in the target language.
post_edition_of_machine_translation_sentences_target_language: /input_data/tgt-pe-all.en
# NOTE(review): a plain None is read by YAML loaders as the string "None",
# not as a null value - confirm that the consumer expects the string.
tags_file_path: None
# Training and test sizes; together 10000 + 881 = 10881, the number that also
# appears in the MOSES output file names configured in this file.
raw_corpus_training_size: 10000
raw_corpus_test_size: 881
###################################################################################################
## Language models: if they are not configured, the default configuration is used.
# French (source language) and English (target language); the file names
# suggest 5-gram models.
language_model_source_language: /input_data/lm_5gram.fr
language_model_target_language: /input_data/lm_5gram.en
###################################################################################################
## Output files from Google Translator and Bing Translator.
# Both files carry the target-language extension (.en).
google_translator: /input_data/output_Google_Translator.en
bing_translator: /input_data/output_Bing_Translator.en
###################################################################################################
## N-best lists produced with MOSES
# 1-best list and n-best list, both including alignment information
# (n is presumably 1000, judging by the file name - verify).
one_best_list_included_alignment: /input_data/src-ref-all.fr.translated_output10881
n_best_list_included_alignment: /input_data/src-ref-all.fr.translated_1000_output10881
## The flag below tells whether a MOSES result containing both hypotheses and alignments is available.
## is_has_a_file_included_alignment: 1 = true (the file exists); 0 = false (it does not exist).
## If it does not exist, the features that use the alignment file are not available, such as: Longest Source gram length, 18 features: Target; Right_Target; Left_Target; Source; Right_Source; Left_Source (Word; POS; Stemming); WPP any; Max; Min; Nodes; WPP Exact
# NOTE(review): the commented-out "wmt" alternative below uses the value 2,
# which the 0/1 description above does not cover - confirm its meaning.
#ce_agent
is_has_a_file_included_alignment: 1
#wmt
#is_has_a_file_included_alignment: 2
## MOSES tools
# Both tool_moses alternatives are commented out; only the training script is set.
#tool_moses: ../../tool/moses/bin/moses
#tool_moses: /tools/moses/bin/moses
tool_train_model: /tools/moses/scripts/training/train-model.perl
## MOSES version (the n-best output format differs between versions, see the samples below)
#ce_agent
## Sample output line, version 2009:
#0 ||| Encore a crucial step for the Balkans . ||| d: 0 -1.16766 0 0 -0.731304 0 0 lm: -230.634 tm: -2.46672 -8.64038 -1.53706 -4.09358 2.99969 w: -8 ||| -310.634 ||| 0=0 1-4=1-4 5=5 6=6 7=7 ||| 0=0 1=1 2=3 3=2 4=4 5=5 6=6 7=7 ||| 0=0 1=1 2=3 3=2 4=4 5=5 6=6 7=7
version_moses: 2009
## Sample output line, versions 2013 and 2014:
# 0 ||| the chirurgiens of los angeles ont said qu' ils étaient outrés , said m camus . ||| LexicalReordering0= -1.81744 0 0 -1.64139 0 0 Distortion0= 0 LM0= -146.242 WordPenalty0= -16 PhrasePenalty0= 15 TranslationModel0= -7.20993 -7.62317 -2.93815 -4.42405 ||| -1014.93 ||| 0=0 1=1 2=2 3=3 4=4 5=5 6=6 7=7 8=8 9=9 10=10 11-13=11-12 14=13 15=14 16=15 ||| 0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 11-11 12-12 13-12 14-13 15-14 16-15
#version_moses: 2014
## Path to moses.ini, relative to the directory "input_data" (currently disabled)
#moses_ini: output_moses2009/moses.ini
###################################################################################################
## Berkeley Parser
## for FR (French)
#tool_get_constituent_fr: ../../tool/bonsai_v3.2/bin/get_constituent_tree.sh
# NOTE(review): the active value is the bonsai "bin" directory, whereas the
# commented-out alternative above names the get_constituent_tree.sh script
# inside such a directory - confirm which form the consumer expects.
tool_get_constituent_fr: /tools/bonsai_v3.2/bin
grammar_fr_for_berkeley_parser_path: /tools/berkeley_parser/fra_sm5.gr
## for EN (English)
grammar_en_for_berkeley_parser_path: /tools/berkeley_parser/eng_sm6.gr
## Parser launcher script (the key name is not tied to one language)
tool_berkeley_parser_path: /tools/berkeley_parser/berkeley_parser.sh
## for ES (Spanish)
# NOTE(review): a plain None is read by YAML loaders as the string "None",
# not as a null value - confirm that the consumer expects the string.
grammar_es_for_berkeley_parser_path: None
## for AR (Arabic)
grammar_ar_for_berkeley_parser_path: None
## SRILM tool
## Paths to tools that the user must install (SRILM has to be installed).
srilm_bin_directory: /tools/srilm-1.7.1/bin/i686-m64
# The ngram executable located inside srilm_bin_directory.
tool_ngram: /tools/srilm-1.7.1/bin/i686-m64/ngram
## BabelNet
# Installation directory, followed by one calculateSenses_<language>.sh script
# per language (English, French, Spanish) inside that directory.
tool_babel_net_dir: /tools/BabelSenseCount_v25/BabelNet-2.5
tool_babel_net_en: /tools/BabelSenseCount_v25/BabelNet-2.5/calculateSenses_english.sh
tool_babel_net_fr: /tools/BabelSenseCount_v25/BabelNet-2.5/calculateSenses_french.sh
tool_babel_net_es: /tools/BabelSenseCount_v25/BabelNet-2.5/calculateSenses_spanish.sh
###################################################################################################
## TreeTagger
# Installation directory.
tree_tagger_path: /tools/treetagger
#ce_agent
# French tagging command inside the "cmd" sub-directory of tree_tagger_path.
treetagger_french: /tools/treetagger/cmd/tree-tagger-french
###################################################################################################
## fastnc
tool_fastnc: /tools/fastnc/bin/fastnc
# Perl script RefToCtmWCE.pl shipped under the WCE system's lib/shell_script directory.
tool_RefToCtm: /wce_system/lib/shell_script/RefToCtmWCE.pl
###################################################################################################
## TERp-A (terpa) and related TER tools
# Java executable and JVM memory options (-Xms: initial heap size, -Xmx: maximum heap size).
tool_java: /usr/bin/java
tool_java_mem_param: "-Xms1G -Xmx3G"
# TERp installation (terp.v1): launcher scripts, parameter files, base directory and jar.
tool_terpa: /tools/terplus/terp.v1/bin/terpa_TienLE_TanLE
tool_terpa_param: /tools/terplus/terp.v1/data/terpa2.param
tool_terpa_param_loc: /tools/terplus/terp.v1/data/data_loc.param
tool_terpa_dir: /tools/terplus/terp.v1/
tool_terpa_jar: /tools/terplus/terp.v1/dist/lib/terp.jar
tool_terpa_no_shift_cost: /tools/terplus/terp.v1/bin/terpa_TienLE_TanLE_no_shift_cost
tool_terp: /tools/terplus/terp.v1/bin/terp
tool_tercom: /tools/terplus/terp.v1/bin/tercom
# TERCOM jar from the separate Evaluation_Script directory.
tercom_jar: /tools/Evaluation_Script/tercom-0.7.25/tercom.7.25.jar
# terpa variant used with Normalize = true, i.e. with tokenizing
# (presumably the stock terpa script - verify).
tool_terpa_within_tokenizing: /tools/terplus/terp.v1/bin/terpa
## TERCPP
# The "/./" segment names the current directory, so the path resolves to the
# same file as /tools/tercpp/tercpp.0.7.1.
tool_tercpp: /tools/tercpp/./tercpp.0.7.1
###################################################################################################
## CRF model (Wapiti executable)
# The "/./" segment names the current directory, so the path resolves to the
# same file as /tools/wapiti-1.5.0/wapiti.
tool_wapiti: /tools/wapiti-1.5.0/./wapiti
###################################################################################################
## GIZA++ (after installing GIZA++, copy the tools into a new directory "bin")
# Earlier alternatives pointing at the separate GIZA++-v2 and mkcls-v2
# directories, kept for reference:
#path_to_tool_giza: /tools/giza-pp/GIZA++-v2
#path_to_tool_mkcls: /tools/giza-pp/mkcls-v2
# Active settings: both keys point at the same "bin" directory.
path_to_tool_giza: /tools/giza-pp/bin
path_to_tool_mkcls: /tools/giza-pp/bin
###################################################################################################
## fast_align
# The "/./" segment names the current directory, so the path resolves to the
# same file as /tools/fast_align/fast_align.
tool_fast_align: /tools/fast_align/./fast_align