add wmt22 recipes with TowerInstruct and Llama3.1 LLMs #103

Merged: 5 commits, Sep 12, 2024

Changes from 1 commit:
add wmt22 recipes with TowerInstruct and Llama3.1 LLMs
vince62s committed Sep 12, 2024
commit 6f454da77ab873048523ac130f3ab508e50bd043
41 changes: 41 additions & 0 deletions recipes/llama3.1/llama-instruct-inference.yaml
transforms: [onmt_tokenize]

transforms_configs:
  onmt_tokenize:
    src_subword_type: bpe
    src_subword_model: "${EOLE_MODEL_DIR}/llama3.1-8b-instruct/bpe.model"
    src_onmttok_kwargs: {"mode": "space", "spacer_annotate": True, "preserve_placeholders": True}
    tgt_subword_type: bpe
    tgt_subword_model: "${EOLE_MODEL_DIR}/llama3.1-8b-instruct/bpe.model"
    tgt_onmttok_kwargs: {"mode": "space", "spacer_annotate": True, "preserve_placeholders": True}
    gpt2_pretok: true
    mapped_tokens: [['<|start_header_id|>', '⦅start_header_id⦆'], ['<|end_header_id|>', '⦅end_header_id⦆'], ['<|eot_id|>', '⦅eot_id⦆']]

optional_eos: ['<|eot_id|>']

# Model info
model_path: "${EOLE_MODEL_DIR}/llama3.1-8b-instruct"

# Inference
seed: 42
max_length: 256
# max_length: 1
# gpu: 0
batch_type: tokens
batch_size: 8192
#world_size: 1
#gpu_ranks: [0]
world_size: 2
gpu_ranks: [0, 1]
parallel_mode: "tensor_parallel"
#quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
#quant_type: "bnb_NF4"
compute_dtype: fp16
#random_sampling_topk: 1
#random_sampling_topp: 0.0
#random_sampling_temp: 0.9
beam_size: 1
n_best: 1
report_time: true
src: None
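This commit does not include a prompt-building script for the Llama 3.1 recipe. A minimal sketch, assuming the same pattern as `promptize_mistral.py` below and the standard Llama 3.1 chat template (the special tokens match the `mapped_tokens` declared above; file names are illustrative):

```
# Hypothetical prompt-building sketch for Llama 3.1, mirroring promptize_mistral.py.
# The special tokens match the mapped_tokens declared in the config above;
# file names are illustrative.
input_file_path = "newstest2022-src.en"
output_file_path = "newstest2022-src-prompt.en"

with open(input_file_path, "r") as input_file, open(output_file_path, "w") as output_file:
    for line in input_file:
        # ⦅newline⦆ stands in for real newlines so each prompt stays on one line.
        prompt = (
            "<|start_header_id|>user<|end_header_id|>⦅newline⦆⦅newline⦆"
            "Translate the following text from English into German.⦅newline⦆"
            f"English: {line.strip()}⦅newline⦆German:<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>⦅newline⦆⦅newline⦆"
        )
        output_file.write(prompt + "\n")
```

Inference then follows the same pattern as in the Tower recipes below, e.g. `eole predict -c llama-instruct-inference.yaml -src newstest2022-src-prompt.en -output newstest2022-hyp.de`.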

38 changes: 38 additions & 0 deletions recipes/wmt22_with_TowerInstruct-Mistral/README.md
# TowerInstruct (Mistral)

---
**NOTE**
To make your life easier, run these commands from the recipe directory (here `recipes/wmt22_with_TowerInstruct-Mistral`).
---

## Retrieve and convert model

### Set environment variables

```
export EOLE_MODEL_DIR=<where_to_store_models>
export HF_TOKEN=<your_hf_token>
```

### Download and convert model

```
eole convert HF --model_dir Unbabel/TowerInstruct-Mistral-7B-v0.2 --output $EOLE_MODEL_DIR/TowerInstruct-Mistral-7b-v0.2 --token $HF_TOKEN
```
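
The output directory name matters: it is the path referenced by `model_path` in `tower-inference.yaml` below.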


## Inference

### Build the prompt for translation of newstest2022-src.en

```
python promptize_mistral.py
```

### Run inference

```
eole predict -c tower-inference.yaml -src newstest2022-src-prompt.en -output newstest2022-hyp.de
```

Then you can score `newstest2022-hyp.de` against `newstest2022-ref.de` with a scorer such as sacrebleu or COMET, or use CometKiwi for a reference-free score.
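
For example, a minimal corpus-level BLEU sketch with sacrebleu's Python API (assuming one segment per line in both files):

```
# Minimal corpus-level BLEU sketch using sacrebleu's Python API.
from sacrebleu.metrics import BLEU

with open("newstest2022-hyp.de") as f:
    hyps = [line.strip() for line in f]
with open("newstest2022-ref.de") as f:
    refs = [line.strip() for line in f]

bleu = BLEU()
# corpus_score takes the hypotheses and a list of reference streams.
print(bleu.corpus_score(hyps, [refs]))
```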
2,037 changes: 2,037 additions & 0 deletions recipes/wmt22_with_TowerInstruct-Mistral/newstest2022-hyp.de

2,037 changes: 2,037 additions & 0 deletions recipes/wmt22_with_TowerInstruct-Mistral/newstest2022-ref.de

2,037 changes: 2,037 additions & 0 deletions recipes/wmt22_with_TowerInstruct-Mistral/newstest2022-src-prompt.en

2,037 changes: 2,037 additions & 0 deletions recipes/wmt22_with_TowerInstruct-Mistral/newstest2022-src.en

14 changes: 14 additions & 0 deletions recipes/wmt22_with_TowerInstruct-Mistral/promptize_mistral.py
# Input and output file paths
input_file_path = "newstest2022-src.en"
output_file_path = "newstest2022-src-prompt.en"

# Open the input file for reading and the output file for writing
with open(input_file_path, "r") as input_file, open(output_file_path, "w") as output_file:
    # Loop through each line in the input file
    for line in input_file:
        # Build the TowerInstruct prompt around the current line's content.
        # Please note the space between "<|im_start|>" and "user"/"assistant":
        # this is due to a specific behavior of onmt_tokenize; the HF tokenizer
        # has a similar quirk:
        # https://github.com/huggingface/transformers/issues/31513#issuecomment-2340151976
        prompt = f"<|im_start|> user⦅newline⦆Translate the following text from English into German.⦅newline⦆English: {line.strip()}⦅newline⦆German:<|im_end|> ⦅newline⦆<|im_start|> assistant⦅newline⦆"
        output_file.write(prompt + "\n")
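
For an input line `Hello.`, the script emits the following single line:

```
<|im_start|> user⦅newline⦆Translate the following text from English into German.⦅newline⦆English: Hello.⦅newline⦆German:<|im_end|> ⦅newline⦆<|im_start|> assistant⦅newline⦆
```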
38 changes: 38 additions & 0 deletions recipes/wmt22_with_TowerInstruct-Mistral/tower-inference.yaml
transforms: [onmt_tokenize]
transforms_configs:
  onmt_tokenize:
    src_subword_type: sentencepiece
    src_subword_model: "${EOLE_MODEL_DIR}/TowerInstruct-Mistral-7b-v0.2/tokenizer.model"
    src_onmttok_kwargs: {"mode": "space", "spacer_annotate": True, "preserve_placeholders": True}
    tgt_subword_type: sentencepiece
    tgt_subword_model: "${EOLE_MODEL_DIR}/TowerInstruct-Mistral-7b-v0.2/tokenizer.model"
    tgt_onmttok_kwargs: {"mode": "space", "spacer_annotate": True, "preserve_placeholders": True}
    mapped_tokens: [['<|im_start|>', '⦅im_start⦆'], ['<|im_end|>', '⦅im_end⦆']]

optional_eos: ['<|im_end|>']

# Model info
model_path: ["${EOLE_MODEL_DIR}/TowerInstruct-Mistral-7b-v0.2"]

# Inference
seed: 42
max_length: 512
gpu: 0
batch_type: tokens
batch_size: 8192
world_size: 1
gpu_ranks: [0]
#parallel_mode: "tensor_parallel"
#quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
#quant_type: "bnb_NF4"
compute_dtype: fp16
#random_sampling_topk: 1
#random_sampling_topp: 0.0
#random_sampling_temp: 0.9
beam_size: 1
n_best: 1
report_time: true
#backend: OpenNMT-py
#backend: CT2
src: None
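
To run this config across two GPUs, set `world_size: 2` and `gpu_ranks: [0, 1]` and uncomment `parallel_mode: "tensor_parallel"`; the Llama 3.1 config above shows that variant of these settings.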

38 changes: 38 additions & 0 deletions recipes/wmt22_with_TowerInstruct-llama2/README.md
# TowerInstruct (Llama2)

---
**NOTE**
To make your life easier, run these commands from the recipe directory (here `recipes/wmt22_with_TowerInstruct-llama2`).
---

## Retrieve and convert model

### Set environment variables

```
export EOLE_MODEL_DIR=<where_to_store_models>
export HF_TOKEN=<your_hf_token>
```

### Download and convert model

```
eole convert HF --model_dir Unbabel/TowerInstruct-7B-v0.2 --output $EOLE_MODEL_DIR/TowerInstruct-7b-v0.2 --token $HF_TOKEN
```


## Inference

### Build the prompt for translation of newstest2022-src.en

```
python promptize_llama2.py
```

### Run inference

```
eole predict -c tower-inference.yaml -src newstest2022-src-prompt.en -output newstest2022-hyp.de
```

Then you can score `newstest2022-hyp.de` against `newstest2022-ref.de` with a scorer such as sacrebleu or COMET, or use CometKiwi for a reference-free score.
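
As a sketch, reference-based COMET scoring with the `unbabel-comet` Python package might look like this (model name and batch size are illustrative):

```
# Reference-based COMET scoring sketch using the unbabel-comet package;
# model name and batch size are illustrative.
from comet import download_model, load_from_checkpoint

model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))

with open("newstest2022-src.en") as f:
    srcs = [line.strip() for line in f]
with open("newstest2022-hyp.de") as f:
    hyps = [line.strip() for line in f]
with open("newstest2022-ref.de") as f:
    refs = [line.strip() for line in f]

data = [{"src": s, "mt": h, "ref": r} for s, h, r in zip(srcs, hyps, refs)]
# For a reference-free CometKiwi score, load "Unbabel/wmt22-cometkiwi-da"
# and omit the "ref" field.
print(model.predict(data, batch_size=32, gpus=1).system_score)
```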