bug fixing

ArmelRandy · Sep 7, 2023 · 793659c · 793659c
1 parent 5bed946
commit 793659c
Show file tree

Hide file tree

Showing 11 changed files with 113 additions and 5,355 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,11 @@
+# python cache
+__pycache__/
+
+# All jsonl files
+data/bi/code_llama_13b/*.jsonl
+
+# logs folder
+logs/
+
+# slurm files
+*.slurm
diff --git a/conversation.py b/conversation.py
diff --git a/data/bi/code_llama_13b/output.jsonl b/data/bi/code_llama_13b/output.jsonl
diff --git a/data/code_tasks.jsonl b/data/code_tasks.jsonl
diff --git a/data/output.jsonl b/data/output.jsonl
diff --git a/logs/self-instruct-418678.out b/logs/self-instruct-418678.out
diff --git a/main.py b/main.py
@@ -98,8 +98,8 @@
                 - args.num_prompt_synthetic_instructions,
                 replace=False,
             )
+            # sample machine-generated instructions
             if len(machine_instructions) >= args.num_prompt_synthetic_instructions:
-                # sample machine generated instructions
                 synthetic_indices = rng.choice(
                     a=np.arange(len(machine_instructions)),
                     size=args.num_prompt_synthetic_instructions,

diff --git a/processing.py b/processing.py
@@ -1,14 +1,9 @@
 import os
 import json
-import random
-import re
-import string
 import tqdm
-import argparse
 import numpy as np
 import pandas as pd
 from multiprocessing import Pool
-from functools import partial
 from rouge_score import rouge_scorer
 from sentence_transformers import SentenceTransformer, util
 
@@ -26,7 +21,7 @@
 similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 df = pd.DataFrame(columns=["rouge score", "sbert score"], data=np.zeros((10, 2)))
 if __name__ == "__main__":
-    # os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
     args = parse_args_for_post_processing()
     rng = np.random.default_rng(args.seed)
     set_seed(args.seed)
@@ -61,7 +56,7 @@
                 instruction_info = json.loads(line)
                 machine_instructions.append(instruction_info)
         print(f"Loaded {len(machine_instructions)} machine-generated instructions")
-    # check if there is already a file containing the scores
+    # If the output file exists and is not empty, read its content and set the start index accordingly.
     start = 0
     if os.path.exists(args.output_data_path):
         with open(args.output_data_path, "r") as fin:
@@ -132,8 +127,7 @@
                 end_of_prompt + len(anchor) :
             ].strip()
             if predicted_instruction.endswith("###"):
-                predicted_instruction = predicted_instruction[::-3]
-            print(f"PREDICTION\n{predicted_instruction}")
+                predicted_instruction = predicted_instruction[:-3]
             instructions.append({"instruction": predicted_instruction})
 
         process_duration = time.time() - process_start
@@ -142,7 +136,8 @@
             output_dictionary[instruction] = []
             L.append(i)
             if accelerator.is_main_process:
-                fout.write(json.dumps(output_dictionary) + "\n")
+                with open(args.output_data_path, "a") as fout:
+                    fout.write(json.dumps(output_dictionary) + "\n")
             continue
         for candidate in instructions:
             rouge_score = rouge_scorer._score_lcs(

diff --git a/run.slurm b/run.slurm
@@ -4,7 +4,7 @@
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --exclusive
 #SBATCH --cpus-per-task=48
-#SBATCH --gres=gpu:1
+#SBATCH --gres=gpu:8
 #SBATCH --partition=production-cluster
 #SBATCH --output=/fsx/armel/Self-instruct/logs/%x-%j.out
 #SBATCH --mem-per-cpu=11G
@@ -18,7 +18,7 @@ conda activate finetune
 echo "START TIME: $(date)"
 
 # Training setup
-GPUS_PER_NODE=1
+GPUS_PER_NODE=8
 # so processes know who to talk to
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
@@ -37,11 +37,11 @@ LOG_PATH=$PATH_TO_LOG/test_log.txt
 CMD=" \
     main.py \
     --seed_tasks_path="data/code_tasks.jsonl" \
-    --output_data_path data/output.jsonl \
+    --output_data_path data/bi/code_llama_13b/output.jsonl \
     --num_instructions_to_generate 20000 \
     --template_name better \
     --format 2 \
-    --model_name_or_path="bigcode/starcoderbase-1b" \
+    --model_name_or_path="codellama/CodeLlama-13b-hf" \
     --num_prompt_instructions 8 \
     --request_batch_size 8 \
     --num_prompt_synthetic_instructions 2 \
@@ -53,8 +53,8 @@ CMD=" \
     --threshold 0.7 \
     --seed 42 \
     --keep_programming \
-    --use_tgi \
 "
+#    --use_tgi \
 
 #CMD=" \
 #    processing.py \

diff --git a/template.py b/template.py
@@ -14,11 +14,10 @@ class Template:
     Output :
     {output}
     """
-
     instruction_token: str = "Instruction"
     input_token: str = "Input"
     output_token: str = "Output"
-
+    
     def get_triprompt(self, example, prefix="") -> str:
         """
         takes as input a dictionary, i.e. a seed example with an instruction, its input and its output.

diff --git a/tgi.py b/tgi.py
@@ -28,7 +28,8 @@ def run_eval(inputs):
             "do_sample": True,
             "top_p": top_p,
             "temperature": temperature,
-            "best_of" : 1
+            "best_of" : 1,
+            #"stop" :
         },
     }
     headers = {