Skip to content

Commit

Permalink
bug fixing
Browse files Browse the repository at this point in the history
  • Loading branch information
ArmelRandy committed Sep 7, 2023
1 parent 5bed946 commit 793659c
Show file tree
Hide file tree
Showing 11 changed files with 113 additions and 5,355 deletions.
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# python cache
__pycache__/

# All jsonl files
data/bi/code_llama_13b/*.jsonl

# logs folder
logs/

# slurm files
*.slurm
71 changes: 0 additions & 71 deletions conversation.py

This file was deleted.

45 changes: 45 additions & 0 deletions data/bi/code_llama_13b/output.jsonl

Large diffs are not rendered by default.

86 changes: 43 additions & 43 deletions data/code_tasks.jsonl

Large diffs are not rendered by default.

1,019 changes: 0 additions & 1,019 deletions data/output.jsonl

This file was deleted.

4,203 changes: 0 additions & 4,203 deletions logs/self-instruct-418678.out

This file was deleted.

2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@
- args.num_prompt_synthetic_instructions,
replace=False,
)
# sample machine generated instructions
if len(machine_instructions) >= args.num_prompt_synthetic_instructions:
# sample machine generated instructions
synthetic_indices = rng.choice(
a=np.arange(len(machine_instructions)),
size=args.num_prompt_synthetic_instructions,
Expand Down
15 changes: 5 additions & 10 deletions processing.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
import os
import json
import random
import re
import string
import tqdm
import argparse
import numpy as np
import pandas as pd
from multiprocessing import Pool
from functools import partial
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

Expand All @@ -26,7 +21,7 @@
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
df = pd.DataFrame(columns=["rouge score", "sbert score"], data=np.zeros((10, 2)))
if __name__ == "__main__":
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
args = parse_args_for_post_processing()
rng = np.random.default_rng(args.seed)
set_seed(args.seed)
Expand Down Expand Up @@ -61,7 +56,7 @@
instruction_info = json.loads(line)
machine_instructions.append(instruction_info)
print(f"Loaded {len(machine_instructions)} machine-generated instructions")
# check if there is already a file containing the scores
# if the output file already exists and is not empty, read its content and set the starting index accordingly
start = 0
if os.path.exists(args.output_data_path):
with open(args.output_data_path, "r") as fin:
Expand Down Expand Up @@ -132,8 +127,7 @@
end_of_prompt + len(anchor) :
].strip()
if predicted_instruction.endswith("###"):
predicted_instruction = predicted_instruction[::-3]
print(f"PREDICTION\n{predicted_instruction}")
predicted_instruction = predicted_instruction[:-3]
instructions.append({"instruction": predicted_instruction})

process_duration = time.time() - process_start
Expand All @@ -142,7 +136,8 @@
output_dictionary[instruction] = []
L.append(i)
if accelerator.is_main_process:
fout.write(json.dumps(output_dictionary) + "\n")
with open(args.output_data_path, "a") as fout:
fout.write(json.dumps(output_dictionary) + "\n")
continue
for candidate in instructions:
rouge_score = rouge_scorer._score_lcs(
Expand Down
10 changes: 5 additions & 5 deletions run.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --exclusive
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:1
#SBATCH --gres=gpu:8
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/armel/Self-instruct/logs/%x-%j.out
#SBATCH --mem-per-cpu=11G
Expand All @@ -18,7 +18,7 @@ conda activate finetune
echo "START TIME: $(date)"

# Training setup
GPUS_PER_NODE=1
GPUS_PER_NODE=8
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
Expand All @@ -37,11 +37,11 @@ LOG_PATH=$PATH_TO_LOG/test_log.txt
CMD=" \
main.py \
--seed_tasks_path="data/code_tasks.jsonl" \
--output_data_path data/output.jsonl \
--output_data_path data/bi/code_llama_13b/output.jsonl \
--num_instructions_to_generate 20000 \
--template_name better \
--format 2 \
--model_name_or_path="bigcode/starcoderbase-1b" \
--model_name_or_path="codellama/CodeLlama-13b-hf" \
--num_prompt_instructions 8 \
--request_batch_size 8 \
--num_prompt_synthetic_instructions 2 \
Expand All @@ -53,8 +53,8 @@ CMD=" \
--threshold 0.7 \
--seed 42 \
--keep_programming \
--use_tgi \
"
# --use_tgi \

#CMD=" \
# processing.py \
Expand Down
3 changes: 1 addition & 2 deletions template.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@ class Template:
Output :
{output}
"""

instruction_token: str = "Instruction"
input_token: str = "Input"
output_token: str = "Output"

def get_triprompt(self, example, prefix="") -> str:
"""
takes as input a dictionary, i.e. a seed example with an instruction, its input and its output.
Expand Down
3 changes: 2 additions & 1 deletion tgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def run_eval(inputs):
"do_sample": True,
"top_p": top_p,
"temperature": temperature,
"best_of" : 1
"best_of" : 1,
#"stop" :
},
}
headers = {
Expand Down

0 comments on commit 793659c

Please sign in to comment.