Commit d84ae46

Merge branch 'main' into easydata-poc
2 parents 6a42e86 + 2ef6927


46 files changed: +1585 −412 lines
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@

port: 8000
name: mpt-7b
route_prefix: /mpt-7b
precision: 'bf16'
cpus_per_worker: 24
gpus_per_worker: 0
deepspeed: false
workers_per_group: 2
ipex: false
device: "cpu"
model_description:
  model_id_or_path: mosaicml/mpt-7b
  tokenizer_name_or_path: EleutherAI/gpt-neox-20b
  chat_processor: ChatModelGptJ
  peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model
  peft_type: deltatuner
  prompt:
    intro: 'Below is an instruction that describes a task, paired with an input that
      provides further context. Write a response that appropriately completes the request.

      '
    human_id: '

      ### Instruction'
    bot_id: '

      ### Response'
    stop_words: []
  config:
    trust_remote_code: true

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@

port: 8000
name: mpt-7b
route_prefix: /mpt-7b
precision: 'bf16'
cpus_per_worker: 24
gpus_per_worker: 0
deepspeed: true
workers_per_group: 2
ipex: false
device: "cpu"
model_description:
  model_id_or_path: mosaicml/mpt-7b
  tokenizer_name_or_path: EleutherAI/gpt-neox-20b
  chat_processor: ChatModelGptJ
  peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model
  peft_type: deltatuner
  prompt:
    intro: 'Below is an instruction that describes a task, paired with an input that
      provides further context. Write a response that appropriately completes the request.

      '
    human_id: '

      ### Instruction'
    bot_id: '

      ### Response'
    stop_words: []
  config:
    trust_remote_code: true
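
These two new files appear to be the MPT-7B Deltatuner serving configs referenced by the updated inference workflow below, differing only in the `deepspeed` flag. Their exact paths are hidden in this view, so the `.github/workflows/config/mpt_deltatuner*.yaml` paths used here are assumptions taken from that workflow. A minimal sketch of how the CI consumes them:

```bash
# Assumed config paths, matching the commands added to workflow_inference.yml below.
docker exec "inference" bash -c \
  "KEEP_SERVE_TERMINAL='false' python inference/run_model_serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml"
docker exec "inference" bash -c \
  "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/mpt-7b"
# The DeepSpeed-enabled variant is served the same way with mpt_deltatuner_deepspeed.yaml.
```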

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@

import argparse


def update_finetune_config(base_model):
    conf_file = "finetune/finetune.conf"
    with open(conf_file) as f:
        config = eval(f.read())
    # due to compute node can't connect network
    # base models are downloaded as local files in directory ~/models/
    # avaiable base models are:
    #
    # Mistral-7B-v0.1
    # Llama-2-7b
    # pythia-1.4b
    # pythia-2.8b
    # pythia-70m
    # gpt-j-6b
    # pythia-160m
    # pythia-410m
    # pythia-12b
    # pythia-1b
    # pythia-6.9b

    config["General"]["base_model"] = base_model
    # config["General"]["base_model"] = "pythia-70m"
    config["Training"]["device"] = "GPU"
    config["Training"]["resources_per_worker"]["CPU"] = 1
    config["Training"]["resources_per_worker"]["GPU"] = 1
    config["Training"]["accelerate_mode"] = "GPU_DDP"

    with open(conf_file, "w") as f:
        f.write(str(config))


def get_parser():
    parser = argparse.ArgumentParser(description="Finetuning on Intel GPU")
    parser.add_argument("--base_model", type=str, required=True, default=None)
    return parser


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    update_finetune_config(args.base_model)
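
A hedged usage sketch of this helper (the file's path is hidden in this view, so `update_finetune_config.py` is a placeholder name; the base model must be one of the locally downloaded models listed in the comments above):

```bash
# Placeholder script name; rewrites finetune/finetune.conf in place for an Intel GPU run.
python update_finetune_config.py --base_model Llama-2-7b
```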

.github/workflows/workflow_finetune.yml

Lines changed: 13 additions & 2 deletions
@@ -45,7 +45,7 @@ jobs:

      - name: Run Finetune Test
        run: |
-         docker exec "finetune" bash -c "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
+         docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
          CMD=$(cat << EOF
          conf_path = "finetune/finetune.conf"
          with open(conf_path, encoding="utf-8") as reader:
@@ -64,7 +64,11 @@ jobs:
          else:
              result['General']["config"]["use_auth_token"] = None
          result['Training']['epochs'] = 1
-         result['Training']['num_training_workers'] = 1
+         if "${{ matrix.model }}" == "gpt2":
+             # to verify oneccl
+             result['Training']['num_training_workers'] = 2
+         else:
+             result['Training']['num_training_workers'] = 1
          result['General']['lora_config'] = None
          with open(conf_path, 'w') as output:
              print(result, file=output)
@@ -123,6 +127,13 @@ jobs:
            docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
          fi

+     - name: Stop Ray
+       run: |
+         cid=$(docker ps -q --filter "name=finetune")
+         if [[ ! -z "$cid" ]]; then
+           docker exec "finetune" bash -c "ray stop"
+         fi
+
      - name: Stop Container
        if: success() || failure()
        run: |
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@

name: Finetune on Intel GPU

on:
  workflow_call:

jobs:
  finetune:
    name: finetune on gpu test
    strategy:
      matrix:
        model: [ pythia-6.9b, gpt-j-6b ]
    runs-on: self-hosted
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Running task on Intel GPU
        run: |
          rm ~/borealis-runner/llm-ray.tar.gz -f
          tar zcf ~/borealis-runner/llm-ray.tar.gz -C ~/actions-runner/_work/llm-ray .
          cd ~/borealis-runner/
          python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"

      - name: Test Summary
        run: echo "to be continued"

.github/workflows/workflow_inference.yml

Lines changed: 14 additions & 7 deletions
@@ -12,10 +12,10 @@ jobs:
    name: inference test
    strategy:
      matrix:
-       model: [ gpt-j-6B, gpt2, bloom, opt, mpt ]
+       model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b ]
        include:
-         - dtuner_model: /root/.cache/huggingface/hub/mpt-7b-deltatuner-model
-           model: mpt
+         - dtuner_model: nathan0/mpt-7b-deltatuner-model
+           model: mpt-7b
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -43,13 +43,13 @@ jobs:
      - name: Run Inference Test with Deltatuner
        if: ${{ matrix.dtuner_model }}
        run: |
-         docker exec "inference" bash -c "KEEP_SERVE_TERMINAL='false' MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/run_model_serve.py --deltatuner_model ${{ matrix.dtuner_model }}"
+         docker exec "inference" bash -c "KEEP_SERVE_TERMINAL='false' python inference/run_model_serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml"
          docker exec "inference" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
          docker exec "inference" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"

      - name: Run Inference Test with DeepSpeed
        run: |
-         if [[ ${{ matrix.model }} =~ ^(gpt2|mpt)$ ]]; then
+         if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b)$ ]]; then
            echo ${{ matrix.model }} is not supported!
          else
            docker exec "inference" bash -c "KEEP_SERVE_TERMINAL='false' MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/run_model_serve.py --deepspeed"
@@ -60,14 +60,21 @@ jobs:
      - name: Run Inference Test with DeepSpeed and Deltatuner
        if: ${{ matrix.dtuner_model }}
        run: |
-         if [[ ${{ matrix.model }} =~ ^(gpt2|mpt)$ ]]; then
+         if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b)$ ]]; then
            echo ${{ matrix.model }} is not supported!
          else
-           docker exec "inference" bash -c "KEEP_SERVE_TERMINAL='false' MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/run_model_serve.py --deepspeed --deltatuner_model ${{ matrix.dtuner_model }}"
+           docker exec "inference" bash -c "KEEP_SERVE_TERMINAL='false' python inference/run_model_serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml"
            docker exec "inference" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
            docker exec "inference" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
          fi

+     - name: Stop Ray
+       run: |
+         cid=$(docker ps -q --filter "name=inference")
+         if [[ ! -z "$cid" ]]; then
+           docker exec "inference" bash -c "ray stop"
+         fi
+
      - name: Stop Container
        if: success() || failure()
        run: |

.github/workflows/workflow_orders_nightly.yml

Lines changed: 4 additions & 1 deletion
@@ -2,7 +2,7 @@ name: llm-ray inference & finetune

on:
  schedule:
-   - cron: "* 21 * * *"
+   - cron: "0 21 * * *"

jobs:

@@ -13,3 +13,6 @@ jobs:
    uses: ./.github/workflows/workflow_finetune.yml
    with:
      ci_type: nightly
+
+  call-finetune-on-intel-gpu:
+    uses: ./.github/workflows/workflow_finetune_gpu.yml
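
For context on the schedule fix (standard crontab semantics): the old expression fired every minute of the 21:00 UTC hour, while the corrected one fires once per night.

```bash
# "* 21 * * *"  -> every minute from 21:00 to 21:59 UTC (60 runs per night)
# "0 21 * * *"  -> once per day at 21:00 UTC
```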

README.md

Lines changed: 20 additions & 12 deletions
@@ -45,7 +45,9 @@ Workflow has been tested on Linux-4.18.0-408.el8.x86_64 and Ubuntu 22.04
  ```bash
  git clone https://github.com/intel-sandbox/llm-ray.git
  cd llm-ray
- pip install -r ./requirements.txt -f https://developer.intel.com/ipex-whl-stable-cpu -f https://download.pytorch.org/whl/torch_stable.html
+ pip install .[cpu] -f https://developer.intel.com/ipex-whl-stable-cpu -f https://download.pytorch.org/whl/torch_stable.html
+ # Dynamic link oneCCL and Intel MPI libraries
+ source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/setvars.sh
  ```

  #### 2. Launch ray cluster
@@ -64,18 +66,23 @@ If deploying a ray cluster on multiple nodes, please download the workflow repos

  #### 1. Prepare Dataset

- Now, the workflow only supports datasets in the specified format
-
- The format of dataset similar to [databricks/databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k). This type of data is used for finetuning in prompt mode and this type of data is characterized by containing `instruction` `context` and `response` fields where `instruction` and `response` are required fields and `context` is an optional field. In the data preprocessing stage, the three fields will be concatenated to the corresponding format according to [dolly](https://github.com/databrickslabs/dolly/blob/master/training/trainer.py#LL93).
+ The workflow only supports datasets with JSONL (JSON Lines) format, where each line is a separate JSON object. Here’s the structure each line should follow:

+ ``` json
+ {"instruction":"<User Input>", "context":"<Additional Information>", "response":"<Expected Output>"}
+ ```

- The meaning of the above three columns:
- + Instruction Column: The column in the dataset is the user input, such as a question or a command.
- + Context Column: This column is other information used by instruction, such as the options used in the question and so on. It can be empty.
- + Response: The column in the dataset containing the expected output.
+ - Instruction: This is the user's input, such as a question, command, or prompt for content generation.
+ - Context: Supplementary information that aids the instruction. This can include previous conversation parts, background details, or specificities influencing the response. It's optional and can be left empty.
+ - Response: The model's expected output in response to the 'instruction', considering the 'context' if provided.

+ ##### Examples:
+ ``` json
+ {"instruction":"Which is a species of fish? Tope or Rope", "context":"", "response":"Tope"}
+ {"instruction":"What is the average lifespan of a Golden Retriever?","context":"Golden Retrievers are a generally healthy breed; they have an average lifespan of 12 to 13 years. Irresponsible breeding to meet high demand has led to the prevalence of inherited health problems in some breed lines, including allergic skin conditions, eye problems and sometimes snappiness. These problems are rarely encountered in dogs bred from responsible breeders.","response":"The average lifespan of a Golden Retriever is 12 to 13 years."}
+ ```

- Therefore, if the your data meets the above two formats, you can use the data by configuring the local data path or huggingface dataset. If not, please refer to the following **Adopt to Your Dataset**.
+ An example dataset can be accessed at `examples/data/sample_finetune_data.jsonl`. Ensure each line in your dataset follows the above format.

  #### 2. Finetune

@@ -147,15 +154,16 @@ A specific model can be deployed by specifying the model path and tokenizer path
  # If you dont' want to view serve logs, you can set env var, "KEEP_SERVE_TERMINAL" to false

  # Run model serve with specified model and tokenizer
- python inference/run_model_serve.py --model $model --tokenizer $tokenizer --streaming_response
+ python inference/run_model_serve.py --model $model --tokenizer $tokenizer

  # INFO - Deployment 'custom-model_PredictDeployment' is ready at `http://127.0.0.1:8000/custom-model`. component=serve deployment=custom-model_PredictDeployment
  # Service is deployed successfully

  # Verfiy the inference on deployed model
- python inference/run_model_infer.py --model_endpoint http://127.0.0.1:8000/custom-model
+ python inference/run_model_infer.py --model_endpoint http://127.0.0.1:8000/custom-model --streaming_response
  ```
- Otherwise, all the models configured in `inference/config.py` will be deployed by default. If you want to choose a specific model to deploy, you can set env var, "MODEL_TO_SERVE", to your choice. You can add customized models in it as needed.
+ Otherwise, all the models placed under `inference/models` folder will be deployed by default. If you want to choose a specific model to deploy, you can set env var, "MODEL_TO_SERVE", to your choice. You can also specify your model by either `--model` or `--config_file`.
+ For `--config_file`, you can copy one of them from `inference/models` and make necessary changes.

  Llm-ray also supports serving with deepspeed. Please follow the [guide](inference/deepspeed/README.md) under inference/deepspeed folder.
common/trainer/default_trainer.py

Lines changed: 10 additions & 8 deletions
@@ -49,7 +49,7 @@ def recovery(self, config):

        # update lr_scheduler status
        if Path.exists(checkpoint_dir / "lr_scheduler.pt") and hasattr(self, "lr_scheduler"):
-           scheduler_state = torch.load(checkpoint_dir / "lr_schduler.pt", map_location="cpu")
+           scheduler_state = torch.load(checkpoint_dir / "lr_scheduler.pt", map_location="cpu")
            self.lr_scheduler.load_state_dict(scheduler_state)

        # update current epoch
@@ -111,12 +111,14 @@ def prepare(self, model, tokenizer, dataset, optimizer, accelerator):
            lr_scheduler = None

        model.train()
-       self.model, self.optimizer, self.lr_scheduler = accelerator.prepare(
-           model, optimizer, lr_scheduler
-       )

-       self.train_dataloader, self.eval_dataloader = accelerator.prepare(
-           train_dataloader, eval_dataloader,
+       # self.model, self.optimizer, self.lr_scheduler, ..., are prepared with 2 steps
+       # because it is recommended way to prepare model and optimizer while using FSDP.
+       # https://huggingface.co/docs/accelerate/usage_guides/fsdp#a-few-caveats-to-be-aware-of
+       self.model = accelerator.prepare(model)
+
+       self.optimizer, self.train_dataloader, self.eval_dataloader, self.lr_scheduler = accelerator.prepare(
+           optimizer, train_dataloader, eval_dataloader, lr_scheduler
        )

        checkpoint = self.config.get("checkpoint")
@@ -144,7 +146,7 @@ def train(self):
                    self.lr_scheduler.step()
                    self.optimizer.zero_grad()
                if step % log_step == 0:
-                   logger.info(f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss}\tppl:{math.exp(loss)}\ttime:{time.time()-start}")
+                   logger.info(f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{math.exp(loss):.6f}\ttime:{time.time()-start:.6f}")
                    report({"train_epoch": idx, "total_epochs": num_train_epochs, "train_step": step, "total_steps": min(max_train_step, total_steps) if max_train_step else total_steps})
                    start = time.time()
                if max_train_step is not None:
@@ -207,7 +209,7 @@ def save(self, config, epoch = 0):
            torch.save(self.optimizer.state_dict(), os.path.join(tmpdir, "optim.pt"))
            torch.save({"epoch": epoch}, os.path.join(tmpdir, "epoch.pt"))
            if self.lr_scheduler:
-               torch.save(self.lr_scheduler.state_dict(), os.path.join(tmpdir, "lr_schduler.pt"))
+               torch.save(self.lr_scheduler.state_dict(), os.path.join(tmpdir, "lr_scheduler.pt"))
            checkpoint = Checkpoint.from_directory(tmpdir)
            checkpoint.to_directory(local_checkpoint_path)
            logger.info(f"save checkpoint to {local_checkpoint_path} finished")

dev/docker/Dockerfile

Lines changed: 2 additions & 5 deletions
@@ -23,13 +23,10 @@ RUN conda init bash && \
      conda config --add channels intel && \
      conda install python==3.9

- COPY ./requirements.txt /tmp
- RUN pip install -r /tmp/requirements.txt -f https://developer.intel.com/ipex-whl-stable-cpu \
+ COPY ./ .
+ RUN pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \
      -f https://download.pytorch.org/whl/torch_stable.html

- # For serving with DeepSpeed
- COPY ./inference/deepspeed/requirements.cpu.txt /tmp
- RUN pip install -r /tmp/requirements.cpu.txt
  RUN ds_report

  # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
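
A hedged build sketch for this Dockerfile (the image tag is illustrative; the build context is assumed to be the repository root because of the `COPY ./ .` and `pip install -e .[cpu,deepspeed]` steps):

```bash
# CACHEBUST invalidates the cached layers, as noted in the Dockerfile comment above.
docker build . -f dev/docker/Dockerfile -t llm-ray:latest --build-arg CACHEBUST=$(date +%s)
```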
