
Commit 52b5b2d

Merge pull request #2021 from FedML-AI/alexleung/dev_branch_online

Alexleung/dev branch online

2 parents 4ea1c7c + 46cbab2

128 files changed: +13,024 −1,317 lines
Lines changed: 26 additions & 0 deletions (new file)

```yaml
workspace: "./src"

inference_image_name: "raphaeljin/fedml"
enable_custom_image: true

bootstrap: |
  echo "Bootstrap start..."
  pwd
  ls -l
  echo "Check shell script"
  cat fedml-deploy-bootstrap-entry-auto-gen.sh
  echo "Check main script"
  cat serve_main.py
  echo "Bootstrap finished"

## Simulate a successful deployment
#job: |
#  python3 serve_main.py

# Then during update, simulate a failed deployment
job: |
  echo "Simulate a failed deployment"
  exit 1

auto_detect_public_ip: true
use_gpu: true
```
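Since the `job` section above exits with a non-zero code on purpose, this config is useful for testing how the platform reports a failed deployment. As a quick sanity check, the file can be parsed before packaging; a minimal sketch, assuming the content above is saved as `config.yaml` and that PyYAML is installed (PyYAML is an illustration choice, not something this commit uses):

```python
# Hedged sketch: parse the deploy config above and confirm the multi-line
# sections survive as literal block scalars. Assumes `pip install pyyaml`.
import yaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

# "bootstrap" and "job" were written with `|`, so they parse as multi-line
# shell-command strings rather than nested YAML.
print(config["inference_image_name"])  # raphaeljin/fedml
print(config["job"])                   # echo "Simulate a failed deployment"\nexit 1
print(config["use_gpu"])               # True
```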
Lines changed: 32 additions & 0 deletions (new file)

```python
from fedml.serving import FedMLPredictor
from fedml.serving import FedMLInferenceRunner
import uuid
import torch

# Calculate the number of elements
num_elements = 1_073_741_824 // 4  # using integer division for whole elements


class DummyPredictor(FedMLPredictor):
    def __init__(self):
        super().__init__()
        # Create a tensor with this many elements
        tensor = torch.empty(num_elements, dtype=torch.float32)

        # Move the tensor to GPU
        tensor_gpu = tensor.cuda()

        # for debug
        with open("/tmp/dummy_gpu_occupier.txt", "w") as f:
            f.write("GPU is occupied")

        self.worker_id = uuid.uuid4()

    def predict(self, request):
        return {f"AlohaV0From{self.worker_id}": request}


if __name__ == "__main__":
    predictor = DummyPredictor()
    fedml_inference_runner = FedMLInferenceRunner(predictor)
    fedml_inference_runner.run()
```
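The arithmetic in `num_elements` works out to 1,073,741,824 bytes / 4 bytes per `float32` = 268,435,456 elements, so the tensor created in `__init__` deliberately pins about 1 GiB of GPU memory for the lifetime of the worker. Once the runner is serving, a client could exercise `predict` roughly as follows; this is a hedged sketch, and the endpoint address is a hypothetical placeholder, since the real URL is assigned at deployment time:

```python
# Hedged sketch of calling the deployed predictor. The address below is a
# hypothetical placeholder, not an endpoint defined by this commit.
import requests

ENDPOINT = "http://127.0.0.1:2345/predict"  # hypothetical host/port/path

resp = requests.post(ENDPOINT, json={"text": "hello"})
# Per predict() above, the reply should look like:
# {"AlohaV0From<worker-uuid>": {"text": "hello"}}
print(resp.json())
```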
Lines changed: 9 additions & 0 deletions (new file)

```yaml
workspace: "./src"
entry_point: "serve_main.py"
bootstrap: |
  echo "Bootstrap start..."
  sleep 5
  echo "Bootstrap finished"

auto_detect_public_ip: true
use_gpu: true
```
Lines changed: 32 additions & 0 deletions (new file)

```python
from fedml.serving import FedMLPredictor
from fedml.serving import FedMLInferenceRunner
import uuid
import torch

# Calculate the number of elements
num_elements = 1_073_741_824 // 4  # using integer division for whole elements


class DummyPredictor(FedMLPredictor):
    def __init__(self):
        super().__init__()
        # Create a tensor with this many elements
        tensor = torch.empty(num_elements, dtype=torch.float32)

        # Move the tensor to GPU
        tensor_gpu = tensor.cuda()

        # for debug
        with open("/tmp/dummy_gpu_occupier.txt", "w") as f:
            f.write("GPU is occupied")

        self.worker_id = uuid.uuid4()

    def predict(self, request):
        return {f"AlohaV0From{self.worker_id}": request}


if __name__ == "__main__":
    predictor = DummyPredictor()
    fedml_inference_runner = FedMLInferenceRunner(predictor)
    fedml_inference_runner.run()
```

python/examples/deploy/dummy_job/config.yaml

Lines changed: 2 additions & 2 deletions

```diff
@@ -2,5 +2,5 @@ workspace: "./src"
 entry_point: "serve_main.py"
 bootstrap: |
   echo "Bootstrap start..."
-  sleep 15
-  echo "Bootstrap finished"
+  sleep 5
+  echo "Bootstrap finished"
```

python/examples/deploy/dummy_job/config/bootstrap.sh

Lines changed: 0 additions & 12 deletions
This file was deleted.
Lines changed: 10 additions & 0 deletions (new file)

```shell
### don't modify this part ###
set -x
##############################

pip install -r requirements.txt
echo "Bootstrap finished."

### don't modify this part ###
exit 0
##############################
```
File renamed without changes.

python/examples/launch/train_build_package/train_job.yaml

Lines changed: 3 additions & 6 deletions

```diff
@@ -1,7 +1,7 @@
 # Local directory where your source code resides.
 # It should be the relative path to this job yaml file or the absolute path.
 # If your job doesn't contain any source code, it can be empty.
-workspace: .
+workspace: "./src"
 
 # Running entry commands which will be executed as the job entry point.
 # If an error occurs, you should exit with a non-zero code, e.g. exit 1.
@@ -14,14 +14,11 @@ job_type: train # options: train, deploy, federate
 
 # Bootstrap shell commands which will be executed before running entry commands.
 # Support multiple lines, which can be empty.
-bootstrap: |
-  echo "Bootstrap finished."
+bootstrap: bash bootstrap.sh
 
 computing:
   minimum_num_gpus: 1 # minimum # of GPUs to provision
   maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card
-  #allow_cross_cloud_resources: true # true, false
-  #device_type: CPU # options: GPU, CPU, hybrid
   resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
 
 data_args:
@@ -36,4 +33,4 @@ model_args:
   output_dim: '10'
 
 training_params:
-  learning_rate: 0.004
+  learning_rate: 0.004
```
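With `bootstrap` now delegating to `bash bootstrap.sh`, the job is launched the same way as before. The README diff below installs the CLI with `pip3 install fedml` and runs `fedml launch job.yaml`; the same flow can be scripted, as in this minimal sketch (it assumes the `fedml` CLI is on `PATH` and the working directory contains `train_job.yaml`):

```python
# Hedged sketch: drive the FEDML Launch CLI from Python instead of the shell.
# Assumes `pip3 install fedml` has been run so the `fedml` command exists.
import subprocess

# Equivalent to running `fedml launch train_job.yaml` in this directory;
# check=True raises CalledProcessError if the launch command fails.
subprocess.run(["fedml", "launch", "train_job.yaml"], check=True)
```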

python/examples/train/llm_train/README.md

Lines changed: 52 additions & 41 deletions

````diff
@@ -2,12 +2,12 @@
 <img src="assets/fedml_logo_light_mode.png" width="400px" alt="FedML logo">
 </div>
 
-# LLM Fine-tune
+# LLM Training
 
 This repo contains an MLOps-supported training pipeline to help users build their own large language model (LLM) on proprietary/private
 data.
 This repo aims to provide a minimalist example of efficient LLM training/fine-tuning
-and to illustrate how to use FedML Launch and fine-tuning.
+and to illustrate how to use FEDML Launch.
 We leverage Pythia 7B by default and recently added support for Llama 2.
 
 The repo contains:
@@ -18,41 +18,16 @@ The repo contains:
 - Supports [DeepSpeed](https://www.deepspeed.ai/).
 - Dataset implementation with [datasets](https://huggingface.co/docs/datasets/index).
 
-## How to Use Llama 2
-
-Our example uses Pythia by default, but we recently added support for Llama2.
-If you'd like to use Llama2, please see the following instructions before getting started.
-
-To use [Llama 2](https://ai.meta.com/llama/), you need to apply access from Meta and request Meta's private
-Hugging Face repo access.
-
-1. Make sure your `transformers` version is `4.31.0` or newer. You could update your transformers via
-   `pip install --upgrade transformers`.
-2. Please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and apply for
-   access.
-3. Apply for [Meta's private repo](https://huggingface.co/meta-llama/Llama-2-7b-hf)
-   on [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf). See below image for detail.
-   ![Meta's private repo on Hugging Face](assets/Llama/huggingface_llama_repo.png)
-4. Once both access are granted, you can start using Llama by passing `--model_name "meta-llama/Llama-2-7b-hf"` to the training script.
-
-> **Warning**
-> Since Llama 2 is on a private Hugging Face repo, you need to either login to Hugging Face or provide your access token.
-> - To login to huggingface (see https://huggingface.co/settings/tokens for detail), run `huggingface-cli login` in
->   command line.
-> - To pass an access token, you need to do one of the following:
->   - Set environment variable `HUGGING_FACE_HUB_TOKEN="<your access token>"`
->   - For centralized/conventional training, pass `--auth_token "<your access token>"` in the command line.
-
 ## Getting Started
 
 Clone the repo then go to the project directory:
 
 ```shell
 # clone the repo
-git clone https://github.com/FedML-AI/llm-finetune.git
+git clone https://github.com/FedML-AI/FedML.git
 
 # go to the project directory
-cd llm-finetune
+cd python/examples/train/llm_train
 ```
 
 Install dependencies with the following command:
@@ -63,7 +38,7 @@ pip install -r requirements.txt
 
 See [Dependencies](#dependencies) for more information on the dependency versions.
 
-### Conventional/Centralized Training
+### Training
 
 The [`run_train.py`](run_train.py) contains a minimal example for conventional/centralized LLM training and fine-tuning
 on [`databricks-dolly-15k`](https://huggingface.co/datasets/FedML/databricks-dolly-15k-niid) dataset.
@@ -84,6 +59,9 @@ bash scripts/train_deepspeed.sh \
 ... # additional arguments
 ```
 
+> **Note**
+> You can use `bash scripts/train.sh -h` to list all the supported CLI options.
+
 > **Note**
 > If you have an Ampere or newer GPU (e.g., RTX 3000 series or newer), you could turn on **bf16** to have more
 > efficient training by passing `--bf16 "True"` in the command line.
@@ -92,20 +70,53 @@ bash scripts/train_deepspeed.sh \
 > when using PyTorch DDP with LoRA and gradient checkpointing, you need to turn off `find_unused_parameters`
 > by passing `--ddp_find_unused_parameters "False"` in the command line.
 
+### Train with FEDML Launch
+
+If you have trouble finding computing resources, you can launch your training job via [FEDML Launch](https://doc.fedml.ai/launch) and let FEDML find the most cost-effective resources for your task.
+
+```shell
+# install fedml library
+pip3 install fedml
+
+# launch your training job
+fedml launch job.yaml
+```
+
+You can modify the training command in [job.yaml](job.yaml) by
+- specifying training settings in the `job` section
+- specifying environment setup settings in the `bootstrap` section
+- specifying compute resources in the `computing` section
+
+## How to Use Llama 2
+
+Our example uses Pythia by default, but we recently added support for Llama 2.
+If you'd like to use Llama 2, please see the following instructions before getting started.
+
+To use [Llama 2](https://ai.meta.com/llama/), you need to apply for access from Meta and request access to Meta's private
+Hugging Face repo.
+
+1. Make sure your `transformers` version is `4.31.0` or newer. You can update transformers via
+   `pip install --upgrade transformers`.
+2. Please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and apply for
+   access.
+3. Apply for [Meta's private repo](https://huggingface.co/meta-llama/Llama-2-7b-hf)
+   on [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf). See the image below for detail.
+   ![Meta's private repo on Hugging Face](assets/Llama/huggingface_llama_repo.png)
+4. Once both accesses are granted, you can start using Llama by passing `--model_name "meta-llama/Llama-2-7b-hf"` to the training script.
+
+> **Warning**
+> Since Llama 2 is in a private Hugging Face repo, you need to either log in to Hugging Face or provide your access token.
+> - To log in to Hugging Face (see https://huggingface.co/settings/tokens for detail), run `huggingface-cli login` in the
+>   command line.
+> - To pass an access token, you need to do one of the following:
+>   - Set the environment variable `HUGGING_FACE_HUB_TOKEN="<your access token>"`
+>   - For centralized/conventional training, pass `--auth_token "<your access token>"` in the command line.
+
 ### Dependencies
 
 We have tested our implementation with the following setup:
 
 - Ubuntu `20.04.5 LTS` and `22.04.2 LTS`
 - CUDA `12.2`, `11.8`, `11.7` and `11.6`
-- Python `3.8.13` and `3.9.16`
-- `fedml>=0.8.4a7`
-- `torch>=2.0.0,<=2.1.0`
-- `torchvision>=0.15.1,<=0.16.0`
-- `transformers>=4.31.0,<=4.34.0`
-- `peft>=0.4.0,<=0.5.0`
-- `datasets>=2.11.0,<=2.14.5`
-- `deepspeed>=0.9.1,<=0.10.3`
-- `numpy>=1.24.3,<=1.24.4`
-- `tensorboard>=2.12.2,<=2.13.0`
-- `mpi4py>=3.1.4,<=3.1.5`
+- Python `3.8.13`, `3.9.16` and `3.10.13`
````
