diff --git a/.gitignore b/.gitignore index 0164367eb1..535642f52d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ venv/* *.jpg *.crt *.key -.eggs \ No newline at end of file +.eggs +eggs +*.pyi \ No newline at end of file diff --git a/openfl-tutorials/experimental/LLM_Horovod.MD b/openfl-tutorials/experimental/LLM_Horovod.MD new file mode 100644 index 0000000000..4d60bfdf54 --- /dev/null +++ b/openfl-tutorials/experimental/LLM_Horovod.MD @@ -0,0 +1,40 @@ +This README provides instructions for setting up and running the Horovod example using OpenFL. + +## Prerequisites +Before running the Horovod example, ensure that the following prerequisites are met: + +1. A Python environment is set up on all nodes. +2. That environment is sourced automatically when logging in over SSH. + +## Setting up Horovod Dependencies +To set up the Horovod dependencies, follow these steps: + +1. Run the `setup_env.sh` script located at `openfl-tutorials/experimental/setup_env.sh` within your virtual environment (venv). +2. Run the `setup_horovod_fed.sh` script to create the aggregator and collaborator workspaces. +3. Ensure that the collaborator workspace is present on each node with the same file structure. +4. Make sure the dataset is available on each node. + +## Setting up Passwordless SSH Login +Horovod requires passwordless SSH login. Follow the instructions provided at [this link](http://www.linuxproblem.org/art_9.html) to set it up. + +## Environment Variables +Set the following environment variables for Horovod: + +- `OPENFL_HOROVOD_DEMO_NP`: Set this variable to the number of processes to run (e.g., "4"). +- `OPENFL_HOROVOD_DEMO_NICS`: Set this variable to a comma-separated list of the network interface names common to all nodes (e.g., "en01,lo"). +- `OPENFL_HOROVOD_DEMO_LOCALHOSTIP`: Set this variable to the IP address of the local node (e.g., "ip1"). +- `OPENFL_HOROVOD_DEMO_HOSTS`: Set this variable to a comma-separated list of `ip:slots` pairs giving the IP address of each node and its number of slots (e.g., "ip1:2,ip2:2"). 
+ +## Customizing Data and Models +To use your own data and models, follow these steps: + +1. Copy the `openfl/openfl-workspace/torch_llm_horovod` directory to `openfl/openfl-workspace/name_of_your_template`. +2. In the `src/InHorovodrun.py` file, make the following changes: + - Replace `GlueMrpcDataLoader` with your own data loader. + - Replace `LLMTrainer` with your own class containing your training/validation code. + +## Running the Experiment +To run the experiment, follow the instructions provided in the [OpenFL documentation](https://openfl.readthedocs.io/en/latest/running_the_federation.html#bare-metal-approach) using either the `torch_llm_horovod` template or your own template. + +That's it! You're now ready to use the Horovod example with your own data and models. Enjoy! + diff --git a/openfl-workspace/torch_llm_horovod/src/InHorovodLLMTaskRunner.py b/openfl-workspace/torch_llm_horovod/src/InHorovodLLMTrainer.py similarity index 99% rename from openfl-workspace/torch_llm_horovod/src/InHorovodLLMTaskRunner.py rename to openfl-workspace/torch_llm_horovod/src/InHorovodLLMTrainer.py index aaf8580581..7baebb113d 100644 --- a/openfl-workspace/torch_llm_horovod/src/InHorovodLLMTaskRunner.py +++ b/openfl-workspace/torch_llm_horovod/src/InHorovodLLMTrainer.py @@ -21,7 +21,7 @@ import torch.nn as nn -class InHorovodLLMTaskRunner(nn.Module): +class LLMTrainer(nn.Module): def __init__( self, data_loader, diff --git a/openfl-workspace/torch_llm_horovod/src/InHorovodrun.py b/openfl-workspace/torch_llm_horovod/src/InHorovodrun.py index f354255199..0a396f6b30 100644 --- a/openfl-workspace/torch_llm_horovod/src/InHorovodrun.py +++ b/openfl-workspace/torch_llm_horovod/src/InHorovodrun.py @@ -6,8 +6,8 @@ import horovod.torch as hvd SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.dirname(SCRIPT_DIR)) -from src.ptglue_inmemory import InHorovodGlueMrpcFederatedDataLoader -from src.InHorovodLLMTaskRunner import InHorovodLLMTaskRunner +from src.ptglue_inmemory import 
GlueMrpcDataLoader +from src.InHorovodLLMTrainer import LLMTrainer import json from logging import getLogger import traceback @@ -54,11 +54,11 @@ def main(): logger.info('getting arguments') args = get_args() logger.info('loading data') - data_loader = InHorovodGlueMrpcFederatedDataLoader( + data_loader = GlueMrpcDataLoader( data_path=args.data_path, batch_size=args.batch_size ) logger.info('get taskrunner') - taskrunner = InHorovodLLMTaskRunner(data_loader) + taskrunner = LLMTrainer(data_loader) func = getattr(taskrunner, args.func) kwargs = json.loads(args.kwargs) kwargs.update( diff --git a/openfl-workspace/torch_llm_horovod/src/ptglue_inmemory.py b/openfl-workspace/torch_llm_horovod/src/ptglue_inmemory.py index 73e8f30f97..7d202079c8 100644 --- a/openfl-workspace/torch_llm_horovod/src/ptglue_inmemory.py +++ b/openfl-workspace/torch_llm_horovod/src/ptglue_inmemory.py @@ -93,7 +93,7 @@ def get_valid_data_size(self): return len(self.valid_set) -class InHorovodGlueMrpcFederatedDataLoader(GlueMrpcFederatedDataLoader): +class GlueMrpcDataLoader(GlueMrpcFederatedDataLoader): def __init__(self, data_path, batch_size, **kwargs): logger.info('get dataset') train_set, valid_set, data_collator = get_dataset()