name changes, added readme

securefederatedai · Jan 8, 2024 · 208b69a · 208b69a
1 parent ac17821
commit 208b69a
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 7 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,4 +13,6 @@ venv/*
 *.jpg
 *.crt
 *.key
-.eggs
+.eggs
+eggs
+*.pyi
diff --git a/openfl-tutorials/experimental/LLM_Horovod.MD b/openfl-tutorials/experimental/LLM_Horovod.MD
@@ -0,0 +1,40 @@
+This readme provides instructions for setting up and running the Horovod example using OpenFL.
+
+## Prerequisites
+Before running the Horovod example, ensure that the following prerequisites are met:
+
+1. Python environment should be set up on all nodes.
+2. The Python environment should be activated automatically on SSH login on every node (for example, by sourcing it from the shell startup file).
+
+## Setting up Horovod Dependencies
+To set up the Horovod dependencies, follow these steps:
+
+1. Run the `setup_env.sh` script (located at `openfl-tutorials/experimental/setup_env.sh`) from within your virtual environment (venv).
+2. Run the `setup_horovod_fed.sh` script to create aggregator and collaborator workspaces.
+3. Ensure that the collaborator workspace is present on each node with the same file structure.
+4. Make sure the dataset is available on each node.
+
+## Setting up Passwordless SSH Login
+Horovod requires passwordless SSH login. Follow the instructions provided at [this link](http://www.linuxproblem.org/art_9.html) to set it up.
+
+## Environment Variables
+Set the following environment variables for Horovod:
+
+- `OPENFL_HOROVOD_DEMO_NP`: Set this variable to the number of processes to run (e.g., "4").
+- `OPENFL_HOROVOD_DEMO_NICS`: Set this variable to a comma-separated list of the network interface names that are common to all nodes (e.g., "en01,lo").
+- `OPENFL_HOROVOD_DEMO_LOCALHOSTIP`: Set this variable to the IP address of the local node (e.g., "ip1").
+- `OPENFL_HOROVOD_DEMO_HOSTS`: Set this variable to a comma-separated list of `ip:slots` pairs, giving the IP address of each node and the number of slots (processes) to run on it (e.g., "ip1:2,ip2:2").
+
+## Customizing Data and Models
+To use your own data and models, follow these steps:
+
+1. Copy the `openfl/openfl-workspace/torch_llm_horovod` directory to `openfl/openfl-workspace/name_of_your_template`.
+2. In the `src/InHorovodrun.py` file, make the following changes:
+   - Replace `GlueMrpcDataLoader` with your own dataloader.
+   - Replace `LLMTrainer` with your own training/validation scripts.
+
+## Running the Experiment
+To run the experiment, follow the instructions provided in the [OpenFL documentation](https://openfl.readthedocs.io/en/latest/running_the_federation.html#bare-metal-approach) using either the `torch_llm_horovod` template or your own template.
+
+That's it! You're now ready to use the Horovod example with your own data and models. Enjoy!
+
diff --git a/...llm_horovod/src/InHorovodLLMTaskRunner.py → ...ch_llm_horovod/src/InHorovodLLMTrainer.py b/...llm_horovod/src/InHorovodLLMTaskRunner.py → ...ch_llm_horovod/src/InHorovodLLMTrainer.py
@@ -21,7 +21,7 @@
 import torch.nn as nn
 
 
-class InHorovodLLMTaskRunner(nn.Module):
+class LLMTrainer(nn.Module):
     def __init__(
         self,
         data_loader,

diff --git a/openfl-workspace/torch_llm_horovod/src/InHorovodrun.py b/openfl-workspace/torch_llm_horovod/src/InHorovodrun.py
@@ -6,8 +6,8 @@
 import horovod.torch as hvd
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.dirname(SCRIPT_DIR))
-from src.ptglue_inmemory import InHorovodGlueMrpcFederatedDataLoader
-from src.InHorovodLLMTaskRunner import InHorovodLLMTaskRunner
+from src.ptglue_inmemory import GlueMrpcDataLoader
+from src.InHorovodLLMTrainer import LLMTrainer
 import json
 from logging import getLogger
 import traceback
@@ -54,11 +54,11 @@ def main():
         logger.info('getting arguments')
         args = get_args()
         logger.info('loading data')
-        data_loader = InHorovodGlueMrpcFederatedDataLoader(
+        data_loader = GlueMrpcDataLoader(
             data_path=args.data_path, batch_size=args.batch_size
         )
         logger.info('get taskrunner')
-        taskrunner = InHorovodLLMTaskRunner(data_loader)
+        taskrunner = LLMTrainer(data_loader)
         func = getattr(taskrunner, args.func)
         kwargs = json.loads(args.kwargs)
         kwargs.update(

diff --git a/openfl-workspace/torch_llm_horovod/src/ptglue_inmemory.py b/openfl-workspace/torch_llm_horovod/src/ptglue_inmemory.py
@@ -93,7 +93,7 @@ def get_valid_data_size(self):
         return len(self.valid_set)
 
 
-class InHorovodGlueMrpcFederatedDataLoader(GlueMrpcFederatedDataLoader):
+class GlueMrpcDataLoader(GlueMrpcFederatedDataLoader):
     def __init__(self, data_path, batch_size, **kwargs):
         logger.info('get dataset')
         train_set, valid_set, data_collator = get_dataset()
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,4 +13,6 @@ venv/* @@
     *.jpg
     *.crt
     *.key
-    .eggs
+    .eggs
+    eggs
+    *.pyi