diff --git a/README.md b/README.md
index 205dac7d4..1533f48ff 100644
--- a/README.md
+++ b/README.md
@@ -155,7 +155,7 @@ For more details on each entry point, see the [Training and Finetuning](#trainin
 
 GPT-NeoX parameters are defined in a YAML configuration file which is passed to the deepy.py launcher. We have provided some example .yaml files in [configs](./configs/), including one for GPT-NeoX-20B, and example configuration files for other model sizes.
 
-These files are generally complete, but non-optimal. For example, depending on your specific GPU configuration, you may need to change some settings such as `pipe-parallel-size`, `model-parallel-size` to increase or decrease the degree of parallelisation, `train_micro_batch_size_per_gpu` or `gradient-accumulation-steps` to modify batch size related settings, or the `zero_optimization` dict to modify how optimizer states are parallelised across workers.
+These files are generally complete, but non-optimal. For example, depending on your specific GPU configuration, you may need to change some settings such as `pipe-parallel-size`, `model-parallel-size` to increase or decrease the degree of parallelisation, `micro_batch_size_per_gpu` or `gradient-accumulation-steps` to modify batch size related settings, or the `zero_optimization` dict to modify how optimizer states are parallelised across workers.
 
 For a more detailed guide to all the features available and how to configure them, see [the configuration README](configs/README.md), and for documentation of every possible argument, see [configs/neox_arguments.md](configs/neox_arguments.md).
 
diff --git a/configs/13B.yml b/configs/13B.yml
index b2f1e1368..371b18221 100644
--- a/configs/13B.yml
+++ b/configs/13B.yml
@@ -40,7 +40,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/175B.yml b/configs/175B.yml
index baaad3c82..0e8037ea6 100644
--- a/configs/175B.yml
+++ b/configs/175B.yml
@@ -40,7 +40,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/2-7B.yml b/configs/2-7B.yml
index b795c310c..7e52f169d 100644
--- a/configs/2-7B.yml
+++ b/configs/2-7B.yml
@@ -41,7 +41,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/20B.yml b/configs/20B.yml
index 7b0d5e481..1c4e7b13b 100644
--- a/configs/20B.yml
+++ b/configs/20B.yml
@@ -58,7 +58,7 @@
    },
 
    # batch / data settings (assuming 96 GPUs)
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 32,
    "data-impl": "mmap",
    "split": "995,4,1",
diff --git a/configs/6-7B.yml b/configs/6-7B.yml
index 777848781..9f6865443 100644
--- a/configs/6-7B.yml
+++ b/configs/6-7B.yml
@@ -41,7 +41,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/README.md b/configs/README.md
index 046f6d50b..f0a6018a2 100644
--- a/configs/README.md
+++ b/configs/README.md
@@ -50,7 +50,7 @@ For a detailed list of all the arguments available for neox, see [neox_arguments
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "data-impl": "mmap",
    "split": "949,50,1",
@@ -189,12 +189,12 @@ N.B - ZeRO stages 2+ are incompatible with pipeline parallelism. Please set `"pi
 ```yaml
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
 ```
 
-Our global batch size configuration follows deepspeed's and can be configured in a number of ways. At least any one of `"train_batch_size"` and `"train_micro_batch_size_per_gpu"`.
+Our global batch size configuration follows DeepSpeed's and can be configured in a number of ways. At least one of `"train_batch_size"` and `"micro_batch_size_per_gpu"` must be specified.
 
 - `"train_batch_size"`: The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs.
-- `"train_micro_batch_size_per_gpu""`: Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, `gradient_accumulation_steps` is automatically calculated using train_batch_size and number of GPUs.
+- `"micro_batch_size_per_gpu"`: Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, `gradient_accumulation_steps` is automatically calculated using train_batch_size and number of GPUs.
 - `"gradient_accumulation_steps"`: Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs.
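+
+As a rough worked example (the numbers below are illustrative, not taken from any shipped config): on 16 GPUs with no pipeline or model parallelism, all 16 GPUs are data-parallel, so `"micro_batch_size_per_gpu": 4` together with `"gradient_accumulation_steps": 8` gives an effective `train_batch_size` of 4 × 8 × 16 = 512.
+
+```yaml
+   # batch / data settings (illustrative values only)
+   "micro_batch_size_per_gpu": 4,       # samples per GPU per forward/backward pass
+   "gradient_accumulation_steps": 8,    # micro-batches accumulated per optimizer step
+   # with 16 data-parallel GPUs, train_batch_size is derived as 4 * 8 * 16 = 512
+```
+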
 ### Dataset / Tokenizer / Checkpoint / Logging Settings:
diff --git a/configs/XL.yml b/configs/XL.yml
index 16aa48d02..a4a438444 100644
--- a/configs/XL.yml
+++ b/configs/XL.yml
@@ -40,7 +40,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/bnb_small.yml b/configs/bnb_small.yml
index 5d7e8dcd3..db87702ad 100644
--- a/configs/bnb_small.yml
+++ b/configs/bnb_small.yml
@@ -42,7 +42,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/gmlp_small.yml b/configs/gmlp_small.yml
index 6724b371a..6d0a11f38 100644
--- a/configs/gmlp_small.yml
+++ b/configs/gmlp_small.yml
@@ -29,7 +29,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/large.yml b/configs/large.yml
index b03348d49..683774e8c 100644
--- a/configs/large.yml
+++ b/configs/large.yml
@@ -41,7 +41,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/medium.yml b/configs/medium.yml
index 0e7ca304b..c554af53b 100644
--- a/configs/medium.yml
+++ b/configs/medium.yml
@@ -41,7 +41,7 @@
     "cpu_offload": False
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 2afaa29cb..235e58eb4 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -1386,7 +1386,7 @@ Args for deepspeed config
 
 
 
-- **train_micro_batch_size_per_gpu**: int
+- **micro_batch_size_per_gpu**: int
 
     Default = None
 
diff --git a/configs/small.yml b/configs/small.yml
index 746743ff1..b15585819 100644
--- a/configs/small.yml
+++ b/configs/small.yml
@@ -41,7 +41,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/configs/small_bf16.yml b/configs/small_bf16.yml
index 5aa81be16..61d1d43f0 100644
--- a/configs/small_bf16.yml
+++ b/configs/small_bf16.yml
@@ -41,7 +41,7 @@
    },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "micro_batch_size_per_gpu": 4,
    "data-impl": "mmap",
    "split": "949,50,1",
 
diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
index cf13efe4b..cb9ff4af4 100644
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
@@ -57,7 +57,7 @@ def do_forward_pass(neox_args, model, inference=False):
     # get context tokens
     # always forward full batch size
     context_tokens_tensor = (
-        torch.arange(2049).repeat((neox_args.train_micro_batch_size_per_gpu, 1)).cuda()
+        torch.arange(2049).repeat((neox_args.micro_batch_size_per_gpu, 1)).cuda()
     )
 
     # forward
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 3295c58e5..be50c891b 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -661,30 +661,30 @@ def calculate_derived(self):
             logging.error(error_message)
             raise AssertionError(error_message)
 
-        # Automatically derive train_batch_size = train_micro_batch_size_per_gpu*global_num_gpus*gradient_accumulation_steps
+        # Automatically derive train_batch_size = micro_batch_size_per_gpu*global_num_gpus*gradient_accumulation_steps
         (
             train_batch_size,
-            train_micro_batch_size_per_gpu,
+            micro_batch_size_per_gpu,
             gradient_accumulation_steps,
         ) = self.calculate_batch_parameters(
             dp_world_size=dp_world_size,
             train_batch=self.train_batch_size,
-            micro_batch=self.train_micro_batch_size_per_gpu,
+            micro_batch=self.micro_batch_size_per_gpu,
             grad_acc=self.gradient_accumulation_steps,
         )
         self.check_batch_parameters(
             dp_world_size=dp_world_size,
             train_batch=train_batch_size,
-            micro_batch=train_micro_batch_size_per_gpu,
+            micro_batch=micro_batch_size_per_gpu,
             grad_acc=gradient_accumulation_steps,
         )
         self.update_values(
             {
                 # batch size params
                 "train_batch_size": train_batch_size,
-                "train_micro_batch_size_per_gpu": train_micro_batch_size_per_gpu,
+                "micro_batch_size_per_gpu": micro_batch_size_per_gpu,
                 "gradient_accumulation_steps": gradient_accumulation_steps,
-                "batch_size": train_micro_batch_size_per_gpu,
+                "batch_size": micro_batch_size_per_gpu,
                 # duplicate items
                 "gas": self.gradient_accumulation_steps,
                 "clip_grad": self.gradient_clipping,
diff --git a/megatron/neox_arguments/deepspeed_args.py b/megatron/neox_arguments/deepspeed_args.py
index 9287725f3..b6ce84757 100644
--- a/megatron/neox_arguments/deepspeed_args.py
+++ b/megatron/neox_arguments/deepspeed_args.py
@@ -22,7 +22,7 @@ class NeoXArgsDeepspeedConfig(NeoXArgsTemplate):
     The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs.
     """
 
-    train_micro_batch_size_per_gpu: int = None
+    micro_batch_size_per_gpu: int = None
     """
     Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, gradient_accumulation_steps is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with gradient_accumulation_steps in the configuration JSON.
     """
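The renamed field keeps the same relationship to the other batch settings: `train_batch_size = micro_batch_size_per_gpu * gradient_accumulation_steps * data-parallel world size`, with whichever value is missing derived from the other two. A minimal standalone sketch of that arithmetic (the helper name and argument handling below are illustrative, not the repo's `calculate_batch_parameters`):

```python
# Minimal sketch of the batch-parameter derivation described above; purely
# illustrative, mirrors only the arithmetic, not the NeoX implementation.
def derive_batch_parameters(dp_world_size, train_batch=None, micro_batch=None, grad_acc=None):
    if train_batch is None:
        # global batch = per-GPU micro batch * accumulation steps * data-parallel replicas
        train_batch = micro_batch * (grad_acc or 1) * dp_world_size
    elif micro_batch is None:
        grad_acc = grad_acc or 1
        micro_batch = train_batch // (grad_acc * dp_world_size)
    elif grad_acc is None:
        grad_acc = train_batch // (micro_batch * dp_world_size)
    grad_acc = grad_acc or 1
    assert train_batch == micro_batch * grad_acc * dp_world_size, "inconsistent batch settings"
    return train_batch, micro_batch, grad_acc


if __name__ == "__main__":
    # e.g. 4 samples per GPU, 8 accumulation steps, 16 data-parallel ranks -> 512
    print(derive_batch_parameters(dp_world_size=16, micro_batch=4, grad_acc=8))
```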
""" diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 059a30dc2..d2914e4f4 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -270,7 +270,7 @@ def stream_tokens( last_token_index_to_generate = min( neox_args.seq_length - 1, # never generate more than the model's sequence length - token_index_to_generate + maximum_tokens - 1, + token_generation_start_index.max().item() + maximum_tokens - 1, ) with torch.no_grad(): @@ -351,6 +351,10 @@ def stream_tokens( token_generation_start_index <= token_index_to_generate ) # check which batch items have been started + state_started = state_started & ( + token_generation_start_index + maximum_tokens > token_index_to_generate + ) # check which batch items have been ended + # switch out padding tokens for generated tokens context_tokens[:, token_index_to_generate] = switch( context_tokens[:, token_index_to_generate].view(-1), @@ -439,29 +443,32 @@ def generate_samples_from_prompt( start_time = time.time() # Tokenize text, and check whether we should terminate process + batch_size = min(neox_args.micro_batch_size_per_gpu, input_count - input_pos) terminate_runs = 0 if input_pos == input_count: terminate_runs = 1 else: - raw_text = text[input_pos] - input_pos += 1 - - if raw_text == "": - context_tokens = [eos_token_id] - else: - context_tokens = neox_args.tokenizer.tokenize(raw_text) - context_length = len(context_tokens) - - if context_length >= (neox_args.seq_length // 2): - print_rank_0( - "\nWarning! Context length", - context_length, - "\nPlease give smaller context (e.g. half of the " - "max sequence length)!", - ) + context_tokens_list = [] + for pos in range(input_pos, input_pos + batch_size): + raw_text = text[pos] + if raw_text == "": + context_tokens = [eos_token_id] + else: + context_tokens = neox_args.tokenizer.tokenize(raw_text) + context_length = len(context_tokens) + if context_length >= (neox_args.seq_length // 2): + print_rank_0( + "\nWarning! Context length", + context_length, + "\nPlease give smaller context (e.g. 
half of the " + "max sequence length)!", + ) + context_tokens_list.append(context_tokens) + input_pos += batch_size if not is_mp_rank_0(): - context_tokens = neox_args.tokenizer.tokenize("EMPTY TEXT") - context_length = len(context_tokens) + context_tokens_list = [ + neox_args.tokenizer.tokenize("EMPTY TEXT") for _ in range(batch_size) + ] terminate_runs = 0 terminate_runs = broadcast_terminate_signal(terminate_runs) @@ -476,7 +483,7 @@ def generate_samples_from_prompt( ) in stream_tokens( neox_args=neox_args, model=model, - context_tokens=[context_tokens], + context_tokens=context_tokens_list, eos_token_id=eos_token_id, maximum_tokens=maximum_tokens, recompute=recompute, @@ -496,12 +503,12 @@ def generate_samples_from_prompt( ) batch_is_done = is_done.cpu().numpy().tolist() - for tokens, start_index, end_index, is_done in zip( - batch_context_tokens, - batch_token_generation_start_index, - batch_token_generation_end_index, - batch_is_done, - ): + for i in range(batch_size): + tokens = batch_context_tokens[i] + start_index = batch_token_generation_start_index[i] + end_index = batch_token_generation_end_index[i] + is_done = batch_is_done[i] + raw_text = text[input_pos - batch_size + i] if end_index >= start_index: generated_tokens = tokens[start_index : end_index + 1] diff --git a/tests/test_configs/test_train_base.yml b/tests/test_configs/test_train_base.yml index bc82cc400..8d5cb514b 100644 --- a/tests/test_configs/test_train_base.yml +++ b/tests/test_configs/test_train_base.yml @@ -41,7 +41,7 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 4, + "micro_batch_size_per_gpu": 4, "data_impl": "mmap", "split": "949,50,1",