Skip to content

Commit

Permalink
Add start with eval option (#84)
Browse files Browse the repository at this point in the history
* Add start with eval option

* Ping for training run

* Drop p3.6 from CI

* Turn off telemetry on CI

* Bump up to v0.0.18
  • Loading branch information
erogol authored Dec 7, 2022
1 parent 0a23eba commit 18ce42e
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 9 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
Expand All @@ -31,6 +31,9 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Telemetry off
  # "export" in a run step only affects that step's shell; env vars do not
  # carry over to later steps. Write to $GITHUB_ENV so the install/test
  # steps below actually see TRAINER_TELEMETRY=0.
  run: echo "TRAINER_TELEMETRY=0" >> "$GITHUB_ENV"
- name: Install dependencies
run: |
sudo apt-get update
Expand Down
2 changes: 1 addition & 1 deletion trainer/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
v0.0.17
v0.0.18
12 changes: 12 additions & 0 deletions trainer/analytics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import os

import requests

# Opt-out flag, read once at import time: set TRAINER_TELEMETRY=0 in the
# environment *before* importing the trainer to disable the ping.
telemetry = os.environ.get("TRAINER_TELEMETRY")


def ping_training_run():
    """Send an anonymous usage ping for a training run.

    Fire-and-forget GET against the Coqui telemetry endpoint so basic usage
    stats can be collected. Honors the ``TRAINER_TELEMETRY=0`` opt-out and
    never raises or blocks: telemetry must not be able to crash or stall a
    training run (e.g. on networks where this URL is firewalled).
    """
    if telemetry == "0":
        return
    URL = "https://coqui.gateway.scarf.sh/trainer/training_run"
    try:
        # Bounded timeout: requests.get() without one can hang indefinitely
        # when the endpoint is blocked or unreachable.
        _ = requests.get(URL, timeout=5)
    except Exception:  # pylint: disable=broad-except
        # Best-effort ping — any network failure is deliberately ignored.
        pass
2 changes: 1 addition & 1 deletion trainer/callbacks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, Callable
from typing import Callable, Dict


class TrainerCallback:
Expand Down
12 changes: 10 additions & 2 deletions trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data import DataLoader

from trainer.analytics import ping_training_run
from trainer.callbacks import TrainerCallback
from trainer.generic_utils import (
KeepAverage,
Expand Down Expand Up @@ -241,6 +242,10 @@ class TrainerArgs(Coqpit):
default=False,
metadata={"help": "Skip training and only run evaluation and test."},
)
start_with_eval: bool = field(
default=False,
metadata={"help": "Start with evaluation and test."},
)
small_run: int = field(
default=None,
metadata={
Expand Down Expand Up @@ -388,6 +393,7 @@ def __init__( # pylint: disable=dangerous-default-value
self.grad_accum_steps = args.grad_accum_steps
self.overfit_batch = args.overfit_batch
self.skip_train_epoch = args.skip_train_epoch
self.start_with_eval = args.start_with_eval

assert self.grad_accum_steps > 0, " [!] grad_accum_steps must be greater than 0."

Expand Down Expand Up @@ -519,6 +525,7 @@ def __init__( # pylint: disable=dangerous-default-value
self.callbacks.on_init_end(self)
self.dashboard_logger.add_config(config)
self.save_training_script()
ping_training_run()

def save_training_script(self):
"""Save the training script to tracking dashboard and output path."""
Expand Down Expand Up @@ -1519,7 +1526,7 @@ def _fit(self) -> None:
self.keep_avg_eval = KeepAverage() if self.config.run_eval else None
self.epochs_done = epoch
self.c_logger.print_epoch_start(epoch, self.config.epochs, self.output_path)
if not self.skip_train_epoch:
if not self.skip_train_epoch and not self.start_with_eval:
self.train_epoch()
if self.config.run_eval:
self.eval_epoch()
Expand All @@ -1532,6 +1539,7 @@ def _fit(self) -> None:
if self.args.rank in [None, 0]:
self.save_best_model()
self.callbacks.on_epoch_end(self)
self.start_with_eval = False

def fit_with_largest_batch_size(self, starting_batch_size=2048) -> None:
cuda_meminfo()
Expand All @@ -1552,7 +1560,7 @@ def fit_with_largest_batch_size(self, starting_batch_size=2048) -> None:
torch.cuda.empty_cache()
else:
raise
except Exception as exception: #pylint: disable=broad-except
except Exception as exception: # pylint: disable=broad-except
# catches the torch.cuda.OutOfMemoryError
if bs > 1 and should_reduce_batch_size(exception):
bs //= 2
Expand Down
9 changes: 5 additions & 4 deletions trainer/utils/cpu_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ def get_available_cpu_memory():
available_memory = psutil.virtual_memory().available

try:
import resource # pylint: disable=import-outside-toplevel
_, hard_mem_limit = resource.getrlimit(resource.RLIMIT_AS) #pylint: disable=unused-variable
import resource # pylint: disable=import-outside-toplevel

_, hard_mem_limit = resource.getrlimit(resource.RLIMIT_AS) # pylint: disable=unused-variable
if hard_mem_limit != resource.RLIM_INFINITY:
used_memory = this_process.memory_info().vms
available_memory = min(hard_mem_limit - used_memory, available_memory)
Expand All @@ -21,9 +22,9 @@ def get_available_cpu_memory():

def set_cpu_memory_limit(num_gigabytes):
try:
import resource # pylint: disable=import-outside-toplevel
import resource # pylint: disable=import-outside-toplevel

num_bytes = int(num_gigabytes * 2 ** 30)
num_bytes = int(num_gigabytes * 2**30)
_, hard_limit = resource.getrlimit(resource.RLIMIT_AS)
if hard_limit != resource.RLIM_INFINITY:
hard_limit = min(num_bytes, hard_limit)
Expand Down

0 comments on commit 18ce42e

Please sign in to comment.