Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

System monitoring #215

Merged
merged 9 commits into from
Dec 8, 2023
2 changes: 1 addition & 1 deletion config/augmentations_train/osl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ defaults:
- random_horizontal_flip
- /augmentations/color:
- /augmentations/postprocessing:
- to_tensor
- to_tensor_torchvision
- normalize
# task:
# - one-shot-learning
2 changes: 1 addition & 1 deletion config/augmentations_val/osl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ defaults:
- random_horizontal_flip
- /augmentations/color:
- /augmentations/postprocessing:
- to_tensor
- to_tensor_torchvision
- normalize
# task:
# - one-shot-learning
2 changes: 1 addition & 1 deletion config/datasets/detection/detection_lungs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ description: Набор данных содержит РГ снимки легк
markup_info: Набор данных содержит разметку bounding box легких и патологий.
date_time: 18.07.2022

_target_: innofw.core.datamodules.lightning_datamodules.detection.YOLOv5DataModule
_target_: innofw.core.integrations.ultralytics.datamodule.UltralyticsDataModuleAdapter

train:
source: https://api.blackhole.ai.innopolis.university/public-datasets/lungs_detection/train.zip
Expand Down
24 changes: 9 additions & 15 deletions config/experiments/classification/KG_090323_wpng92_mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,19 @@
defaults:
- override /models: classification/resnet.yaml
- override /datasets: classification/classification_mnist.yaml
- override /augmentations_train: none
- override /augmentations_train: none #classification.yaml
- override /augmentations_val: none
- override /augmentations_test: none
- override /losses: log_loss.yaml
- override /optimizers: lion
# - override /callbacks: classification.yaml

- override /optimizers: adam
- override /schedulers:

project: "mnist_classification"
task: "image-classification"
random_seed: 42
original_work_dir: ${hydra:runtime.cwd}
weights_freq: 1
batch_size: 32
stop_param: 1
epochs: 10

#wandb:
# enable: True
# project: mnist
# group: 160323_just_testing_wandb
# job_type: training
batch_size: 128
epochs: 5
accelerator: "gpu"
devices: 2
num_workers: 1
weights_freq: 7
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,7 @@ project: "faces"
task: "one-shot-learning"
random_seed: 42
weights_freq: 1
epochs: 20
epochs: 50
accelerator: "gpu"
devices: 1
batch_size: 64
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# @package _global_
# Experiment config composed from the unet_smp model, the segmentation_arable
# dataset, the adam optimizer and the segmentation_losses loss group.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether `accelerator` belongs under `trainers` or is top-level; confirm
# against the file in the repository.
defaults:
- override /models: semantic-segmentation/unet_smp.yaml
- override /datasets: semantic-segmentation/segmentation_arable.yaml
- override /optimizers: adam.yaml
- override /losses: segmentation_losses

trainers:
overfit_batches: 20 # fit only 20 val batches and 40 train batches

accelerator: gpu

# NOTE(review): presumably the number of input image bands; verify against
# the dataset.
models:
in_channels: 4


project: "segmentation"
task: "image-segmentation"
random_seed: 42
weights_freq: 1
batch_size: 10
39 changes: 39 additions & 0 deletions examples/measure_anomaly_detection.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Launches train.py with the anomaly-detection (time-series LSTM) experiment in
# the background and samples CPU, RAM and GPU usage once per second for as long
# as the job keeps nohup.out open. Afterwards runs
# examples/measurements_compaction.py.
#
# Usage: sudo -E env "PATH=$PATH" bash examples/measure_anomaly_detection.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Files written in the working directory: nohup.out, cpu_log, mem_log,
# nvidiasmi_log.
#
# `set -e` is intentionally not enabled: grep exits non-zero whenever a sample
# contains no matching line, which must not abort the measurement.

GPU_BUS_ID="${GPU_BUS_ID:-00000000:25:00.0}"

# Install iotop only when it is missing instead of hitting apt on every run.
if ! command -v iotop > /dev/null; then
  sudo apt-get install -y iotop > /dev/null
fi

# train.py is addressed relative to the parent of examples/, so step up when
# the script is started from inside examples/.
if [[ "${PWD##*/}" == "examples" ]]; then
  cd .. || exit 1
fi

# Reset the logs left over from a previous run.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Baseline number of lsof lines for nohup.out before the job starts; the job
# counts as running while the current number stays above this baseline.
INITIAL_PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$INITIAL_PROCESSES"

nohup time sudo -E env "PATH=$PATH" python train.py experiments=anomaly-detection/IM_040822_rbe23lls_anomaly_detection_timeseries_lstm.yaml epochs=100 optimizers=adam accelerator=gpu &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$PROCESSES"
while (( PROCESSES > INITIAL_PROCESSES )); do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  # Filtering by bus id keeps one row per sample and drops the CSV header.
  sudo nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv \
    | grep -F -- "$GPU_BUS_ID" >> nvidiasmi_log
  sleep 1
  PROCESSES=$(sudo lsof nohup.out | wc -l)
done

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py

# The job has normally exited by now; signal it only if it is still alive so
# that an unrelated process which reused the PID is never killed.
if sudo kill -0 "$PID" 2> /dev/null; then
  sudo kill -9 "$PID"
fi
42 changes: 42 additions & 0 deletions examples/measure_complexing.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# Launches the band_composer.py preprocessing step in the background and
# samples CPU, RAM and GPU usage once per second for as long as the job keeps
# nohup.out open. Afterwards runs examples/measurements_compaction.py.
#
# Usage: sudo -E env "PATH=$PATH" bash examples/measure_complexing.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Files written in the working directory: nohup.out, cpu_log, mem_log,
# nvidiasmi_log.
#
# `set -e` is intentionally not enabled: grep exits non-zero whenever a sample
# contains no matching line, which must not abort the measurement.

GPU_BUS_ID="${GPU_BUS_ID:-00000000:25:00.0}"

# Install iotop only when it is missing instead of hitting apt on every run.
if ! command -v iotop > /dev/null; then
  sudo apt-get install -y iotop > /dev/null
fi

# The script paths below are relative to the parent of examples/, so step up
# when the script is started from inside examples/.
if [[ "${PWD##*/}" == "examples" ]]; then
  cd .. || exit 1
fi

# Reset the logs left over from a previous run.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Baseline number of lsof lines for nohup.out before the job starts; the job
# counts as running while the current number stays above this baseline.
INITIAL_PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$INITIAL_PROCESSES"

# Each continuation has an explicit space before the backslash so the
# arguments can never be glued onto the preceding word.
nohup time sudo -E env "PATH=$PATH" python innofw/utils/data_utils/preprocessing/band_composer.py \
  --src_type sentinel2 \
  --src_path tests/data/images/other/satellite_cropped/sentinel2/one \
  --channels "[\"RED\", \"GRN\", \"BLU\", \"NIR\"]" &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$PROCESSES"
while (( PROCESSES > INITIAL_PROCESSES )); do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  # Filtering by bus id keeps one row per sample and drops the CSV header.
  sudo nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv \
    | grep -F -- "$GPU_BUS_ID" >> nvidiasmi_log
  sleep 1
  PROCESSES=$(sudo lsof nohup.out | wc -l)
done

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py

# The job has normally exited by now; signal it only if it is still alive so
# that an unrelated process which reused the PID is never killed.
if sudo kill -0 "$PID" 2> /dev/null; then
  sudo kill -9 "$PID"
fi
32 changes: 32 additions & 0 deletions examples/measure_detection_yolov5.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Launches train.py with the YOLOv5 detection experiment in the background and
# samples CPU, RAM and GPU usage once per second until the launched job exits.
#
# Usage: sudo -E env "PATH=$PATH" bash examples/measure_detection_yolov5.sh
#
# Files written in the working directory: nohup.out, cpu_log, mem_log,
# nvidiasmi_log.
#
# `set -e` is intentionally not enabled: grep exits non-zero whenever a sample
# contains no matching line, which must not abort the measurement.

# Install iotop only when it is missing instead of hitting apt on every run.
if ! command -v iotop > /dev/null; then
  sudo apt-get install -y iotop > /dev/null
fi

# train.py is addressed relative to the parent of examples/, so step up when
# the script is started from inside examples/.
if [[ "${PWD##*/}" == "examples" ]]; then
  cd .. || exit 1
fi

# Reset the logs left over from a previous run.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

nohup time sudo -E env "PATH=$PATH" python train.py experiments=detection/KA_120722_8adfcdaa_yolov5.yaml epochs=100 optimizers=adam accelerator=gpu &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log+mem_log and nvidiasmi_log"
echo "$PID"

# Stop the background job when the sampler is interrupted or exits early.
# NOTE(review): only the recorded PID is signalled; processes it spawned may
# need to be stopped separately.
cleanup() {
  if kill -0 "$PID" 2> /dev/null; then
    kill "$PID"
  fi
}
trap cleanup EXIT
trap 'exit 130' INT TERM

# Sample while the launched job is alive. The previous `while true` never
# ended, so the sampler kept running after training had finished and the
# final kill was unreachable.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv >> nvidiasmi_log
  sleep 1
done
39 changes: 39 additions & 0 deletions examples/measure_image_classification.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Launches train.py with the MNIST classification experiment in the background
# and samples CPU, RAM and GPU usage once per second for as long as the job
# keeps nohup.out open. Afterwards runs examples/measurements_compaction.py.
#
# Usage: sudo -E env "PATH=$PATH" bash examples/measure_image_classification.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Files written in the working directory: nohup.out, cpu_log, mem_log,
# nvidiasmi_log.
#
# `set -e` is intentionally not enabled: grep exits non-zero whenever a sample
# contains no matching line, which must not abort the measurement.

GPU_BUS_ID="${GPU_BUS_ID:-00000000:25:00.0}"

# Install iotop only when it is missing instead of hitting apt on every run.
if ! command -v iotop > /dev/null; then
  sudo apt-get install -y iotop > /dev/null
fi

# train.py is addressed relative to the parent of examples/, so step up when
# the script is started from inside examples/.
if [[ "${PWD##*/}" == "examples" ]]; then
  cd .. || exit 1
fi

# Reset the logs left over from a previous run.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Baseline number of lsof lines for nohup.out before the job starts; the job
# counts as running while the current number stays above this baseline.
INITIAL_PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$INITIAL_PROCESSES"

nohup time sudo -E env "PATH=$PATH" python train.py experiments=classification/KG_090323_wpng92_mnist.yaml epochs=100 &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$PROCESSES"
while (( PROCESSES > INITIAL_PROCESSES )); do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  # Filtering by bus id keeps one row per sample and drops the CSV header.
  sudo nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv \
    | grep -F -- "$GPU_BUS_ID" >> nvidiasmi_log
  sleep 1
  PROCESSES=$(sudo lsof nohup.out | wc -l)
done

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py

# The job has normally exited by now; signal it only if it is still alive so
# that an unrelated process which reused the PID is never killed.
if sudo kill -0 "$PID" 2> /dev/null; then
  sudo kill -9 "$PID"
fi
39 changes: 39 additions & 0 deletions examples/measure_lin_regression.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Launches train.py with the linear-regression experiment in the background and
# samples CPU, RAM and GPU usage once per second for as long as the job keeps
# nohup.out open. Afterwards runs examples/measurements_compaction.py.
#
# Usage: sudo -E env "PATH=$PATH" bash examples/measure_lin_regression.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Files written in the working directory: nohup.out, cpu_log, mem_log,
# nvidiasmi_log.
#
# `set -e` is intentionally not enabled: grep exits non-zero whenever a sample
# contains no matching line, which must not abort the measurement.

GPU_BUS_ID="${GPU_BUS_ID:-00000000:25:00.0}"

# Install iotop only when it is missing instead of hitting apt on every run.
if ! command -v iotop > /dev/null; then
  sudo apt-get install -y iotop > /dev/null
fi

# train.py is addressed relative to the parent of examples/, so step up when
# the script is started from inside examples/.
if [[ "${PWD##*/}" == "examples" ]]; then
  cd .. || exit 1
fi

# Reset the logs left over from a previous run.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Baseline number of lsof lines for nohup.out before the job starts; the job
# counts as running while the current number stays above this baseline.
INITIAL_PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$INITIAL_PROCESSES"

nohup time sudo -E env "PATH=$PATH" python train.py experiments=regression/KA_130722_9f7134db_linear_regression.yaml &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$PROCESSES"
while (( PROCESSES > INITIAL_PROCESSES )); do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  # Filtering by bus id keeps one row per sample and drops the CSV header.
  sudo nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv \
    | grep -F -- "$GPU_BUS_ID" >> nvidiasmi_log
  sleep 1
  PROCESSES=$(sudo lsof nohup.out | wc -l)
done

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py

# The job has normally exited by now; signal it only if it is still alive so
# that an unrelated process which reused the PID is never killed.
if sudo kill -0 "$PID" 2> /dev/null; then
  sudo kill -9 "$PID"
fi
39 changes: 39 additions & 0 deletions examples/measure_oneshot_learning.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Launches train.py with the one-shot-learning experiment in the background and
# samples CPU, RAM and GPU usage once per second for as long as the job keeps
# nohup.out open. Afterwards runs examples/measurements_compaction.py.
#
# Usage: sudo -E env "PATH=$PATH" bash examples/measure_oneshot_learning.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Files written in the working directory: nohup.out, cpu_log, mem_log,
# nvidiasmi_log.
#
# `set -e` is intentionally not enabled: grep exits non-zero whenever a sample
# contains no matching line, which must not abort the measurement.

GPU_BUS_ID="${GPU_BUS_ID:-00000000:25:00.0}"

# Install iotop only when it is missing instead of hitting apt on every run.
if ! command -v iotop > /dev/null; then
  sudo apt-get install -y iotop > /dev/null
fi

# train.py is addressed relative to the parent of examples/, so step up when
# the script is started from inside examples/.
if [[ "${PWD##*/}" == "examples" ]]; then
  cd .. || exit 1
fi

# Reset the logs left over from a previous run.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Baseline number of lsof lines for nohup.out before the job starts; the job
# counts as running while the current number stays above this baseline.
INITIAL_PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$INITIAL_PROCESSES"

nohup time sudo -E env "PATH=$PATH" python train.py experiments=one-shot-learning/IM_190722_vwer3f23_oneshotlearning.yaml &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$PROCESSES"
while (( PROCESSES > INITIAL_PROCESSES )); do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  # Filtering by bus id keeps one row per sample and drops the CSV header.
  sudo nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv \
    | grep -F -- "$GPU_BUS_ID" >> nvidiasmi_log
  sleep 1
  PROCESSES=$(sudo lsof nohup.out | wc -l)
done

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py

# The job has normally exited by now; signal it only if it is still alive so
# that an unrelated process which reused the PID is never killed.
if sudo kill -0 "$PID" 2> /dev/null; then
  sudo kill -9 "$PID"
fi
39 changes: 39 additions & 0 deletions examples/measure_segmentation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Launches train.py with the semantic-segmentation (U-Net) experiment in the
# background and samples CPU, RAM and GPU usage once per second for as long as
# the job keeps nohup.out open. Afterwards runs
# examples/measurements_compaction.py.
#
# Usage: sudo -E env "PATH=$PATH" bash examples/measure_segmentation.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Files written in the working directory: nohup.out, cpu_log, mem_log,
# nvidiasmi_log.
#
# `set -e` is intentionally not enabled: grep exits non-zero whenever a sample
# contains no matching line, which must not abort the measurement.

GPU_BUS_ID="${GPU_BUS_ID:-00000000:25:00.0}"

# Install iotop only when it is missing instead of hitting apt on every run.
if ! command -v iotop > /dev/null; then
  sudo apt-get install -y iotop > /dev/null
fi

# train.py is addressed relative to the parent of examples/, so step up when
# the script is started from inside examples/.
if [[ "${PWD##*/}" == "examples" ]]; then
  cd .. || exit 1
fi

# Reset the logs left over from a previous run.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Baseline number of lsof lines for nohup.out before the job starts; the job
# counts as running while the current number stays above this baseline.
INITIAL_PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$INITIAL_PROCESSES"

nohup time sudo -E env "PATH=$PATH" python train.py experiments=semantic-segmentation/KG_041223_ingieq921_unet.yaml epochs=100 optimizers=adam accelerator=gpu &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

PROCESSES=$(sudo lsof nohup.out | wc -l)
echo "$PROCESSES"
while (( PROCESSES > INITIAL_PROCESSES )); do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  # Filtering by bus id keeps one row per sample and drops the CSV header.
  sudo nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv \
    | grep -F -- "$GPU_BUS_ID" >> nvidiasmi_log
  sleep 1
  PROCESSES=$(sudo lsof nohup.out | wc -l)
done

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py

# The job has normally exited by now; signal it only if it is still alive so
# that an unrelated process which reused the PID is never killed.
if sudo kill -0 "$PID" 2> /dev/null; then
  sudo kill -9 "$PID"
fi
Loading