[PLAT-6231][K8s] update log_cleanup.sh to recover from full disk

Summary: The log_cleanup.sh script was trying to compress the log files and was failing with 'No space left on device'. This commit makes changes to recover from such a state. - Old core dump files are deleted first, before cleaning up the logs. - In case gzip fails on a file, we still continue with our cleanup. - We first delete the gz files, and then start deleting the non-gz files if we are still over the limit. - The current log files, which have not been rotated yet, are still kept as they are. - This commit also changes the interpreter to Bash, and changes the rest of the script accordingly for consistency. Test Plan: I created dummy 4GiB log files in the logs directory of a master pod, and there was no space left on the device. - The script was able to iterate through all the files and tried to gzip them. - It deleted the gz files first and then deleted those dummy large log files. - Inspected the output of the script with set -x; the rest of the script seems to be working as expected. Command used to create dummy log files: ``` dd if=/dev/zero of=dummy_log_file bs=1M count=100 mv dummy_log_file yb-master.yb-master-0.root.log.WARNING.20221208-test1.20 ``` Reviewers: sneelakantan, sanketh Reviewed By: sanketh Subscribers: yugaware Differential Revision: https://phabricator.dev.yugabyte.com/D21835
dhallas · Jan 2, 2023 · 987aa88 · 987aa88
1 parent 12e23ad
commit 987aa88
Showing 1 changed file with 108 additions and 69 deletions.
diff --git a/bin/log_cleanup.sh b/bin/log_cleanup.sh
@@ -1,5 +1,5 @@
-#!/bin/sh
-
+#!/usr/bin/env bash
+#
 # Copyright (c) YugaByte, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
@@ -12,7 +12,7 @@
 # or implied.  See the License for the specific language governing permissions and limitations
 # under the License.
 
-set -euo pipefail
+set -euo pipefail -o noglob
 
 print_help() {
   cat <<EOT
@@ -33,14 +33,14 @@ Options:
 EOT
 }
 
-gzip_only=false
-YB_HOME_DIR=/home/yugabyte
+gzip_only="false"
+YB_HOME_DIR="/home/yugabyte"
 YB_CORES_DIR="/var/yugabyte/cores"
 
 logs_disk_percent_max=10
-postgres_max_log_size_kb=$((100 * 1000))
+postgres_max_log_size_kb=$(( 100 * 1000 ))
 cores_disk_percent_max=10
-logs_purge_threshold_kb=$((10 * 1000000))
+logs_purge_threshold_kb=$(( 10 * 1000000 ))
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -49,15 +49,15 @@ while [[ $# -gt 0 ]]; do
       shift
     ;;
     -s|--postgres_max_log_size)
-      postgres_max_log_size_kb=$(($2 * 1000))
+      postgres_max_log_size_kb=$(( $2 * 1000 ))
       shift
     ;;
     -t|--logs_purge_threshold)
-      logs_purge_threshold_kb=$(($2 * 1000000))
+      logs_purge_threshold_kb=$(( $2 * 1000000 ))
       shift
     ;;
     -z|--gzip_only)
-      gzip_only=true
+      gzip_only="true"
     ;;
     -d|--cores_disk_percent_max)
       cores_disk_percent_max=$2
@@ -75,78 +75,112 @@ while [[ $# -gt 0 ]]; do
   shift
 done
 
+# TODO(bhavin192): this check will just fail in OpenShift-like
+# environments.
 if [[ "$(id -u)" != "0" && "$USER" != "yugabyte" ]]; then
   echo "This script must be run as root or yugabyte" >&2
   exit 1
 fi
 
-if [[ $logs_disk_percent_max -lt 1 || $logs_disk_percent_max -gt 100 ]]; then
+if [[ "${logs_disk_percent_max}" -lt 1 || "${logs_disk_percent_max}" -gt 100 ]]; then
   echo "--logs_disk_percent_max needs to be [1, 100]" >&2
   exit 1
 fi
 
-if [[ $cores_disk_percent_max -lt 1 || $cores_disk_percent_max -gt 100 ]]; then
+if [[ "${cores_disk_percent_max}" -lt 1 || "${cores_disk_percent_max}" -gt 100 ]]; then
   echo "--cores_disk_percent_max needs to be [1, 100]" >&2
   exit 1
 fi
 
-if [[ $logs_purge_threshold_kb -lt 1000000 ]]; then
+if [[ "${logs_purge_threshold_kb}" -lt 1000000 ]]; then
   echo "--logs_purge_threshold needs to be at least 1 GB"
   exit 1
 fi
 
 # half for tserver and half for master.
-logs_disk_percent_max=$(($logs_disk_percent_max / 2))
-logs_purge_threshold_kb=$(($logs_purge_threshold_kb / 2))
+logs_disk_percent_max=$(( logs_disk_percent_max / 2 ))
+logs_purge_threshold_kb=$(( logs_purge_threshold_kb / 2 ))
+
+find_and_sort() {
+  dir=$1
+  regex=$2
+  find "${dir}" -type f -name "${regex}" -print0 | \
+    xargs -0 -r stat -c '%Y %n' | \
+    sort | cut -d' ' -f2-
+}
 
-delete_gz_files() {
-  set -f
+delete_log_files() {
   local log_dir=$1
   local find_regex=$2
   local permitted_usage=$3
-  local logs_disk_usage_bytes=$(find $log_dir -type f -name $find_regex -print0 | \
+  local logs_disk_usage_bytes=$(find "${log_dir}" -type f -name "${find_regex}" -print0 | \
     xargs -0 -r stat -c '%s' | \
     awk '{sum+=$1;}END{print sum;}')
-  if [ -z $logs_disk_usage_bytes ]; then
+  if [[ -z "${logs_disk_usage_bytes}" ]]; then
     logs_disk_usage_bytes=0
   fi
-  local logs_disk_usage_kb=$(($logs_disk_usage_bytes / 1000 ))
-  echo "Permitted disk usage for $find_regex files in kb: $permitted_usage"
-  echo "Disk usage by $find_regex files in kb: $logs_disk_usage_kb"
+  local logs_disk_usage_kb=$(( logs_disk_usage_bytes / 1000 ))
+  echo "Permitted disk usage for $find_regex files in kb: ${permitted_usage}"
+  echo "Disk usage by $find_regex files in kb: ${logs_disk_usage_kb}"
+
   # get all the gz files.
-  local gz_files=$(find $log_dir -type f -name $find_regex.gz -print0 | \
-    xargs -0 -r stat -c '%Y %n' | \
-    sort | awk '{print $2}')
-  for file in $gz_files; do
+  local gz_files=$(find_and_sort "${log_dir}" "${find_regex}.gz")
+  for file in ${gz_files}; do
     # If usage exceeds permitted, delete the old gz files.
-    if [ $logs_disk_usage_kb -gt $permitted_usage ]; then
+    if [[ "${logs_disk_usage_kb}" -gt "${permitted_usage}" ]]; then
+      local file_size=$(du -k "${file}" | awk '{print $1}')
+      logs_disk_usage_kb=$(( logs_disk_usage_kb - file_size ))
+      echo "Delete file ${file}"
+      rm "${file}"
+    else
+      break
+    fi
+  done
+
+  # Skip deletion of non-gz files if we are under permitted usage
+  if [[ "${logs_disk_usage_kb}" -le "${permitted_usage}" ]]; then
+    return
+  fi
+
+  # All the non-gz files
+  local files=$(find_and_sort "${log_dir}" "${find_regex}")
+  # Remove the current log files from the list
+  for log_regex in ${log_regexes}; do
+    local current_file=$(find_and_sort "${log_dir}" "${log_regex}" | tail -n1)
+    # double quotes around files are important
+    # https://stackoverflow.com/a/4651495
+    files=$(echo "${files}" | grep -v -E "^${current_file}$")
+  done
+  for file in ${files}; do
+    # If usage exceeds permitted, delete the old files.
+    if [[ "${logs_disk_usage_kb}" -gt "${permitted_usage}" ]]; then
       local file_size=$(du -k $file | awk '{print $1}')
-      logs_disk_usage_kb=$(($logs_disk_usage_kb-$file_size))
-      echo "Delete file $file"
-      rm $file
+      logs_disk_usage_kb=$(( logs_disk_usage_kb - file_size ))
+      echo "Delete file ${file}"
+      rm "${file}"
     else
       break
     fi
   done
 }
 
 delete_core_dump_files () {
-  local core_dump_dir="$1"
-  local permitted_usage="$2"
-  local disk_usage_kb="$(du -sk $core_dump_dir | awk '{print $1}')"
-  echo "Permitted disk usage for core dump files in kb: $permitted_usage"
-  echo "Disk usage by core dump files in kb: $disk_usage_kb"
+  local core_dump_dir=$1
+  local permitted_usage=$2
+  local disk_usage_kb=$(du -sk "${core_dump_dir}" | awk '{print $1}')
+  echo "Permitted disk usage for core dump files in kb: ${permitted_usage}"
+  echo "Disk usage by core dump files in kb: ${disk_usage_kb}"
 
   # Sort by time: oldest first
-  local files="$(ls -Acr $core_dump_dir)"
+  local files=$(ls -Acr "${core_dump_dir}")
   # Handle space in a file name
   IFS=$'\n'
-  for file in $files; do
+  for file in ${files}; do
     file="${core_dump_dir}/${file}"
     # If usage exceeds permitted, delete the old files.
-    if [ $disk_usage_kb -gt $permitted_usage ]; then
-      local file_size=$(du -k ${file} | awk '{print $1}')
-      disk_usage_kb=$(($disk_usage_kb-$file_size))
+    if [[ "${disk_usage_kb}" -gt "${permitted_usage}" ]]; then
+      local file_size=$(du -k "${file}" | awk '{print $1}')
+      disk_usage_kb=$(( disk_usage_kb - file_size ))
       echo "Deleting core file ${file}"
       rm "${file}"
     else
@@ -156,55 +190,60 @@ delete_core_dump_files () {
   unset IFS
 }
 
+# Clean-up old core dump files
+if [[ -d "${YB_CORES_DIR}" ]]; then
+  core_dump_disk_size_kb=$(df -k "${YB_CORES_DIR}" | awk 'NR==2{print $2}')
+  core_dump_max_size_kb=$(( core_dump_disk_size_kb * cores_disk_percent_max / 100 ))
+  delete_core_dump_files "${YB_CORES_DIR}" "${core_dump_max_size_kb}"
+fi
+
+# Log clean-up
 server_types="master tserver"
 daemon_types=""
-for server_type in $server_types; do
-  if [[ -d "$YB_HOME_DIR/$server_type/logs" ]]; then
-    daemon_types="${daemon_types} $server_type"
+for server_type in ${server_types}; do
+  if [[ -d "${YB_HOME_DIR}/${server_type}/logs" ]]; then
+    daemon_types="${daemon_types} ${server_type}"
   fi
 done
 log_levels="INFO ERROR WARNING FATAL"
-for daemon_type in $daemon_types; do
-  YB_LOG_DIR="$YB_HOME_DIR/$daemon_type/logs/"
+for daemon_type in ${daemon_types}; do
+  YB_LOG_DIR="${YB_HOME_DIR}/${daemon_type}/logs/"
   log_regexes="postgres*log"
 
-  for level in $log_levels; do
-    log_regexes="${log_regexes} yb-$daemon_type*log.$level*"
+  for level in ${log_levels}; do
+    log_regexes="${log_regexes} yb-${daemon_type}*log.${level}*"
   done
 
-  for log_regex in $log_regexes; do
+  for log_regex in ${log_regexes}; do
     # Using print0 since printf is not supported on all UNIX systems.
     # xargs -0 -r stat -c '%Y %n' outputs: [unix time in millisecs] [name of file]
-    find_non_gz_files="find $YB_LOG_DIR -type f -name
-    '$log_regex' ! -name '*.gz' -print0 | xargs -0 -r stat -c '%Y %n' | sort | awk '{print \$2}'"
-    non_gz_file_count=$(eval $find_non_gz_files | wc -l)
-
+    non_gz_files=$(find "${YB_LOG_DIR}" -type f -name "${log_regex}" ! -name "*.gz" -print0 | \
+      xargs -0 -r stat -c '%Y %n' | \
+      sort | cut -d' ' -f2-
+    )
+    # TODO: grep -c can be used here instead of wc -l.
+    non_gz_file_count=$(echo "${non_gz_files}" | wc -l)
     # gzip all files but the current one.
-    if [ $non_gz_file_count -gt 1 ]; then
-      files_to_gzip=$(eval $find_non_gz_files | head -n$(($non_gz_file_count - 1)))
-      for file in $files_to_gzip; do
-        echo "Compressing file $file"
-        gzip $file
+    if [[ "${non_gz_file_count}" -gt 1 ]]; then
+      files_to_gzip=$(echo "${non_gz_files}" | head -n-1)
+      for file in ${files_to_gzip}; do
+        echo "Compressing file ${file}"
+        gzip "${file}" || echo "Compression failed. Continuing."
       done
     fi
   done
 
-  if [ "$gzip_only" == false ]; then
+  if [[ "${gzip_only}" == "false" ]]; then
     server_log="yb-$daemon_type*log.*"
     postgres_log="postgres*log*"
     # Get total size of disk in kb and then compute permitted usage for the log files.
     # We get the size of the target link of $YB_LOG_DIR
-    disk_size_kb=$(df -k $YB_LOG_DIR | awk 'NR==2{print $2}')
-    percent_disk_usage_kb=$(($disk_size_kb * $logs_disk_percent_max / 100))
-    permitted_disk_usage_kb=$([ $percent_disk_usage_kb -le $logs_purge_threshold_kb ] && \
-      echo "$percent_disk_usage_kb" || echo "$logs_purge_threshold_kb")
-    delete_gz_files $YB_LOG_DIR $server_log $permitted_disk_usage_kb
-    delete_gz_files $YB_LOG_DIR $postgres_log $postgres_max_log_size_kb
+    disk_size_kb=$(df -k "${YB_LOG_DIR}" | awk 'NR==2{print $2}')
+    percent_disk_usage_kb=$(( disk_size_kb * logs_disk_percent_max / 100 ))
+    permitted_disk_usage_kb=$([[ "${percent_disk_usage_kb}" -le "${logs_purge_threshold_kb}" ]] && \
+      echo "${percent_disk_usage_kb}" || echo "${logs_purge_threshold_kb}")
+    delete_log_files "${YB_LOG_DIR}" "${server_log}" "${permitted_disk_usage_kb}"
+    delete_log_files "${YB_LOG_DIR}" "${postgres_log}" "${postgres_max_log_size_kb}"
   fi
 done
 
-if [ -d "$YB_CORES_DIR" ]; then
-  core_dump_disk_size_kb=$(df -k $YB_CORES_DIR | awk 'NR==2{print $2}')
-  core_dump_max_size_kb=$(($core_dump_disk_size_kb * $cores_disk_percent_max / 100))
-  delete_core_dump_files $YB_CORES_DIR $core_dump_max_size_kb
-fi