From a1cc099c4662834754f87c5c4bee27ce68dfc92c Mon Sep 17 00:00:00 2001 From: Matthias Schmidt Date: Fri, 9 Apr 2021 13:50:55 +0200 Subject: [PATCH 1/7] added check_wal_amount() for monitoring log file utilization --- check_postgres.pl | 106 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/check_postgres.pl b/check_postgres.pl index ba28f27..561953e 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -352,6 +352,7 @@ package check_postgres; 'version-ok' => q{version $1}, 'wal-numfound' => q{WAL files found: $1}, 'wal-numfound2' => q{WAL "$2" files found: $1}, + 'wal-amount' => q{WAL data written in past $2: $1}, }, ## Spanish @@ -1145,6 +1146,7 @@ package check_postgres; 'version-ok' => q{Version $1}, 'wal-numfound' => q{WAL-Dateien gefunden: $1}, 'wal-numfound2' => q{WAL "$2" Dateien gefunden: $1}, + 'wal-amount' => q{WAL Geschriebene Daten in den letzten $2: $1}, }, ## Persian @@ -1719,10 +1721,11 @@ package check_postgres; 'suffix=s', ## used by same_schema only 'replace', ## used by same_schema only 'skipsequencevals', ## used by same_schema only - 'lsfunc=s', ## used by wal_files and archive_ready + 'lsfunc=s', ## used by wal_files, wal_amount and archive_ready 'object=s@', ## used by same_schema for object types to include 'skipobject=s@', ## used by same_schema for object types to exclude 'skipcycled', ## used by sequence only + 'interval=s', ## used by wal_amount only ); die $USAGE if ! keys %opt and ! @ARGV; @@ -1925,6 +1928,7 @@ package check_postgres; txn_wraparound => [1, 'See how close databases are getting to transaction ID wraparound.'], version => [1, 'Check for proper Postgres version.'], wal_files => [1, 'Check the number of WAL files in the pg_xlog directory'], + wal_amount => [1, 'Check the amount of WAL data written within a specified period of time.'], }; ## XXX Need to i18n the above @@ -2681,6 +2685,9 @@ sub finishup { ## Check the number of WAL files. warning and critical are numbers check_wal_files() if $action eq 'wal_files'; +## Check the amount of WAL data written in a given time period. warning and critical are sizes +check_wal_amount() if $action eq 'wal_amount'; + ## Check the number of WAL files ready to archive. warning and critical are numbers check_archive_ready() if $action eq 'archive_ready'; @@ -9057,6 +9064,70 @@ sub check_wal_files { } ## end of check_wal_files + +sub check_wal_amount { + + ## Check the amount of WAL data written within a specified period of time and based on file modification timestamp. + + ## Critical and warning are the amount of written data + ## Warning and critical are bytes + ## Valid units: b, k, m, g, t, e + ## All above may be written as plural or with a trailing 'b' + + ## The period under consideration is defined with the option '--interval', the default is one day + ## Valid units: s[econd], m[inute], h[our], d[ay] + ## All above may be written as plural as well (e.g. "2 hours") + + my ($warning, $critical) = validate_range({type => 'size'}); + + ## Determine the time interval - the default is one day + my $interval = $opt{interval} || "1 day"; + $interval = size_in_seconds($interval, 'interval'); + if ($interval !~ /^[-+]?\d+$/) { + ndie msg('range-int', 'interval'); + } + if (! length $interval) { + ndie msg('range-notime', 'interval'); + } + + my $lsfunc = $opt{lsfunc} || 'pg_ls_dir'; + my $lsargs = $opt{lsfunc} ? q{} : "'pg_xlog'"; + + my $cond = qq{modification >= (now() - '$interval seconds'::interval)}; + + $SQL = qq{SELECT COALESCE(SUM(size), 0) AS size FROM $lsfunc($lsargs) AS filename INNER JOIN pg_stat_file('pg_xlog' || '/' || filename) ON isdir = 'f' WHERE $cond}; + my $SQL10 = $opt{lsfunc} ? $SQL : + qq{SELECT COALESCE(SUM(size), 0) AS size FROM pg_ls_waldir() WHERE $cond}; + + my $info = run_command($SQL, {regex => qr[\d], version => [">9.6 $SQL10"] }); + + for $db (@{$info->{db}}) { + my $r = $db->{slurp}[0]; + my $size = $r->{size}; + if ($MRTG) { + do_mrtg({one => $size}); + } + + my $msg = msg('wal-amount', pretty_size($size,6), pretty_time($interval)); + + $db->{perf} .= sprintf '%s=%s;%s;%s', + perfname(msg('size')), $size, $warning, $critical; + + if (length $critical and $size > $critical) { + add_critical $msg; + } + elsif (length $warning and $size > $warning) { + add_warning $msg; + } + else { + add_ok $msg; + } + } + + return; + +} ## end of check_wal_amount + =pod =encoding utf8 @@ -10807,6 +10878,39 @@ =head2 B For MRTG output, reports the number of WAL files on line 1. +=head2 B + +(C) Checks how many data was written in a past time period to WAL files in the F directory (PostgreSQL 10 and later: F), which is found +in your B, sometimes as a symlink to another physical disk for +performance reasons. If the I<--lsfunc> option is not used then this action must be run as a superuser, in order to access the +contents of the F directory. The minimum version to use this action is +Postgres 8.1. The I<--warning> and I<--critical> options are simply the amount of written +data in the F directory. The I<--interval> option is the past time period +in which changes to the WAL files are considered, the default is 24 hours. + +To avoid connecting as a database superuser, a wrapper function around +C should be defined as a superuser with SECURITY DEFINER, +and the I<--lsfunc> option used. This example function, if defined by +a superuser, will allow the script to connect as a normal user +I with I<--lsfunc=ls_xlog_dir> + + BEGIN; + CREATE FUNCTION ls_xlog_dir() + RETURNS SETOF TEXT + AS $$ SELECT pg_ls_dir('pg_xlog') $$ + LANGUAGE SQL + SECURITY DEFINER; + REVOKE ALL ON FUNCTION ls_xlog_dir() FROM PUBLIC; + GRANT EXECUTE ON FUNCTION ls_xlog_dir() to nagios; + COMMIT; + +Example 1: Check that the number of ready WAL files is 10 or less on host "pluto", +using a wrapper function C to avoid the need for superuser permissions + + check_postgres_wal_amount --host=pluto --critical=512MB --lsfunc=ls_xlog_dir --interval=60M + +For MRTG output, reports the amount of data written on line 1. + =head2 B =head2 B From 88a45951c005202667c58be94bb2869ac25908ac Mon Sep 17 00:00:00 2001 From: Matthias Schmidt Date: Tue, 13 Apr 2021 15:52:26 +0200 Subject: [PATCH 2/7] make critical and warning options optional, adjusted POD accordingly --- check_postgres.pl | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 561953e..57caeb6 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -9078,7 +9078,13 @@ sub check_wal_amount { ## Valid units: s[econd], m[inute], h[our], d[ay] ## All above may be written as plural as well (e.g. "2 hours") - my ($warning, $critical) = validate_range({type => 'size'}); + my ($warning, $critical) = ('', ''); + + # critical and warning states are optional for this check + # + if (length($opt{warning}) || length($opt{critical})) { + ($warning, $critical) = validate_range({type => 'size'}); + } ## Determine the time interval - the default is one day my $interval = $opt{interval} || "1 day"; @@ -10885,8 +10891,11 @@ =head2 B performance reasons. If the I<--lsfunc> option is not used then this action must be run as a superuser, in order to access the contents of the F directory. The minimum version to use this action is Postgres 8.1. The I<--warning> and I<--critical> options are simply the amount of written -data in the F directory. The I<--interval> option is the past time period -in which changes to the WAL files are considered, the default is 24 hours. +data in the F directory in I, specifying with a unit up to zeta byte is supported. +Both options are optional in case if you just want to monitor the amount. +The I<--interval> option is the past time period in I, in which changes to the +WAL files are considered, the units s(econds), m(inutes), h(ours), +d(ays), w(eek)s and y(ears) are supported, the default value for this option is "24 hours". To avoid connecting as a database superuser, a wrapper function around C should be defined as a superuser with SECURITY DEFINER, @@ -10904,10 +10913,15 @@ =head2 B GRANT EXECUTE ON FUNCTION ls_xlog_dir() to nagios; COMMIT; -Example 1: Check that the number of ready WAL files is 10 or less on host "pluto", -using a wrapper function C to avoid the need for superuser permissions +Example 1: Check that the size of WAL files written in the last 90 minutes do not exceed 512MB +on host "pluto", using a wrapper function C to avoid the need for superuser permissions + + check_postgres_wal_amount --host=pluto --critical=512MB --lsfunc=ls_xlog_dir --interval=90m + +Example 2: Report the size of WAL files written in the last 5 minutes on the database +connectable on the unix socket "/tmp/cptesting_socket" as user "check_postgres_testing" - check_postgres_wal_amount --host=pluto --critical=512MB --lsfunc=ls_xlog_dir --interval=60M + check_postgres_wal_amount --host=/tmp/cptesting_socket --dbuser=check_postgres_testing --interval=5m For MRTG output, reports the amount of data written on line 1. From 5d22e8d1966140b66ffa88547aa1af514ad15074 Mon Sep 17 00:00:00 2001 From: Matthias Schmidt Date: Tue, 13 Apr 2021 15:54:40 +0200 Subject: [PATCH 3/7] use wal file directory dependent on postgres version to allow using the --lsfunc option with postgres databases >= 9.6 --- check_postgres.pl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/check_postgres.pl b/check_postgres.pl index 57caeb6..6f1d407 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -9101,7 +9101,10 @@ sub check_wal_amount { my $cond = qq{modification >= (now() - '$interval seconds'::interval)}; - $SQL = qq{SELECT COALESCE(SUM(size), 0) AS size FROM $lsfunc($lsargs) AS filename INNER JOIN pg_stat_file('pg_xlog' || '/' || filename) ON isdir = 'f' WHERE $cond}; + $SQL = qq{ SELECT COALESCE(SUM(size), 0) AS size + FROM $lsfunc($lsargs) AS filename + INNER JOIN pg_stat_file((SELECT CASE WHEN current_setting('server_version_num')::integer >= 96000 THEN 'pg_wal' ELSE 'pg_xlog' END) || '/' || filename) ON isdir = 'f' + WHERE $cond}; my $SQL10 = $opt{lsfunc} ? $SQL : qq{SELECT COALESCE(SUM(size), 0) AS size FROM pg_ls_waldir() WHERE $cond}; From 708a186d11d366cb6a3fe63e277b3d25a48ea7bf Mon Sep 17 00:00:00 2001 From: Matthias Schmidt Date: Tue, 13 Apr 2021 15:55:56 +0200 Subject: [PATCH 4/7] added test for wal size check --- t/02_wal_amount.t | 111 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 t/02_wal_amount.t diff --git a/t/02_wal_amount.t b/t/02_wal_amount.t new file mode 100644 index 0000000..baec73b --- /dev/null +++ b/t/02_wal_amount.t @@ -0,0 +1,111 @@ +#!perl + +## Test the "wal_amount" action + +use 5.008; +use strict; +use warnings; +use Data::Dumper; +use Test::More tests => 12; +use lib 't','.'; +use CP_Testing; + +use vars qw/$dbh $t/; + +my $cp = CP_Testing->new({default_action => 'wal_amount'}); + +$dbh = $cp->test_database_handle(); + +my $S = q{Action 'wal_amount'}; +my $label = 'POSTGRES_WAL_AMOUNT'; + +$t=qq{$S fails when called with an invalid option}; +like ($cp->run('foobar=12'), qr{Usage:}, $t); + +$t=qq{$S fails when called with an invalid option}; +like ($cp->run('--warning=30%'), qr{ERROR:.+Invalid size}, $t); +like ($cp->run('--warning=-30'), qr{ERROR:.+Invalid size}, $t); + +my $ver = $dbh->{pg_server_version}; +if ($ver < 80100) { + + $t=qq{$S gives an error when run against an old Postgres version}; + like ($cp->run('--warning=99'), qr{ERROR.*server version must be >= 8.1}, $t); + + SKIP: { + skip 'Cannot test wal_amount completely on Postgres 8.0 or lower', 7; + } + + exit; +} + +$cp->drop_schema_if_exists(); + +$t=qq{$S works as expected for warnings}; +like ($cp->run('--warning="100000 GB"'), qr{^$label OK}, $t); +like ($cp->run('--warning=0'), qr{^$label WARNING}, $t); + +$t=qq{$S works as expected for criticals}; +like ($cp->run('--critical="1 TB"'), qr{^$label OK}, $t); +like ($cp->run('--critical=0'), qr{^$label CRITICAL}, $t); + +$cp->set_fake_schema(); + +# determine the written wal file size in the last hour before inserting test data +# +my $initialWalSize = $cp->run('--interval=15m --output=simple'); +chomp($initialWalSize); +$t=qq{$S reported a positive amount of written wal files ($initialWalSize) in the last 15 minutes}; +ok ($initialWalSize > 0, $t); + +# create a table with simple text contents and insert a set with large (~4*wal segment siz) test data +# +my $walSegmentSize = 16*1024*1024; +$dbh->do(q{DROP TABLE IF EXISTS cptest.randomdata}); +$dbh->do(q{CREATE TABLE cptest.randomdata (data TEXT)}); +my $randomText = ""; +while (length( $randomText ) < (4 * $walSegmentSize)) { + $randomText = $randomText . chr( int(rand(26)) + 65); +} +my $sth = $dbh->prepare(q{INSERT INTO cptest.randomdata VALUES (?)}); +$sth->bind_param(1, $randomText); +$sth->execute(); +$dbh->commit(); + +my $currentWalSize = $cp->run('--interval=15m --output=simple'); +chomp($currentWalSize); +$t=qq{$S reported a positive amount of written wal files ($currentWalSize) in the last 15 minutes before commited 64MB of random data}; +ok ($currentWalSize > 0, $t); + +# validate if enough wal data was written +# +my $minWalSizeDelta = 3 * $walSegmentSize; +$t=qq{$S reported a minimum of more ($minWalSizeDelta) amount of written wal files ($currentWalSize) since comitted test data}; +ok ($currentWalSize >= ($initialWalSize + 3 * $walSegmentSize), $t); + +# take a look on the mrtg output +# +$t=qq{$S returns correct MRTG information}; +is ($cp->run('--interval=15m --output=mrtg'), "$currentWalSize\n0\n\n\n", $t); + +# check if the lsfunc option is working +# +my $xlogdir = $ver >= 96000 ? 'pg_wal' : 'pg_xlog'; +$dbh->do(qq{CREATE OR REPLACE FUNCTION ls_xlog_dir() + RETURNS SETOF TEXT + AS \$\$ SELECT pg_ls_dir('$xlogdir') \$\$ + LANGUAGE SQL + SECURITY DEFINER}); +$dbh->commit(); + +$t=qq{$S returns correct amount of written wal files if lsfunc is used}; +like ($cp->run('--interval=15m --output=simple'), qr{^$currentWalSize$}, $t); + +# cleanup +# +$dbh->do(q{DROP TABLE cptest.randomdata}); +$dbh->do(q{DROP FUNCTION ls_xlog_dir()}); +$cp->drop_schema_if_exists(); +$dbh->commit(); + +exit; From ed63ba5ad5949db01d8ea33d7a309f160275ecde Mon Sep 17 00:00:00 2001 From: Matthias Schmidt Date: Wed, 14 Apr 2021 15:29:36 +0200 Subject: [PATCH 5/7] added database cleanup before initialization to prevent written log data from previous tests --- t/02_wal_amount.t | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/t/02_wal_amount.t b/t/02_wal_amount.t index baec73b..9c50b2b 100644 --- a/t/02_wal_amount.t +++ b/t/02_wal_amount.t @@ -14,6 +14,10 @@ use vars qw/$dbh $t/; my $cp = CP_Testing->new({default_action => 'wal_amount'}); +# reinitialize the test database to prevent measuring written log data by previous tests +# +$cp->cleanup(); + $dbh = $cp->test_database_handle(); my $S = q{Action 'wal_amount'}; From 50603966c13b322b101da507417aebf769023f41 Mon Sep 17 00:00:00 2001 From: Matthias Schmidt Date: Thu, 15 Apr 2021 08:08:34 +0000 Subject: [PATCH 6/7] wording in POD documentation improved and corrected --- check_postgres.pl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 6f1d407..e855b67 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -10889,14 +10889,14 @@ =head2 B =head2 B -(C) Checks how many data was written in a past time period to WAL files in the F directory (PostgreSQL 10 and later: F), which is found -in your B, sometimes as a symlink to another physical disk for -performance reasons. If the I<--lsfunc> option is not used then this action must be run as a superuser, in order to access the +(C) Checks how much data was written in a certain period of time to WAL files in the directory F (PostgreSQL 10 and later: F), which can be found +in the B, possibly as a symlink to another physical disk for +performance reasons. If the I<--lsfunc> option is not given then this action must be run as superuser, in order to access the contents of the F directory. The minimum version to use this action is -Postgres 8.1. The I<--warning> and I<--critical> options are simply the amount of written +Postgres 8.1. The I<--warning> and I<--critical> options simply represent the amount of written data in the F directory in I, specifying with a unit up to zeta byte is supported. Both options are optional in case if you just want to monitor the amount. -The I<--interval> option is the past time period in I, in which changes to the +The I<--interval> option specifies the period of time in I, in which changes to the WAL files are considered, the units s(econds), m(inutes), h(ours), d(ays), w(eek)s and y(ears) are supported, the default value for this option is "24 hours". @@ -10922,7 +10922,7 @@ =head2 B check_postgres_wal_amount --host=pluto --critical=512MB --lsfunc=ls_xlog_dir --interval=90m Example 2: Report the size of WAL files written in the last 5 minutes on the database -connectable on the unix socket "/tmp/cptesting_socket" as user "check_postgres_testing" +connected through the unix socket "/tmp/cptesting_socket" as user "check_postgres_testing" check_postgres_wal_amount --host=/tmp/cptesting_socket --dbuser=check_postgres_testing --interval=5m From b713e0188e3d5c338f664c3ce56c04d643052949 Mon Sep 17 00:00:00 2001 From: Matthias Schmidt Date: Mon, 19 Apr 2021 15:37:24 +0200 Subject: [PATCH 7/7] fixed undefined warning of no critical and no warning option is given --- check_postgres.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_postgres.pl b/check_postgres.pl index e855b67..d4fa6bb 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -9082,7 +9082,7 @@ sub check_wal_amount { # critical and warning states are optional for this check # - if (length($opt{warning}) || length($opt{critical})) { + if ((defined($opt{warning}) && length($opt{warning})) || (defined($opt{critical}) && length($opt{critical}))) { ($warning, $critical) = validate_range({type => 'size'}); }