Skip to content

Commit

Permalink
Implement "repmgr cluster cleanup"
Browse files Browse the repository at this point in the history
  • Loading branch information
ibarwick committed Sep 11, 2017
1 parent a9f4a02 commit b6b31b1
Show file tree
Hide file tree
Showing 8 changed files with 227 additions and 24 deletions.
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1405,10 +1405,10 @@ The view `replication_status` shows the most recent state for each node, e.g.:
The interval in which monitoring history is written is controlled by the
configuration parameter `monitor_interval_secs`; default is 2.

As this can generate a large amount of monitoring data in the `monitoring_history`
table, it's advisable to regularly purge historical data with
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
many day's worth of data should be retained. *XXX not yet implemented*
As this can generate a large amount of monitoring data in the table
`repmgr.monitoring_history`, it's advisable to regularly purge historical data
using the `repmgr cluster cleanup` command; use the `-k/--keep-history` option
to specify how many days' worth of data should be retained.

It's possible to use `repmgrd` to provide monitoring only for some or all
nodes by setting `failover=manual` in the node's `repmgr.conf` file. In the
Expand Down Expand Up @@ -1870,6 +1870,16 @@ The following commands are available:
3 | node3 | standby_register | t | 2017-08-17 10:28:55 | standby registration succeeded
2 | node2 | standby_register | t | 2017-08-17 10:28:53 | standby registration succeeded

* `cluster cleanup`

Purges monitoring history from the `repmgr.monitoring_history` table to
prevent excessive table growth. Use the `-k/--keep-history` option to specify
the number of days of monitoring history to retain. This command can be run
manually or as a cronjob.

This command requires a valid `repmgr.conf` file for the node on which it is
executed, either specified explicitly with `-f/--config-file` or located in
the current working directory; no additional arguments are required.


Generating event notifications with repmgr/repmgrd
Expand Down
101 changes: 99 additions & 2 deletions dbutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -1661,6 +1661,31 @@ checkpoint(PGconn *conn)
return;
}

/*
 * vacuum_table()
 *
 * Execute "VACUUM <table>" over the supplied connection.
 *
 * Assumes a superuser connection. "table" is interpolated into the
 * statement verbatim (no quoting or escaping is applied here), so the
 * caller must pass a trusted identifier.
 *
 * Returns true if the server reports PGRES_COMMAND_OK, false otherwise.
 * Nothing is logged at error level here; the caller decides how to report
 * a failure (e.g. via PQerrorMessage() on the same connection).
 */
bool
vacuum_table(PGconn *primary_conn, const char *table)
{
	PQExpBufferData query;
	bool		success = true;
	PGresult   *res = NULL;

	initPQExpBuffer(&query);
	appendPQExpBuffer(&query, "VACUUM %s", table);

	res = PQexec(primary_conn, query.data);

	/* the query text is no longer needed once it has been sent */
	termPQExpBuffer(&query);

	/* identify which statement the numeric status belongs to */
	log_debug("VACUUM of table \"%s\" returned result status %i",
			  table,
			  (int) PQresultStatus(res));

	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		success = false;
	}

	PQclear(res);

	return success;
}

/* ===================== */
/* Node record functions */
Expand Down Expand Up @@ -3408,8 +3433,7 @@ is_server_available(const char *conninfo)
/* ==================== */

void
add_monitoring_record(
PGconn *primary_conn,
add_monitoring_record(PGconn *primary_conn,
PGconn *local_conn,
int primary_node_id,
int local_node_id,
Expand Down Expand Up @@ -3478,6 +3502,79 @@ add_monitoring_record(
}


/*
 * get_number_of_monitoring_records_to_delete()
 *
 * Count the rows in "repmgr.monitoring_history" whose age, measured from
 * "last_monitor_time" to now(), is at least "keep_history" days.
 *
 * If the query fails, the error is logged, the connection is closed and
 * the process exits with ERR_DB_QUERY, so this function only ever returns
 * after a successful query.
 */
int
get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history)
{
	PQExpBufferData sql;
	PGresult   *result = NULL;
	int			n_records = -1;

	initPQExpBuffer(&sql);
	appendPQExpBuffer(&sql,
					  "SELECT COUNT(*) "
					  " FROM repmgr.monitoring_history "
					  " WHERE age(now(), last_monitor_time) >= '%d days'::interval",
					  keep_history);

	result = PQexec(primary_conn, sql.data);
	termPQExpBuffer(&sql);

	/* bail out of the whole program if the count cannot be obtained */
	if (PQresultStatus(result) != PGRES_TUPLES_OK)
	{
		log_error(_("unable to query number of monitoring records to clean up"));
		log_detail("%s", PQerrorMessage(primary_conn));

		PQclear(result);
		PQfinish(primary_conn);
		exit(ERR_DB_QUERY);
	}

	/* COUNT(*) yields exactly one row with one column */
	n_records = atoi(PQgetvalue(result, 0, 0));

	PQclear(result);

	return n_records;
}


/*
 * delete_monitoring_records()
 *
 * Purge rows from "repmgr.monitoring_history".
 *
 * When "keep_history" is greater than zero, only rows whose age (from
 * "last_monitor_time" to now()) is at least that many days are deleted;
 * otherwise the whole table is truncated.
 *
 * Returns true if the statement completed with PGRES_COMMAND_OK, false
 * otherwise. No error is logged here; that is left to the caller.
 */
bool
delete_monitoring_records(PGconn *primary_conn, int keep_history)
{
	PQExpBufferData sql;
	PGresult   *result = NULL;
	bool		purged = false;

	initPQExpBuffer(&sql);

	if (keep_history <= 0)
	{
		/* nothing is to be retained, so empty the table outright */
		appendPQExpBuffer(&sql,
						  "TRUNCATE TABLE repmgr.monitoring_history");
	}
	else
	{
		appendPQExpBuffer(&sql,
						  "DELETE FROM repmgr.monitoring_history "
						  " WHERE age(now(), last_monitor_time) >= '%d days'::interval ",
						  keep_history);
	}

	result = PQexec(primary_conn, sql.data);
	termPQExpBuffer(&sql);

	purged = (PQresultStatus(result) == PGRES_COMMAND_OK);

	PQclear(result);

	return purged;
}

/*
* node voting functions
*
Expand Down
9 changes: 6 additions & 3 deletions dbutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ ExtensionStatus get_repmgr_extension_status(PGconn *conn);

/* node management functions */
void checkpoint(PGconn *conn);

/* runs "VACUUM <table>"; returns false if the command did not succeed (assumes superuser connection) */
bool vacuum_table(PGconn *conn, const char *table);


/* node record functions */
Expand Down Expand Up @@ -456,8 +456,7 @@ bool is_server_available(const char *conninfo);

/* monitoring functions */
void
add_monitoring_record(
PGconn *primary_conn,
add_monitoring_record(PGconn *primary_conn,
PGconn *local_conn,
int primary_node_id,
int local_node_id,
Expand All @@ -469,6 +468,10 @@ add_monitoring_record(
long long unsigned int apply_lag_bytes
);

/*
 * Returns the number of rows in repmgr.monitoring_history which are at
 * least "keep_history" days old; exits the process if the query fails.
 */
int get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history);
/*
 * Deletes rows in repmgr.monitoring_history which are at least
 * "keep_history" days old, or truncates the table if "keep_history" is
 * not greater than zero; returns false if the statement fails.
 */
bool delete_monitoring_records(PGconn *primary_conn, int keep_history);



/* node voting functions */
NodeVotingStatus get_voting_status(PGconn *conn);
Expand Down
83 changes: 75 additions & 8 deletions repmgr-action-cluster.c
Original file line number Diff line number Diff line change
Expand Up @@ -1287,6 +1287,67 @@ cube_set_node_status(t_node_status_cube **cube, int n, int execute_node_id, int
}


/*
 * do_cluster_cleanup()
 *
 * Implements "repmgr cluster cleanup": purges monitoring history from
 * "repmgr.monitoring_history" on the primary, then vacuums the table.
 *
 * The number of days of history to retain is taken from
 * runtime_options.keep_history (-k/--keep-history); if it is not greater
 * than zero, all monitoring records are removed.
 *
 * Exits with ERR_DB_QUERY if the records cannot be deleted. A failure to
 * vacuum the table afterwards is only reported as a warning.
 */
void
do_cluster_cleanup(void)
{
	PGconn	   *conn = NULL;
	PGconn	   *primary_conn = NULL;
	int			entries_to_delete = 0;

	/* the local node is only needed as a way of locating the primary */
	conn = establish_db_connection(config_file_options.conninfo, true);

	/* check if there is a primary in this cluster */
	log_info(_("connecting to primary server"));
	primary_conn = establish_primary_db_connection(conn, true);

	PQfinish(conn);

	log_debug(_("number of days of monitoring history to retain: %i"), runtime_options.keep_history);

	/* this call terminates the process itself if the query fails */
	entries_to_delete = get_number_of_monitoring_records_to_delete(primary_conn, runtime_options.keep_history);

	if (entries_to_delete == 0)
	{
		log_info(_("no monitoring records to delete"));
		PQfinish(primary_conn);
		return;
	}

	/*
	 * "at least" because further records may reach the cut-off age between
	 * the count and the actual deletion
	 */
	log_debug("at least %i monitoring records for deletion",
			  entries_to_delete);

	if (delete_monitoring_records(primary_conn, runtime_options.keep_history) == false)
	{
		log_error(_("unable to delete monitoring records"));
		log_detail("%s", PQerrorMessage(primary_conn));
		PQfinish(primary_conn);
		exit(ERR_DB_QUERY);
	}

	if (vacuum_table(primary_conn, "repmgr.monitoring_history") == false)
	{
		/* annoying if this fails, but not fatal */
		log_warning(_("unable to vacuum table \"repmgr.monitoring_history\""));
		log_detail("%s", PQerrorMessage(primary_conn));
	}

	PQfinish(primary_conn);

	if (runtime_options.keep_history > 0)
	{
		log_notice(_("monitoring records older than %i day(s) deleted"),
				   runtime_options.keep_history);
	}
	else
	{
		log_info(_("all monitoring records deleted"));
	}

	return;
}


void
do_cluster_help(void)
{
Expand All @@ -1305,7 +1366,7 @@ do_cluster_help(void)
puts("");
printf(_(" Configuration file or database connection required.\n"));
puts("");
printf(_(" --csv emit output as CSV (with a subset of fields)\n"));
printf(_(" --csv emit output as CSV (with a subset of fields)\n"));
puts("");

printf(_("CLUSTER MATRIX\n"));
Expand All @@ -1314,7 +1375,7 @@ do_cluster_help(void)
puts("");
printf(_(" Configuration file or database connection required.\n"));
puts("");
printf(_(" --csv emit output as CSV\n"));
printf(_(" --csv emit output as CSV\n"));
puts("");

printf(_("CLUSTER CROSSCHECK\n"));
Expand All @@ -1323,20 +1384,26 @@ do_cluster_help(void)
puts("");
printf(_(" Configuration file or database connection required.\n"));
puts("");
printf(_(" --csv emit output as CSV\n"));
printf(_(" --csv emit output as CSV\n"));
puts("");


printf(_("CLUSTER EVENT\n"));
puts("");
printf(_(" \"cluster event\" lists recent events logged in the \"repmgr.events\" table.\n"));
puts("");
printf(_(" --limit maximum number of events to display (default: %i)\n"), CLUSTER_EVENT_LIMIT);
printf(_(" --all display all events (overrides --limit)\n"));
printf(_(" --event filter specific event\n"));
printf(_(" --node-id restrict entries to node with this ID\n"));
printf(_(" --node-name restrict entries to node with this name\n"));
printf(_(" --limit maximum number of events to display (default: %i)\n"), CLUSTER_EVENT_LIMIT);
printf(_(" --all display all events (overrides --limit)\n"));
printf(_(" --event filter specific event\n"));
printf(_(" --node-id restrict entries to node with this ID\n"));
printf(_(" --node-name restrict entries to node with this name\n"));
puts("");

/* help text for "cluster cleanup" */
printf(_("CLUSTER CLEANUP\n"));
puts("");
printf(_(" \"cluster cleanup\" purges records from the \"repmgr.monitoring_history\" table.\n"));
puts("");
printf(_(" -k, --keep-history=VALUE retain indicated number of days of history (default: 0)\n"));
puts("");

}
1 change: 1 addition & 0 deletions repmgr-action-cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ extern void do_cluster_show(void);
extern void do_cluster_event(void);
extern void do_cluster_crosscheck(void);
extern void do_cluster_matrix(void);
extern void do_cluster_cleanup(void);

extern void do_cluster_help(void);

Expand Down
5 changes: 5 additions & 0 deletions repmgr-client-global.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ typedef struct
char event[MAXLEN];
int limit;

/* "cluster cleanup" options */
int keep_history;

/* following options for internal use */
char config_archive_dir[MAXPGPATH];
OutputMode output_mode;
Expand Down Expand Up @@ -155,6 +158,8 @@ typedef struct
"", false, false, false, \
/* "cluster event" options */ \
false, "", CLUSTER_EVENT_LIMIT, \
/* "cluster cleanup" options */ \
0, \
/* Following options for internal use */ \
"/tmp", OM_TEXT \
}
Expand Down
29 changes: 23 additions & 6 deletions repmgr-client.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
* CLUSTER EVENT
* CLUSTER CROSSCHECK
* CLUSTER MATRIX
* CLUSTER CLEANUP
*
* NODE STATUS
* NODE CHECK
Expand Down Expand Up @@ -499,6 +500,16 @@ main(int argc, char **argv)
runtime_options.all = true;
break;

/*------------------------
* "cluster cleanup" options
*------------------------
*/

/* -k/--keep-history */
case 'k':
runtime_options.keep_history = repmgr_atoi(optarg, "-k/--keep-history", &cli_errors, false);
break;

/*----------------
* logging options
*----------------
Expand Down Expand Up @@ -688,17 +699,18 @@ main(int argc, char **argv)
exit_with_cli_errors(&cli_errors);
}

/*
/*----------
* Determine the node type and action; following are valid:
*
* { PRIMARY | MASTER } REGISTER | STANDBY {REGISTER | UNREGISTER | CLONE
* [node] | PROMOTE | FOLLOW [node] | SWITCHOVER | REWIND} | BDR {
* REGISTER | UNREGISTER } | NODE { STATUS | CHECK | REJOIN |
* ARCHIVE-CONFIG | RESTORE-CONFIG | SERVICE } | CLUSTER { CROSSCHECK |
* MATRIX | SHOW | CLEANUP | EVENT }
* { PRIMARY | MASTER } REGISTER |
* STANDBY { REGISTER | UNREGISTER | CLONE [node] | PROMOTE | FOLLOW [node] | SWITCHOVER } |
* BDR { REGISTER | UNREGISTER } |
* NODE { STATUS | CHECK | REJOIN | SERVICE } |
* CLUSTER { CROSSCHECK | MATRIX | SHOW | EVENT | CLEANUP }
*
* [node] is an optional hostname, provided instead of the -h/--host
* option
* ---------
*/
if (optind < argc)
{
Expand Down Expand Up @@ -818,6 +830,8 @@ main(int argc, char **argv)
action = CLUSTER_CROSSCHECK;
else if (strcasecmp(repmgr_action, "MATRIX") == 0)
action = CLUSTER_MATRIX;
else if (strcasecmp(repmgr_action, "CLEANUP") == 0)
action = CLUSTER_CLEANUP;
}
else
{
Expand Down Expand Up @@ -1200,6 +1214,9 @@ main(int argc, char **argv)
case CLUSTER_MATRIX:
do_cluster_matrix();
break;
case CLUSTER_CLEANUP:
do_cluster_cleanup();
break;

default:
/* An action will have been determined by this point */
Expand Down
Loading

0 comments on commit b6b31b1

Please sign in to comment.