diff --git a/NEWS.adoc b/NEWS.adoc index 1b59dc78b9..eaef87273a 100644 --- a/NEWS.adoc +++ b/NEWS.adoc @@ -114,6 +114,16 @@ https://github.com/networkupstools/nut/milestone/13 ended up as literal HTML on KDE Plasma 6; now that rich text formatting is only used for main window status labels. [PR #3430] + - `upsd` data server updates: + * If we hit "Too many open files" during configuration reload, close + the oldest client connection and retry. [issue #3365] + * If the `MAXCONN` requested in the configuration file exceeds the OS + allowance on open file descriptors, fail early since the requested + configuration can not be guaranteed and can mis-fire unexpectedly + much later (tell the sysadmin to increase `ulimit` or set up a more + conservative `MAXCONN`). If there is a separate soft and hard limit, + and `MAXCONN` exceeds the soft limit, try to raise the bar. [issue #3365] + - Recipes, CI and helper script updates not classified above: * Introduced `ci_build.sh` settings and respective CI workflow settings to optionally re-use a `config.cache` file from older runs, and similar @@ -127,6 +137,7 @@ https://github.com/networkupstools/nut/milestone/13 dependencies were not previously discovered and bundled). [issue #3420, PRs #3429, #3432] + Release notes for NUT 2.8.5 - what's new since 2.8.4 ---------------------------------------------------- diff --git a/UPGRADING.adoc b/UPGRADING.adoc index e9cb6a65f4..e4beed637a 100644 --- a/UPGRADING.adoc +++ b/UPGRADING.adoc @@ -39,6 +39,13 @@ Changes from 2.8.5 to 2.8.6 in this regard, at the cost of adding arguments to methods introduced in the previous release. [issue #3331, PR #3408] +- Potentially a breaking change for existing deployments with a (large) + `MAXCONN` setting in `upsd.conf`: now this value is checked against the + `getrlimit()` (e.g. `ulimit -n`) setting of the operating system for this + daemon, where available, and the `upsd` data server would refuse to start + if the requested value is larger than what is allowed (minus some reserve + for configuration files and other use-cases). [issue #3365] + Changes from 2.8.4 to 2.8.5 --------------------------- diff --git a/docs/nut.dict b/docs/nut.dict index c95c5a555f..a8b49e7432 100644 --- a/docs/nut.dict +++ b/docs/nut.dict @@ -2242,6 +2242,7 @@ getenv gethostbyname getopt getproctag +getrlimit getter getters gettext diff --git a/server/conf.c b/server/conf.c index e181fc4c5b..aabe01cc21 100644 --- a/server/conf.c +++ b/server/conf.c @@ -27,6 +27,7 @@ #include "netssl.h" #include "nut_stdint.h" #include +#include static ups_t *upstable = NULL; int num_ups = 0; @@ -420,7 +421,12 @@ void load_upsdconf(int reloading) pconf_init(&ctx, upsd_conf_err); +retry: if (!pconf_file_begin(&ctx, fn)) { + if (errno == EMFILE && reloading == 2) { + close_oldest_client(); + goto retry; + } pconf_finish(&ctx); if (!reloading) @@ -491,6 +497,24 @@ void load_upsdconf(int reloading) pconf_finish(&ctx); } +static int load_upsconf(int reloading) { + int ret; + + ret = read_upsconf(0); /* 0 = do not abort fatally just yet */ + if (ret == -1) { + if (errno == EMFILE && reloading == 2) { + upsdebugx(1, "%s: close an oldest client connection and try reading config again", __func__); + close_oldest_client(); + ret = read_upsconf(1); /* 1 = may abort upon fundamental errors */ + } else { + /* Not fatalx(), the method above already reported the problem */ + exit(EXIT_FAILURE); + } + } + + return ret; +} + /* callback during parsing of ups.conf */ void do_upsconf_args(char *upsname, char *var, char *val) { @@ -654,12 +678,19 @@ static int check_file(const char *fn) { char chkfn[NUT_PATH_MAX]; FILE *f; + int retries = 0; snprintf(chkfn, sizeof(chkfn), "%s/%s", confpath(), fn); +retry: f = fopen(chkfn, "r"); if (!f) { + if (errno == EMFILE && retries < 10) { + close_oldest_client(); + retries++; + goto retry; + } upslog_with_errno(LOG_ERR, "Reload failed: can't open %s", chkfn); return 0; /* failed */ } @@ -687,11 +718,11 @@ void conf_reload(void) } /* reload from ups.conf */ - read_upsconf(1); /* 1 = may abort upon fundamental errors */ + load_upsconf(2); /* 2 = reloading, and may retry by closing clients if EMFILE */ upsconf_add(1); /* 1 = reloading */ /* now reread upsd.conf */ - load_upsdconf(1); /* 1 = reloading */ + load_upsdconf(2); /* 2 = reloading, and may retry by closing clients if EMFILE */ /* now delete all UPS entries that didn't get reloaded */ diff --git a/server/upsd.c b/server/upsd.c index df887278b8..26782d2d8f 100644 --- a/server/upsd.c +++ b/server/upsd.c @@ -45,6 +45,9 @@ # include /* #include */ # endif +# ifdef HAVE_SYS_RESOURCE_H +# include /* for getrlimit() and struct rlimit */ +# endif #else /* WIN32 */ /* Those 2 files for support of getaddrinfo, getnameinfo and freeaddrinfo on Windows 2000 and older versions */ @@ -94,12 +97,14 @@ int allow_no_device = 0; */ int allow_not_all_listeners = 0; -/* preloaded to POSIX sysconf(_SC_OPEN_MAX) or WIN32 MAX_WAIT_OBJECTS in main +/* Preloaded to POSIX sysconf(_SC_OPEN_MAX) or WIN32 MAX_WAIT_OBJECTS in main * and elsewhere, the run-time value can be overridden via upsd.conf `MAXCONN` * option (may cause partial waits chunk by chunk, if sysmaxconn is smaller). + * The sysmaxconn_hard is derived from getrlimit() (aka `ulimit` on allowed + * opened file descriptors) where available. */ nfds_t maxconn = 0; -static nfds_t sysmaxconn = 0; +static nfds_t sysmaxconn = 0, sysmaxconn_hard = 0; /* preloaded to STATEPATH in main, can be overridden via upsd.conf */ char *statepath = NULL; @@ -1258,12 +1263,68 @@ static void update_sysmaxconn(void) char *s = getenv("NUT_SYSMAXCONN_LIMIT"); #ifndef WIN32 +# ifdef HAVE_SYS_RESOURCE_H + struct rlimit limit; +# endif /* HAVE_SYS_RESOURCE_H */ + /* default to system limit (may be overridden in upsd.conf) */ /* FIXME: Check for overflows (and int size of nfds_t vs. long) - see get_max_pid_t() for example */ l = sysconf(_SC_OPEN_MAX); + +# ifdef HAVE_SYS_RESOURCE_H + /* Try to use getrlimit/setrlimit to detect and possibly increase the limit */ + if (getrlimit(RLIMIT_NOFILE, &limit) == 0) { + upsdebugx(2, "%s: System file descriptor limits: soft=%ld, hard=%ld", + __func__, (long)limit.rlim_cur, (long)limit.rlim_max); + + /* If we requested a specific MAXCONN, try to ensure we have enough FDs */ + if (maxconn > 0) { + rlim_t needed = (rlim_t)maxconn + RESERVE_FD_COUNT_UPSD; + + if (limit.rlim_cur < needed) { + if (needed <= limit.rlim_max) { + upslogx(LOG_INFO, "Increasing file descriptor limit to %ld", (long)needed); + + limit.rlim_cur = needed; + if (setrlimit(RLIMIT_NOFILE, &limit) != 0) { + upslog_with_errno(LOG_WARNING, "setrlimit(RLIMIT_NOFILE) to %ld failed", (long)needed); + } + } else { + upslogx(LOG_WARNING, "WARNING: Requested MAXCONN %" PRIdMAX + " requires %ld FDs overall " + "(with %ld reserved for non-connection purposes), " + "but system hard limit is %ld", + (intmax_t)maxconn, (long)needed, + (long)RESERVE_FD_COUNT_UPSD, + (long)limit.rlim_max); + + /* We might still try to bump to hard limit */ + if (limit.rlim_cur < limit.rlim_max) { + limit.rlim_cur = limit.rlim_max; + setrlimit(RLIMIT_NOFILE, &limit); + } + } + } + } + + /* Refresh limit after possible update */ + getrlimit(RLIMIT_NOFILE, &limit); + sysmaxconn_hard = (long)limit.rlim_cur; + } else { +# endif /* HAVE_SYS_RESOURCE_H */ + /* Fallback to sysconf if getrlimit fails or is absent */ + /* TOTHINK: Any other reasonable fallback hard limit? */ + sysmaxconn_hard = (nfds_t)l; +# ifdef HAVE_SYS_RESOURCE_H + } +# endif /* HAVE_SYS_RESOURCE_H */ + #else /* WIN32 */ /* hard-coded 64 (from ddk/wdm.h or winnt.h) */ l = (long)MAXIMUM_WAIT_OBJECTS; + + /* No known limit, do not check */ + sysmaxconn_hard = 0; #endif /* WIN32 */ if (l < 1) { @@ -1276,11 +1337,35 @@ static void update_sysmaxconn(void) l); } + if (sysmaxconn_hard > 0 && sysmaxconn_hard < RESERVE_FD_COUNT_UPSD + 10) { + fatalx(EXIT_FAILURE, + "System reported an absurd value %ld (below the %ld reservation for\n" + "non-connection purposes and some 10 for driver/client/... connections)\n" + "as its hard maximum number of connections.\n" + "The server won't start until this problem is resolved.\n", + (long)sysmaxconn_hard, (long)RESERVE_FD_COUNT_UPSD); + } + /* Note this historically also serves as * the initial/default MAXCONN setting * (so site/platform-dependent). */ - sysmaxconn = (nfds_t)l; + if (sysmaxconn_hard > 0) { + if (l < RESERVE_FD_COUNT_UPSD + 10) { + fatalx(EXIT_FAILURE, + "System reported an absurd value %ld (below the %ld reservation for\n" + "non-connection purposes and some 10 for driver/client/... connections)\n" + "as its sysconf maximum number of connections.\n" + "The server won't start until this problem is resolved.\n", + l, (long)RESERVE_FD_COUNT_UPSD); + } + + sysmaxconn = (nfds_t)(l - RESERVE_FD_COUNT_UPSD); + } else { + /* No known limit on open FDs/handles, whether connections or files or other streams */ + sysmaxconn = (nfds_t)l; + } + if (maxconn < 1) { upsdebugx(1, "%s: defaulting maxconn to sysmaxconn: %ld", __func__, l); @@ -1308,6 +1393,15 @@ static void poll_reload(void) /* Not likely this would change, but refresh just in case */ update_sysmaxconn(); + if (sysmaxconn_hard > 0 && (maxconn > sysmaxconn_hard - RESERVE_FD_COUNT_UPSD)) { + fatalx(EXIT_FAILURE, + "You requested %" PRIdMAX " as maximum number of connections,\n" + "but the system only allows %" PRIdMAX " and we need %d for ourselves.\n" + "The server won't start until this problem is resolved\n" + "(reduce MAXCONN or increase ulimit or similar settings).\n", + (intmax_t)maxconn, (intmax_t)sysmaxconn, RESERVE_FD_COUNT_UPSD); + } + if ((intmax_t)sysmaxconn < (intmax_t)maxconn) { upslogx(LOG_WARNING, "Your system limits the maximum number of connections to %" PRIdMAX "\n" @@ -1593,6 +1687,7 @@ static void mainloop(void) if (reload_flag) { upsnotify(NOTIFY_STATE_RELOADING, NULL); conf_reload(); + /* Among other things, re-detect sysmaxconn after loading config, because MAXCONN might have changed */ poll_reload(); reload_flag = 0; upsnotify(NOTIFY_STATE_READY, NULL); @@ -2419,6 +2514,22 @@ void check_perms(const char *fn) #endif /* WIN32 */ } +void close_oldest_client(void) +{ + nut_ctype_t *client, *oldest = NULL; + + for (client = firstclient; client; client = client->next) { + if (!oldest || client->last_heard < oldest->last_heard) { + oldest = client; + } + } + + if (oldest) { + upslogx(LOG_INFO, "Closing oldest client connection from %s to free up file descriptors", oldest->addr); + client_disconnect(oldest); + } +} + int main(int argc, char **argv) { int opt_ret = 0, cmdret = 0, foreground = -1; @@ -2708,6 +2819,9 @@ int main(int argc, char **argv) /* handle upsd.conf */ load_upsdconf(0); /* 0 = initial */ + /* Re-detect sysmaxconn after loading config, because MAXCONN might have changed */ + update_sysmaxconn(); + /* CLI debug level can not be smaller than debug_min specified * in upsd.conf. Note that non-zero debug_min does not impact * foreground running mode. diff --git a/server/upsd.h b/server/upsd.h index bb61fe373e..1fe5c02284 100644 --- a/server/upsd.h +++ b/server/upsd.h @@ -1,6 +1,14 @@ /* upsd.h - support structures and other minor details - Copyright (C) 1999 Russell Kroll + Copyright (C) + 1999 Russell Kroll + 2005 - 2019 Arnaud Quette + 2006 - 2007 Peter Selinger + 2006 - 2007 Arjen de Korte + 2010 - 2011 Frederic Bohe + 2012 Charles Lepple + 2013 Emilien Kia + 2020 - 2026 Jim Klimov This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -74,17 +82,34 @@ int send_err_extra(nut_ctype_t *client, const char *errtype, const char *extra); void server_load(void); void server_free(void); +/* Can be called by configuration (re)loading logic to free up file descriptors */ +void close_oldest_client(void); + +/* Three usual stdin/stdout/stderr triplet FDs, new connection handler, + * work with config files (reload), maybe something else?.. + * Keep about this many file descriptors off limits to the connection + * processing loop (comparing to `ulimit` if applicable). Note we can + * process existing connections in chunks (especially on Windows where + * a limited amount of handles may be polled in one go), but can not + * exceed the overall allowance (if any) on opened file descriptors. + * + * Note that the C Standard OPEN_MAX and the _POSIX_OPEN_MAX allow some + * 16-20 FDs that a process may always reasonably expect to be at its + * disposal. + */ +#define RESERVE_FD_COUNT_UPSD 8 + void check_perms(const char *fn); /* return values for instcmd / setvar status tracking, * mapped on drivers/upshandler.h, apart from STAT_PENDING (initial state) */ enum { - STAT_PENDING = -1, /* not yet completed */ - STAT_HANDLED = 0, /* completed successfully (NUT_SUCCESS or "OK") */ - STAT_UNKNOWN, /* unspecified error (NUT_ERR_UNKNOWN) */ - STAT_INVALID, /* invalid command/setvar (NUT_ERR_INVALID_ARGUMENT) */ - STAT_FAILED, /* command/setvar failed (NUT_ERR_INSTCMD_FAILED / NUT_ERR_SET_FAILED) */ - STAT_CONVERSION_FAILED /* STAT_INSTCMD_CONVERSION_FAILED / STAT_SET_CONVERSION_FAILED in drivers/upshandler.h => "ERR INVALID-ARGUMENT" same as STAT_INVALID */ + STAT_PENDING = -1, /* not yet completed */ + STAT_HANDLED = 0, /* completed successfully (NUT_SUCCESS or "OK") */ + STAT_UNKNOWN, /* unspecified error (NUT_ERR_UNKNOWN) */ + STAT_INVALID, /* invalid command/setvar (NUT_ERR_INVALID_ARGUMENT) */ + STAT_FAILED, /* command/setvar failed (NUT_ERR_INSTCMD_FAILED / NUT_ERR_SET_FAILED) */ + STAT_CONVERSION_FAILED /* STAT_INSTCMD_CONVERSION_FAILED / STAT_SET_CONVERSION_FAILED in drivers/upshandler.h => "ERR INVALID-ARGUMENT" same as STAT_INVALID */ }; /* Commands and settings status tracking functions */ @@ -107,23 +132,22 @@ extern nut_ctype_t *firstclient; /* map commands onto signals */ #ifndef WIN32 -#define SIGCMD_STOP SIGTERM -#define SIGCMD_RELOAD SIGHUP +# define SIGCMD_STOP SIGTERM +# define SIGCMD_RELOAD SIGHUP #else /* WIN32 */ -#define SIGCMD_STOP COMMAND_STOP -#define SIGCMD_RELOAD COMMAND_RELOAD +# define SIGCMD_STOP COMMAND_STOP +# define SIGCMD_RELOAD COMMAND_RELOAD #endif /* WIN32 */ /* awkward way to make a string out of a numeric constant */ - #define string_const_aux(x) #x #define string_const(x) string_const_aux(x) #ifdef SHUT_RDWR -#define shutdown_how SHUT_RDWR +# define shutdown_how SHUT_RDWR #else -#define shutdown_how 2 -#endif +# define shutdown_how 2 +#endif /* SHUT_RDWR */ /* UUID v4 generation function * Note: 'dest' must be at least `UUID4_LEN` long */