From 6a305feb0f7bdcae9d0552e5d2bca9c48ec2e63f Mon Sep 17 00:00:00 2001 From: Pau Espin Pedrol Date: Fri, 24 May 2019 19:58:20 +0200 Subject: [PATCH] Add VTY commands to set error ctr thresholds osmo-trx will validate over time that those thresholds are not reached. If they are reached, osmo-trx will die. As a result, osmo-bts-trx will notice and will end up notifying the BSC about it (for instance because it will also restart its process). For instance: """ ctr-error-threshold rx_drop_events 2 per-minute ctr-error-threshold rx_underruns 10 per-second """ In those cases above, osmo-trx will die if rate_ctr rx_drop_events reaches a rate of 2 or more per minute, or it will die too if rx_underruns reaches 10 or more per second. Change-Id: I4bcf44dbf064e2e86dfc3b8a2ad18fea76fbd51a --- CommonLibs/trx_rate_ctr.cpp | 164 ++++++++++++++++++- CommonLibs/trx_rate_ctr.h | 25 +++ CommonLibs/trx_vty.c | 105 ++++++++++++ doc/manuals/chapters/counters.adoc | 59 +++++++ doc/manuals/chapters/counters_generated.adoc | 18 +- doc/manuals/vty/trx_vty_reference.xml | 31 ++++ 6 files changed, 391 insertions(+), 11 deletions(-) diff --git a/CommonLibs/trx_rate_ctr.cpp b/CommonLibs/trx_rate_ctr.cpp index 711b904c..d98caff6 100644 --- a/CommonLibs/trx_rate_ctr.cpp +++ b/CommonLibs/trx_rate_ctr.cpp @@ -56,6 +56,7 @@ extern "C" { #include #include #include +#include #include "osmo_signal.h" #include "trx_vty.h" @@ -68,18 +69,34 @@ extern "C" { (non-pending) counter data */ #define PENDING_CHAN_NONE SIZE_MAX +static void *trx_rate_ctr_ctx; + static struct rate_ctr_group** rate_ctrs; static struct device_counters* ctrs_pending; static size_t chan_len; static struct osmo_fd rate_ctr_timerfd; static Mutex rate_ctr_mutex; -enum { - TRX_CTR_RX_UNDERRUNS, - TRX_CTR_RX_OVERRUNS, - TRX_CTR_TX_UNDERRUNS, - TRX_CTR_RX_DROP_EV, - TRX_CTR_RX_DROP_SMPL, +struct osmo_timer_list threshold_timer; +static LLIST_HEAD(threshold_list); +static int threshold_timer_sched_secs; +static bool threshold_initied; + +const 
struct value_string rate_ctr_intv[] = { + { RATE_CTR_INTV_SEC, "per-second" }, + { RATE_CTR_INTV_MIN, "per-minute" }, + { RATE_CTR_INTV_HOUR, "per-hour" }, + { RATE_CTR_INTV_DAY, "per-day" }, + { 0, NULL } +}; + +const struct value_string trx_chan_ctr_names[] = { + { TRX_CTR_RX_UNDERRUNS, "rx_underruns" }, + { TRX_CTR_RX_OVERRUNS, "rx_overruns" }, + { TRX_CTR_TX_UNDERRUNS, "tx_underruns" }, + { TRX_CTR_RX_DROP_EV, "rx_drop_events" }, + { TRX_CTR_RX_DROP_SMPL, "rx_drop_samples" }, + { 0, NULL } }; static const struct rate_ctr_desc trx_chan_ctr_desc[] = { @@ -155,10 +172,99 @@ static int device_sig_cb(unsigned int subsys, unsigned int signal, return 0; } -/* Init rate_ctr subsystem. Expected to be called during process start by main thread */ +/************************************ + * ctr_threshold APIs + ************************************/ +static const char* ctr_threshold_2_vty_str(struct ctr_threshold *ctr) +{ + static char buf[256]; + int rc = 0; + rc += snprintf(buf, sizeof(buf), "ctr-error-threshold %s", get_value_string(trx_chan_ctr_names, ctr->ctr_id)); + rc += snprintf(buf + rc, sizeof(buf) - rc, " %d %s", ctr->val, get_value_string(rate_ctr_intv, ctr->intv)); + return buf; +} + +static void threshold_timer_cb(void *data) +{ + struct ctr_threshold *ctr_thr; + struct rate_ctr *rate_ctr; + size_t chan; + LOGC(DMAIN, DEBUG) << "threshold_timer_cb fired!"; + + llist_for_each_entry(ctr_thr, &threshold_list, list) { + for (chan = 0; chan < chan_len; chan++) { + rate_ctr = &rate_ctrs[chan]->ctr[ctr_thr->ctr_id]; + LOGCHAN(chan, DMAIN, INFO) << "checking threshold: " << ctr_threshold_2_vty_str(ctr_thr) + << " ("<< rate_ctr->intv[ctr_thr->intv].rate << " vs " << ctr_thr->val << ")"; + if (rate_ctr->intv[ctr_thr->intv].rate >= ctr_thr->val) { + LOGCHAN(chan, DMAIN, FATAL) << "threshold reached, stopping! 
" << ctr_threshold_2_vty_str(ctr_thr) + << " ("<< rate_ctr->intv[ctr_thr->intv].rate << " vs " << ctr_thr->val << ")"; + osmo_signal_dispatch(SS_MAIN, S_MAIN_STOP_REQUIRED, NULL); + return; + } + } + } + osmo_timer_schedule(&threshold_timer, threshold_timer_sched_secs, 0); +} + +static size_t ctr_threshold_2_seconds(struct ctr_threshold *ctr) +{ + size_t mult = 0; + switch (ctr->intv) { + case RATE_CTR_INTV_SEC: + mult = 1; + break; + case RATE_CTR_INTV_MIN: + mult = 60; + break; + case RATE_CTR_INTV_HOUR: + mult = 60*60; + break; + case RATE_CTR_INTV_DAY: + mult = 60*60*24; + break; + default: + OSMO_ASSERT(false); + } + return mult; +} + +static void threshold_timer_update_intv() { + struct ctr_threshold *ctr, *min_ctr; + size_t secs, min_secs; + + /* Avoid scheduling timer until itself and other structures are prepared + by trx_rate_ctr_init */ + if (!threshold_initied) + return; + + if (llist_empty(&threshold_list)) { + if (osmo_timer_pending(&threshold_timer)) + osmo_timer_del(&threshold_timer); + return; + } + + min_ctr = llist_first_entry(&threshold_list, struct ctr_threshold, list); + min_secs = ctr_threshold_2_seconds(min_ctr); + + llist_for_each_entry(ctr, &threshold_list, list) { + secs = ctr_threshold_2_seconds(ctr); + if( min_secs > secs) + min_secs = secs; + } + + + threshold_timer_sched_secs = OSMO_MAX(min_secs / 2 - 1, 1); + LOGC(DMAIN, INFO) << "New ctr-error-threshold check interval: " + << threshold_timer_sched_secs << " seconds"; + osmo_timer_schedule(&threshold_timer, threshold_timer_sched_secs, 0); +} + +/* Init rate_ctr subsystem. 
Expected to be called during process start by main thread before VTY is ready */ void trx_rate_ctr_init(void *ctx, struct trx_ctx* trx_ctx) { size_t i; + trx_rate_ctr_ctx = ctx; chan_len = trx_ctx->cfg.num_chans; ctrs_pending = (struct device_counters*) talloc_zero_size(ctx, chan_len * sizeof(struct device_counters)); rate_ctrs = (struct rate_ctr_group**) talloc_zero_size(ctx, chan_len * sizeof(struct rate_ctr_group*)); @@ -177,4 +283,48 @@ void trx_rate_ctr_init(void *ctx, struct trx_ctx* trx_ctx) exit(1); } osmo_signal_register_handler(SS_DEVICE, device_sig_cb, NULL); + + /* Now set up threshold checks */ + threshold_initied = true; + osmo_timer_setup(&threshold_timer, threshold_timer_cb, NULL); + threshold_timer_update_intv(); +} + +void trx_rate_ctr_threshold_add(struct ctr_threshold *ctr) +{ + struct ctr_threshold *new_ctr; + + new_ctr = talloc_zero(trx_rate_ctr_ctx, struct ctr_threshold); + *new_ctr = *ctr; + LOGC(DMAIN, NOTICE) << "Adding new threshold check: " << ctr_threshold_2_vty_str(new_ctr); + llist_add(&new_ctr->list, &threshold_list); + threshold_timer_update_intv(); +} + +int trx_rate_ctr_threshold_del(struct ctr_threshold *del_ctr) +{ + struct ctr_threshold *ctr; + + llist_for_each_entry(ctr, &threshold_list, list) { + if (ctr->intv != del_ctr->intv || + ctr->ctr_id != del_ctr->ctr_id || + ctr->val != del_ctr->val) + continue; + + LOGC(DMAIN, NOTICE) << "Deleting threshold check: " << ctr_threshold_2_vty_str(del_ctr); + llist_del(&ctr->list); + talloc_free(ctr); + threshold_timer_update_intv(); + return 0; + } + return -1; +} + +void trx_rate_ctr_threshold_write_config(struct vty *vty, char *indent_prefix) +{ + struct ctr_threshold *ctr; + + llist_for_each_entry(ctr, &threshold_list, list) { + vty_out(vty, "%s%s%s", indent_prefix, ctr_threshold_2_vty_str(ctr), VTY_NEWLINE); + } } diff --git a/CommonLibs/trx_rate_ctr.h b/CommonLibs/trx_rate_ctr.h index 48131e7c..6e4fa4d2 100644 --- a/CommonLibs/trx_rate_ctr.h +++ b/CommonLibs/trx_rate_ctr.h @@ -1,4 
+1,29 @@ #pragma once +#include +#include + +enum TrxCtr { + TRX_CTR_RX_UNDERRUNS, + TRX_CTR_RX_OVERRUNS, + TRX_CTR_TX_UNDERRUNS, + TRX_CTR_RX_DROP_EV, + TRX_CTR_RX_DROP_SMPL, +}; + +struct ctr_threshold { + /*! Linked list of all counter groups in the system */ + struct llist_head list; + enum rate_ctr_intv intv; + enum TrxCtr ctr_id; + uint32_t val; +}; + +extern const struct value_string rate_ctr_intv[]; +extern const struct value_string trx_chan_ctr_names[]; + struct trx_ctx; void trx_rate_ctr_init(void *ctx, struct trx_ctx* trx_ctx); +void trx_rate_ctr_threshold_add(struct ctr_threshold *ctr); +int trx_rate_ctr_threshold_del(struct ctr_threshold *del_ctr); +void trx_rate_ctr_threshold_write_config(struct vty *vty, char *indent_prefix); diff --git a/CommonLibs/trx_vty.c b/CommonLibs/trx_vty.c index 06e20ab3..4cc827b2 100644 --- a/CommonLibs/trx_vty.c +++ b/CommonLibs/trx_vty.c @@ -31,6 +31,7 @@ #include #include +#include "trx_rate_ctr.h" #include "trx_vty.h" #include "../config.h" @@ -347,6 +348,107 @@ DEFUN(cfg_filler, cfg_filler_cmd, return CMD_SUCCESS; } +static int vty_ctr_name_2_id(const char* str) { + size_t i; + for (i = 0; trx_chan_ctr_names[i].str; i++) { + if (strstr(trx_chan_ctr_names[i].str, str)) { + return i; + } + } + return -1; +} + +static int vty_intv_name_2_id(const char* str) { + size_t i; + for (i = 0; rate_ctr_intv[i].str; i++) { + if (strcmp(rate_ctr_intv[i].str, str) == 0) { + return i; + } + } + return -1; +} + +#define THRESHOLD_ARGS "(rx_underruns|rx_overruns|tx_underruns|rx_drop_events|rx_drop_samples)" +#define THRESHOLD_STR_VAL(s) "Set threshold value for rate_ctr device:" OSMO_STRINGIFY_VAL(s) "\n" +#define THRESHOLD_STRS \ + THRESHOLD_STR_VAL(rx_underruns) \ + THRESHOLD_STR_VAL(rx_overruns) \ + THRESHOLD_STR_VAL(tx_underruns) \ + THRESHOLD_STR_VAL(rx_drop_events) \ + THRESHOLD_STR_VAL(rx_drop_samples) +#define INTV_ARGS "(per-second|per-minute|per-hour|per-day)" +#define INTV_STR_VAL(s) "Threshold value sampled " 
OSMO_STRINGIFY_VAL(s) "\n" +#define INTV_STRS \ + INTV_STR_VAL(per-second) \ + INTV_STR_VAL(per-minute) \ + INTV_STR_VAL(per-hour) \ + INTV_STR_VAL(per-day) + +DEFUN(cfg_ctr_error_threshold, cfg_ctr_error_threshold_cmd, + "ctr-error-threshold " THRESHOLD_ARGS " <0-65535> " INTV_ARGS, + "Threshold rate for error counter\n" + THRESHOLD_STRS + "Value to set for threshold\n" + INTV_STRS) +{ + int rc; + struct ctr_threshold ctr; + + struct trx_ctx *trx = trx_from_vty(vty); + rc = vty_ctr_name_2_id(argv[0]); + if (rc < 0) { + vty_out(vty, "No valid ctr_name found for ctr-error-threshold %s%s", + argv[0], VTY_NEWLINE); + return CMD_WARNING; + } + ctr.ctr_id = (enum TrxCtr)rc; + ctr.val = atoi(argv[1]); + rc = vty_intv_name_2_id(argv[2]); + if (rc < 0) { + vty_out(vty, "No valid time frame found for ctr-error-threshold %s %d %s%s", + argv[0], ctr.val, argv[2], VTY_NEWLINE); + return CMD_WARNING; + } + ctr.intv = (enum rate_ctr_intv) rc; + trx_rate_ctr_threshold_add(&ctr); + + return CMD_SUCCESS; +} + +DEFUN(cfg_no_ctr_error_threshold, cfg_no_ctr_error_threshold_cmd, + "no ctr-error-threshold " THRESHOLD_ARGS " <0-65535> " INTV_ARGS, + NO_STR "Threshold rate for error counter\n" + THRESHOLD_STRS + "Value to set for threshold\n" + INTV_STRS) +{ + int rc; + struct ctr_threshold ctr; + + struct trx_ctx *trx = trx_from_vty(vty); + rc = vty_ctr_name_2_id(argv[0]); + if (rc < 0) { + vty_out(vty, "No valid ctr_name found for ctr-error-threshold %s%s", + argv[0], VTY_NEWLINE); + return CMD_WARNING; + } + ctr.ctr_id = (enum TrxCtr)rc; + ctr.val = atoi(argv[1]); + rc = vty_intv_name_2_id(argv[2]); + if (rc < 0) { + vty_out(vty, "No valid time frame found for ctr-error-threshold %s %d %s%s", + argv[0], ctr.val, argv[2], VTY_NEWLINE); + return CMD_WARNING; + } + ctr.intv = (enum rate_ctr_intv) rc; + if (trx_rate_ctr_threshold_del(&ctr) < 0) { + vty_out(vty, "no ctr-error-threshold: Entry to delete not found%s", VTY_NEWLINE); + return CMD_WARNING; + } + + return CMD_SUCCESS; +} + 
DEFUN(cfg_chan, cfg_chan_cmd, "chan <0-100>", "Select a channel to configure\n" @@ -444,6 +546,7 @@ static int config_write_trx(struct vty *vty) vty_out(vty, " ext-rach %s%s", trx->cfg.ext_rach ? "enable" : "disable", VTY_NEWLINE); if (trx->cfg.sched_rr != 0) vty_out(vty, " rt-prio %u%s", trx->cfg.sched_rr, VTY_NEWLINE); + trx_rate_ctr_threshold_write_config(vty, " "); for (i = 0; i < trx->cfg.num_chans; i++) { chan = &trx->cfg.chans[i]; @@ -593,6 +696,8 @@ int trx_vty_init(struct trx_ctx* trx) install_element(TRX_NODE, &cfg_ext_rach_cmd); install_element(TRX_NODE, &cfg_rt_prio_cmd); install_element(TRX_NODE, &cfg_filler_cmd); + install_element(TRX_NODE, &cfg_ctr_error_threshold_cmd); + install_element(TRX_NODE, &cfg_no_ctr_error_threshold_cmd); install_element(TRX_NODE, &cfg_chan_cmd); install_node(&chan_node, dummy_config_write); diff --git a/doc/manuals/chapters/counters.adoc b/doc/manuals/chapters/counters.adoc index 7fbb10c6..79d19620 100644 --- a/doc/manuals/chapters/counters.adoc +++ b/doc/manuals/chapters/counters.adoc @@ -2,3 +2,62 @@ == Counters include::./counters_generated.adoc[] + +=== Rate Counter Configurable Error Thresholds + +Some rate counters such as overruns, underruns and dropped packets indicate +events that can really harm correct operation of the BTS served by OsmoTRX, +especially if they happen frequently. OsmoTRX is in most cases (depending on +the maturity of the device driver) prepared to dodge the temporary failure and +keep running and providing service. + +Still, it is sometimes important for these kinds of events to not go unnoticed +by the operator, since they may indicate issues with the setup that may +require operator intervention to fix. + +For instance, frequent dropped packets could indicate SDR HW/FW/power errors, or +a faulty connection to the host running OsmoTRX. 
+ +They can also indicate issues on the host running OsmoTRX itself: For instance, +OsmoTRX may not be running under a high enough priority (so other processes +end up competing with it for resources), or the HW running OsmoTRX may simply +not be powerful enough to accomplish all work in a timely fashion all +the time. + +As a result, OsmoTRX can be configured to exit the process upon certain +conditions being met, in order to let OsmoBTS notice something is wrong and thus +announce issues through alarms to the network, where the operator can then +investigate the issue by looking at OsmoTRX logs. + +These conditions are configured by means of introducing rate counter thresholds +in the VTY. The OsmoTRX user can provide those threshold commands either in the +VTY cfg file read by the OsmoTRX process during startup, or by adding/removing +them dynamically through the VTY interactive console. + +Each threshold cmd states an event (a rate counter type), a value and a time +interval (a second, a minute, an hour or a day). A threshold will be reached +(and OsmoTRX stopped) if the counter's rate reaches or exceeds the configured +threshold value over the configured time interval. This is the syntax used to +manage rate counter thresholds: + +`(no) ctr-error-threshold <EVENT> <VALUE> <INTERVAL>` + +If several rate counter thresholds are set, then all of them are checked over +time and the first one reached will stop OsmoTRX. + +.Example: rate counter threshold configuration (VTY .cfg file) +---- +trx + ctr-error-threshold rx_drop_events 2 per-minute <1> + ctr-error-threshold rx_drop_samples 800 per-second <2> +---- +<1> Stop OsmoTRX if a drop event (of any amount of samples) during Rx was detected 2 or more times within a minute. +<2> Stop OsmoTRX if 800 or more Rx samples were detected as dropped by the HW within a second. 
+ +.Example: rate counter threshold configuration (VTY interactive) +---- +OsmoTRX(config-trx)# ctr-error-threshold tx_underruns 3 per-hour <1> +OsmoTRX(config-trx)# no ctr-error-threshold tx_underruns 3 per-hour <2> +---- +<1> Stop OsmoTRX if 3 or more underruns were detected during Tx over the last hour +<2> Remove previously set rate counter threshold diff --git a/doc/manuals/chapters/counters_generated.adoc b/doc/manuals/chapters/counters_generated.adoc index b40dc37d..6955b180 100644 --- a/doc/manuals/chapters/counters_generated.adoc +++ b/doc/manuals/chapters/counters_generated.adoc @@ -1,7 +1,17 @@ // autogenerated by show asciidoc counters -These counters and their description based on OsmoTRX 0.2.0.61-408f (OsmoTRX). +These counters and their description based on OsmoTRX 1.0.0.43-3f7c0 (OsmoTRX). + +=== Rate Counters // generating tables for rate_ctr_group -// generating tables for osmo_stat_items -// generating tables for osmo_counters -// there are no ungrouped osmo_counters +// rate_ctr_group table osmo-trx statistics +.trx:chan - osmo-trx statistics +[options="header"] +|=== +| Name | Reference | Description +| device:rx_underruns | <> | Number of Rx underruns +| device:rx_overruns | <> | Number of Rx overruns +| device:tx_underruns | <> | Number of Tx underruns +| device:rx_drop_events | <> | Number of times Rx samples were dropped by HW +| device:rx_drop_samples | <> | Number of Rx samples dropped by HW +|=== diff --git a/doc/manuals/vty/trx_vty_reference.xml b/doc/manuals/vty/trx_vty_reference.xml index d6cd15db..e448a46b 100644 --- a/doc/manuals/vty/trx_vty_reference.xml +++ b/doc/manuals/vty/trx_vty_reference.xml @@ -1253,6 +1253,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +