Add VTY commands to set error ctr thresholds

osmo-trx will validate over time that those thresholds are not reached.
If they are reached, osmo-trx will die. As a result, osmo-bts-trx will
notice and will end up notifying the BSC about it (for instance because
it will also restart its process).

For instance:
"""
ctr-error-threshold rx_drop_events 2 per-minute
ctr-error-threshold rx_underruns 10 per-second
"""

In the cases above, osmo-trx will die if rate_ctr rx_drop_events reaches
a rate of 2 or more per minute, and it will also die if rx_underruns
reaches a rate of 10 or more per second.

Change-Id: I4bcf44dbf064e2e86dfc3b8a2ad18fea76fbd51a
This commit is contained in:
Pau Espin 2019-05-24 19:58:20 +02:00 committed by pespin
parent bde55afd29
commit 6a305feb0f
6 changed files with 391 additions and 11 deletions

View File

@ -56,6 +56,7 @@ extern "C" {
#include <osmocom/core/rate_ctr.h>
#include <osmocom/core/select.h>
#include <osmocom/core/stats.h>
#include <osmocom/core/timer.h>
#include "osmo_signal.h"
#include "trx_vty.h"
@ -68,18 +69,34 @@ extern "C" {
(non-pending) counter data */
#define PENDING_CHAN_NONE SIZE_MAX
static void *trx_rate_ctr_ctx;
static struct rate_ctr_group** rate_ctrs;
static struct device_counters* ctrs_pending;
static size_t chan_len;
static struct osmo_fd rate_ctr_timerfd;
static Mutex rate_ctr_mutex;
enum {
TRX_CTR_RX_UNDERRUNS,
TRX_CTR_RX_OVERRUNS,
TRX_CTR_TX_UNDERRUNS,
TRX_CTR_RX_DROP_EV,
TRX_CTR_RX_DROP_SMPL,
struct osmo_timer_list threshold_timer;
static LLIST_HEAD(threshold_list);
static int threshold_timer_sched_secs;
static bool threshold_initied;
/* Maps each rate counter interval to the keyword used for it in the
 * "ctr-error-threshold" command line: ctr_threshold_2_vty_str() prints these
 * strings and the VTY code compares the user's interval argument against them.
 * The list is terminated by the { 0, NULL } entry. */
const struct value_string rate_ctr_intv[] = {
{ RATE_CTR_INTV_SEC, "per-second" },
{ RATE_CTR_INTV_MIN, "per-minute" },
{ RATE_CTR_INTV_HOUR, "per-hour" },
{ RATE_CTR_INTV_DAY, "per-day" },
{ 0, NULL }
};
/* Maps each per-channel counter id to the counter name used in the
 * "ctr-error-threshold" command line: ctr_threshold_2_vty_str() prints these
 * strings and the VTY code looks the user's counter argument up in this table.
 * The list is terminated by the { 0, NULL } entry. */
const struct value_string trx_chan_ctr_names[] = {
{ TRX_CTR_RX_UNDERRUNS, "rx_underruns" },
{ TRX_CTR_RX_OVERRUNS, "rx_overruns" },
{ TRX_CTR_TX_UNDERRUNS, "tx_underruns" },
{ TRX_CTR_RX_DROP_EV, "rx_drop_events" },
{ TRX_CTR_RX_DROP_SMPL, "rx_drop_samples" },
{ 0, NULL }
};
static const struct rate_ctr_desc trx_chan_ctr_desc[] = {
@ -155,10 +172,99 @@ static int device_sig_cb(unsigned int subsys, unsigned int signal,
return 0;
}
/* Init rate_ctr subsystem. Expected to be called during process start by main thread */
/************************************
* ctr_threshold APIs
************************************/
/* Render a threshold as the VTY command line that configures it:
 *   "ctr-error-threshold <counter-name> <value> <interval>"
 *
 * \param[in] ctr threshold to render (ctr_id, val and intv are read)
 * \returns pointer to a function-local static buffer holding the string. The
 *          buffer is overwritten by the next call, so the caller must use or
 *          copy the result before calling again, and must not free it.
 */
static const char* ctr_threshold_2_vty_str(struct ctr_threshold *ctr)
{
	static char buf[256];
	int rc;

	rc = snprintf(buf, sizeof(buf), "ctr-error-threshold %s",
		      get_value_string(trx_chan_ctr_names, ctr->ctr_id));
	if (rc < 0) {
		/* Encoding error: buffer content is unspecified, hand back an empty string */
		buf[0] = '\0';
		return buf;
	}
	/* snprintf returns the length it *wanted* to write. If the first part
	 * was truncated, using rc as offset would point past the buffer and
	 * make the remaining size wrap around, so stop here instead. */
	if ((size_t)rc >= sizeof(buf))
		return buf;

	/* val is a uint32_t, hence unsigned conversion rather than %d */
	snprintf(buf + rc, sizeof(buf) - (size_t)rc, " %u %s",
		 (unsigned int)ctr->val, get_value_string(rate_ctr_intv, ctr->intv));
	return buf;
}
/* Timer callback: compare every configured threshold against the current rate
 * of the matching counter on every channel.
 *
 * As soon as one counter rate is equal to or above its threshold value, a
 * S_MAIN_STOP_REQUIRED signal is dispatched and the function returns without
 * re-arming the timer. Otherwise the timer is scheduled again after
 * threshold_timer_sched_secs seconds.
 *
 * \param[in] data unused
 */
static void threshold_timer_cb(void *data)
{
	struct ctr_threshold *thr;

	LOGC(DMAIN, DEBUG) << "threshold_timer_cb fired!";

	llist_for_each_entry(thr, &threshold_list, list) {
		for (size_t ch = 0; ch < chan_len; ch++) {
			struct rate_ctr *cur = &rate_ctrs[ch]->ctr[thr->ctr_id];

			LOGCHAN(ch, DMAIN, INFO) << "checking threshold: " << ctr_threshold_2_vty_str(thr)
						 << " ("<< cur->intv[thr->intv].rate << " vs " << thr->val << ")";

			/* Still below the limit: move on to the next channel */
			if (cur->intv[thr->intv].rate < thr->val)
				continue;

			LOGCHAN(ch, DMAIN, FATAL) << "threshold reached, stopping! " << ctr_threshold_2_vty_str(thr)
						  << " ("<< cur->intv[thr->intv].rate << " vs " << thr->val << ")";
			osmo_signal_dispatch(SS_MAIN, S_MAIN_STOP_REQUIRED, NULL);
			return;
		}
	}

	/* Nothing tripped: check again later */
	osmo_timer_schedule(&threshold_timer, threshold_timer_sched_secs, 0);
}
/* Length in seconds of the interval a threshold is measured over.
 *
 * \param[in] ctr threshold whose intv field is inspected
 * \returns 1, 60, 3600 or 86400 for the per-second/-minute/-hour/-day
 *          intervals; any other interval value trips OSMO_ASSERT.
 */
static size_t ctr_threshold_2_seconds(struct ctr_threshold *ctr)
{
	switch (ctr->intv) {
	case RATE_CTR_INTV_SEC:
		return 1;
	case RATE_CTR_INTV_MIN:
		return 60;
	case RATE_CTR_INTV_HOUR:
		return 60*60;
	case RATE_CTR_INTV_DAY:
		return 60*60*24;
	default:
		OSMO_ASSERT(false);
	}
	/* Only reached if the assert above did not terminate the process */
	return 0;
}
/* Recompute how often thresholds are checked and (re)schedule the check timer.
 *
 * Does nothing until trx_rate_ctr_init has set threshold_initied. With an
 * empty threshold list a pending timer is cancelled. Otherwise the period is
 * derived from the shortest interval among the configured thresholds:
 * half of it minus one second, but never less than one second.
 */
static void threshold_timer_update_intv() {
	struct ctr_threshold *ctr;
	size_t secs, min_secs, half;

	/* Avoid scheduling timer until itself and other structures are prepared
	   by trx_rate_ctr_init */
	if (!threshold_initied)
		return;

	if (llist_empty(&threshold_list)) {
		if (osmo_timer_pending(&threshold_timer))
			osmo_timer_del(&threshold_timer);
		return;
	}

	/* Find the shortest interval among all configured thresholds */
	min_secs = ctr_threshold_2_seconds(llist_first_entry(&threshold_list, struct ctr_threshold, list));
	llist_for_each_entry(ctr, &threshold_list, list) {
		secs = ctr_threshold_2_seconds(ctr);
		if (min_secs > secs)
			min_secs = secs;
	}

	/* min_secs is unsigned: "min_secs / 2 - 1" wraps around to a huge value
	 * for a per-second threshold (1 / 2 - 1), which would then win the
	 * maximum and end up as a bogus timer period. Only subtract when the
	 * result stays >= 1, and fall back to the 1 second floor otherwise. */
	half = min_secs / 2;
	threshold_timer_sched_secs = (half > 1) ? (int)(half - 1) : 1;

	LOGC(DMAIN, INFO) << "New ctr-error-threshold check interval: "
			  << threshold_timer_sched_secs << " seconds";
	osmo_timer_schedule(&threshold_timer, threshold_timer_sched_secs, 0);
}
/* Init rate_ctr subsystem. Expected to be called during process start by main thread before VTY is ready */
void trx_rate_ctr_init(void *ctx, struct trx_ctx* trx_ctx)
{
size_t i;
trx_rate_ctr_ctx = ctx;
chan_len = trx_ctx->cfg.num_chans;
ctrs_pending = (struct device_counters*) talloc_zero_size(ctx, chan_len * sizeof(struct device_counters));
rate_ctrs = (struct rate_ctr_group**) talloc_zero_size(ctx, chan_len * sizeof(struct rate_ctr_group*));
@ -177,4 +283,48 @@ void trx_rate_ctr_init(void *ctx, struct trx_ctx* trx_ctx)
exit(1);
}
osmo_signal_register_handler(SS_DEVICE, device_sig_cb, NULL);
/* Now set up threshold checks */
threshold_initied = true;
osmo_timer_setup(&threshold_timer, threshold_timer_cb, NULL);
threshold_timer_update_intv();
}
/* Register a new threshold check.
 *
 * A copy of *ctr is allocated under trx_rate_ctr_ctx and linked at the head of
 * threshold_list, so the caller keeps ownership of its own struct. The check
 * timer period is recomputed afterwards.
 *
 * \param[in] ctr threshold description to copy
 */
void trx_rate_ctr_threshold_add(struct ctr_threshold *ctr)
{
	struct ctr_threshold *entry = talloc_zero(trx_rate_ctr_ctx, struct ctr_threshold);

	*entry = *ctr;
	LOGC(DMAIN, NOTICE) << "Adding new threshold check: " << ctr_threshold_2_vty_str(entry);

	/* llist_add re-initialises the list pointers copied from *ctr */
	llist_add(&entry->list, &threshold_list);
	threshold_timer_update_intv();
}
/* Remove the first configured threshold whose interval, counter id and value
 * all equal those of *del_ctr.
 *
 * The matching entry is unlinked and freed, and the check timer period is
 * recomputed.
 *
 * \param[in] del_ctr description of the threshold to remove
 * \returns 0 if an entry was removed, -1 if no entry matched
 */
int trx_rate_ctr_threshold_del(struct ctr_threshold *del_ctr)
{
	struct ctr_threshold *cur;

	llist_for_each_entry(cur, &threshold_list, list) {
		const bool same = cur->intv == del_ctr->intv &&
				  cur->ctr_id == del_ctr->ctr_id &&
				  cur->val == del_ctr->val;

		if (same) {
			LOGC(DMAIN, NOTICE) << "Deleting threshold check: " << ctr_threshold_2_vty_str(del_ctr);
			llist_del(&cur->list);
			talloc_free(cur);
			threshold_timer_update_intv();
			/* Returning right away keeps the iteration off the freed entry */
			return 0;
		}
	}

	return -1;
}
/* Print one "ctr-error-threshold ..." config line per configured threshold.
 *
 * \param[in] vty VTY to print to
 * \param[in] indent_prefix string put in front of every line
 */
void trx_rate_ctr_threshold_write_config(struct vty *vty, char *indent_prefix)
{
	struct ctr_threshold *thr;

	llist_for_each_entry(thr, &threshold_list, list) {
		const char *cmd_line = ctr_threshold_2_vty_str(thr);

		vty_out(vty, "%s%s%s", indent_prefix, cmd_line, VTY_NEWLINE);
	}
}

View File

@ -1,4 +1,29 @@
#pragma once
#include <osmocom/core/rate_ctr.h>
#include <osmocom/vty/command.h>
/* Identifiers of the per-channel device rate counters. The numeric values are
 * used as indexes into a channel's counter array, so they must stay dense and
 * start at zero. */
enum TrxCtr {
	TRX_CTR_RX_UNDERRUNS = 0,	/* Rx underruns */
	TRX_CTR_RX_OVERRUNS = 1,	/* Rx overruns */
	TRX_CTR_TX_UNDERRUNS = 2,	/* Tx underruns */
	TRX_CTR_RX_DROP_EV = 3,		/* times Rx samples were dropped by HW */
	TRX_CTR_RX_DROP_SMPL = 4,	/* Rx samples dropped by HW */
};
/*! One configured error threshold: the counter, the limit value and the
 *  interval over which the counter's rate is compared against that limit. */
struct ctr_threshold {
/*! Entry in the list of configured thresholds */
struct llist_head list;
/*! Interval (per second/minute/hour/day) the rate is taken over */
enum rate_ctr_intv intv;
/*! Which per-channel counter is checked */
enum TrxCtr ctr_id;
/*! Limit value the counter rate is compared against */
uint32_t val;
};
extern const struct value_string rate_ctr_intv[];
extern const struct value_string trx_chan_ctr_names[];
struct trx_ctx;
void trx_rate_ctr_init(void *ctx, struct trx_ctx* trx_ctx);
void trx_rate_ctr_threshold_add(struct ctr_threshold *ctr);
int trx_rate_ctr_threshold_del(struct ctr_threshold *del_ctr);
void trx_rate_ctr_threshold_write_config(struct vty *vty, char *indent_prefix);

View File

@ -31,6 +31,7 @@
#include <osmocom/vty/vty.h>
#include <osmocom/vty/misc.h>
#include "trx_rate_ctr.h"
#include "trx_vty.h"
#include "../config.h"
@ -347,6 +348,107 @@ DEFUN(cfg_filler, cfg_filler_cmd,
return CMD_SUCCESS;
}
/* Translate a counter name given on the VTY into its counter id.
 *
 * The name must equal one of the strings in trx_chan_ctr_names exactly, the
 * same way vty_intv_name_2_id() matches interval names. A substring search
 * would accept any fragment of a name (even the empty string) and silently
 * pick the first table entry containing it.
 *
 * \param[in] str counter name, e.g. "rx_underruns"
 * \returns the value stored in the table for that name, or -1 if unknown
 */
static int vty_ctr_name_2_id(const char* str) {
	size_t i;

	for (i = 0; trx_chan_ctr_names[i].str; i++) {
		if (strcmp(trx_chan_ctr_names[i].str, str) == 0) {
			/* Return the table's value rather than the row index, so the
			 * result stays right even if the table order ever changes */
			return (int)trx_chan_ctr_names[i].value;
		}
	}
	return -1;
}
/* Translate an interval keyword given on the VTY into its interval id.
 *
 * \param[in] str interval keyword; must equal one of the strings in
 *                rate_ctr_intv exactly (e.g. "per-minute")
 * \returns the value stored in the table for that keyword, or -1 if unknown
 */
static int vty_intv_name_2_id(const char* str) {
	size_t i;

	for (i = 0; rate_ctr_intv[i].str; i++) {
		if (strcmp(rate_ctr_intv[i].str, str) == 0) {
			/* Return the table's value rather than the row index: the two
			 * only coincide as long as the table is sorted by value */
			return (int)rate_ctr_intv[i].value;
		}
	}
	return -1;
}
/* Counter-name alternatives accepted by the (no) ctr-error-threshold commands.
 * The keywords are the same strings that vty_ctr_name_2_id() looks up in
 * trx_chan_ctr_names. */
#define THRESHOLD_ARGS "(rx_underruns|rx_overruns|tx_underruns|rx_drop_events|rx_drop_samples)"
/* One VTY help line for counter name s */
#define THRESHOLD_STR_VAL(s) "Set threshold value for rate_ctr device:" OSMO_STRINGIFY_VAL(s) "\n"
/* Help lines for all counter names, in the same order as THRESHOLD_ARGS */
#define THRESHOLD_STRS \
THRESHOLD_STR_VAL(rx_underruns) \
THRESHOLD_STR_VAL(rx_overruns) \
THRESHOLD_STR_VAL(tx_underruns) \
THRESHOLD_STR_VAL(rx_drop_events) \
THRESHOLD_STR_VAL(rx_drop_samples)
/* Interval alternatives accepted by the (no) ctr-error-threshold commands.
 * The keywords are the same strings that vty_intv_name_2_id() looks up in
 * rate_ctr_intv. */
#define INTV_ARGS "(per-second|per-minute|per-hour|per-day)"
/* One VTY help line for interval keyword s */
#define INTV_STR_VAL(s) "Threshold value sampled " OSMO_STRINGIFY_VAL(s) "\n"
/* Help lines for all interval keywords, in the same order as INTV_ARGS */
#define INTV_STRS \
INTV_STR_VAL(per-second) \
INTV_STR_VAL(per-minute) \
INTV_STR_VAL(per-hour) \
INTV_STR_VAL(per-day)
DEFUN(cfg_ctr_error_threshold, cfg_ctr_error_threshold_cmd,
"ctr-error-threshold " THRESHOLD_ARGS " <0-65535> " INTV_ARGS,
"Threshold rate for error counter\n"
THRESHOLD_STRS
"Value to set for threshold\n"
INTV_STRS)
{
int rc;
struct ctr_threshold ctr;
struct trx_ctx *trx = trx_from_vty(vty);
rc = vty_ctr_name_2_id(argv[0]);
if (rc < 0) {
vty_out(vty, "No valid ctr_name found for ctr-error-threshold %s%s",
argv[0], VTY_NEWLINE);
return CMD_WARNING;
}
ctr.ctr_id = (enum TrxCtr)rc;
ctr.val = atoi(argv[1]);
rc = vty_intv_name_2_id(argv[2]);
if (rc < 0) {
vty_out(vty, "No valid time frame found for ctr-error-threshold %s %d %s%s",
argv[0], ctr.val, argv[2], VTY_NEWLINE);
return CMD_WARNING;
}
ctr.intv = (enum rate_ctr_intv) rc;
trx_rate_ctr_threshold_add(&ctr);
return CMD_SUCCESS;
}
DEFUN(cfg_no_ctr_error_threshold, cfg_no_ctr_error_threshold_cmd,
"no ctr-error-threshold " THRESHOLD_ARGS " <0-65535> " INTV_ARGS,
NO_STR "Threshold rate for error counter\n"
THRESHOLD_STRS
"Value to set for threshold\n"
INTV_STRS)
{
int rc;
struct ctr_threshold ctr;
struct trx_ctx *trx = trx_from_vty(vty);
rc = vty_ctr_name_2_id(argv[0]);
if (rc < 0) {
vty_out(vty, "No valid ctr_name found for ctr-error-threshold %s%s",
argv[0], VTY_NEWLINE);
return CMD_WARNING;
}
ctr.ctr_id = (enum TrxCtr)rc;
ctr.val = atoi(argv[1]);
rc = vty_intv_name_2_id(argv[2]);
if (rc < 0) {
vty_out(vty, "No valid time frame found for ctr-error-threshold %s %d %s%s",
argv[0], ctr.val, argv[2], VTY_NEWLINE);
return CMD_WARNING;
}
ctr.intv = (enum rate_ctr_intv) rc;
if (trx_rate_ctr_threshold_del(&ctr) < 0) {
vty_out(vty, "no ctr-error-threshold: Entry to delete not found%s", VTY_NEWLINE);
return CMD_WARNING;
}
return CMD_SUCCESS;
}
DEFUN(cfg_chan, cfg_chan_cmd,
"chan <0-100>",
"Select a channel to configure\n"
@ -444,6 +546,7 @@ static int config_write_trx(struct vty *vty)
vty_out(vty, " ext-rach %s%s", trx->cfg.ext_rach ? "enable" : "disable", VTY_NEWLINE);
if (trx->cfg.sched_rr != 0)
vty_out(vty, " rt-prio %u%s", trx->cfg.sched_rr, VTY_NEWLINE);
trx_rate_ctr_threshold_write_config(vty, " ");
for (i = 0; i < trx->cfg.num_chans; i++) {
chan = &trx->cfg.chans[i];
@ -593,6 +696,8 @@ int trx_vty_init(struct trx_ctx* trx)
install_element(TRX_NODE, &cfg_ext_rach_cmd);
install_element(TRX_NODE, &cfg_rt_prio_cmd);
install_element(TRX_NODE, &cfg_filler_cmd);
install_element(TRX_NODE, &cfg_ctr_error_threshold_cmd);
install_element(TRX_NODE, &cfg_no_ctr_error_threshold_cmd);
install_element(TRX_NODE, &cfg_chan_cmd);
install_node(&chan_node, dummy_config_write);

View File

@ -2,3 +2,62 @@
== Counters
include::./counters_generated.adoc[]
=== Rate Counter Configurable Error Thresholds
Some rate counters such as overruns, underruns and dropped packets indicate
events that can really harm correct operation of the BTS served by OsmoTRX,
especially if they happen frequently. OsmoTRX is in most cases (depending on
the maturity of the device driver) prepared to dodge the temporary failure and
keep running and providing service.
Still, it is sometimes important for these kinds of events to not go unnoticed
by the operator, since they may indicate issues with the setup that may
require operator intervention to fix.
For instance, frequent dropped packets could indicate SDR HW/FW/power errors, or
a faulty connection against the host running OsmoTRX.
They can also indicate issues on the host running OsmoTRX itself: for instance,
OsmoTRX may not be running at a high enough priority (so other processes
end up competing with it for resources), or the HW running OsmoTRX may simply
not be powerful enough to accomplish all work in a timely fashion all
the time.
As a result, OsmoTRX can be configured to exit the process upon certain
conditions being met, in order to let OsmoBTS notice something is wrong and thus
announce issues through alarms to the network, where the operator can then
investigate the issue by looking at OsmoTRX logs.
These conditions are configured by means of introducing rate counter thresholds
in the VTY. The OsmoTRX user can provide those threshold commands either in the
VTY cfg file read by OsmoTRX process during startup, or by adding/removing them
dynamically through the VTY interactive console.
Each threshold cmd states an event (a rate counter type), a value and a time
interval (per-second, per-minute, per-hour or per-day). A threshold is reached
(and OsmoTRX stopped) if the counter's rate over the configured time interval
is equal to or greater than the configured threshold value. This is the syntax
used to manage rate counter thresholds:
`(no) ctr-error-threshold <EVENT> <VALUE> <INTERVAL>`
If several rate counter thresholds are set, then all of them are checked over
time and the first one reached will stop OsmoTRX.
.Example: rate counter threshold configuration (VTY .cfg file)
----
trx
ctr-error-threshold rx_drop_events 2 per-minute <1>
ctr-error-threshold rx_drop_samples 800 per-second <2>
----
<1> Stop OsmoTRX if dropped event (any amount of samples) during Rx was detected 2 times or more during a minute.
<2> Stop OsmoTRX if 800 or more samples were detected during Rx to be dropped by the HW during a second.
.Example: rate counter threshold configuration (VTY interactive)
----
OsmoTRX(config-trx)# ctr-error-threshold tx_underruns 3 per-hour <1>
OsmoTRX(config-trx)# no ctr-error-threshold tx_underruns 3 per-hour <2>
----
<1> Stop OsmoTRX if 3 or more underruns were detected during Tx over the last hour
<2> Remove previously set rate counter threshold

View File

@ -1,7 +1,17 @@
// autogenerated by show asciidoc counters
These counters and their description based on OsmoTRX 0.2.0.61-408f (OsmoTRX).
These counters and their description based on OsmoTRX 1.0.0.43-3f7c0 (OsmoTRX).
=== Rate Counters
// generating tables for rate_ctr_group
// generating tables for osmo_stat_items
// generating tables for osmo_counters
// there are no ungrouped osmo_counters
// rate_ctr_group table osmo-trx statistics
.trx:chan - osmo-trx statistics
[options="header"]
|===
| Name | Reference | Description
| device:rx_underruns | <<trx:chan_device:rx_underruns>> | Number of Rx underruns
| device:rx_overruns | <<trx:chan_device:rx_overruns>> | Number of Rx overruns
| device:tx_underruns | <<trx:chan_device:tx_underruns>> | Number of Tx underruns
| device:rx_drop_events | <<trx:chan_device:rx_drop_events>> | Number of times Rx samples were dropped by HW
| device:rx_drop_samples | <<trx:chan_device:rx_drop_samples>> | Number of Rx samples dropped by HW
|===

View File

@ -1253,6 +1253,37 @@
<param name='dummy' doc='Dummy method' />
</params>
</command>
<command id='ctr-error-threshold (rx_underruns|rx_overruns|tx_underruns|rx_drop_events|rx_drop_samples) &lt;0-65535&gt; (per-second|per-minute|per-hour|per-day)'>
<params>
<param name='ctr-error-threshold' doc='Threshold rate for error counter' />
<param name='rx_underruns' doc='Set threshold value for rate_ctr device:rx_underruns' />
<param name='rx_overruns' doc='Set threshold value for rate_ctr device:rx_overruns' />
<param name='tx_underruns' doc='Set threshold value for rate_ctr device:tx_underruns' />
<param name='rx_drop_events' doc='Set threshold value for rate_ctr device:rx_drop_events' />
<param name='rx_drop_samples' doc='Set threshold value for rate_ctr device:rx_drop_samples' />
<param name='&lt;0-65535&gt;' doc='Value to set for threshold' />
<param name='per-second' doc='Threshold value sampled per-second' />
<param name='per-minute' doc='Threshold value sampled per-minute' />
<param name='per-hour' doc='Threshold value sampled per-hour' />
<param name='per-day' doc='Threshold value sampled per-day' />
</params>
</command>
<command id='no ctr-error-threshold (rx_underruns|rx_overruns|tx_underruns|rx_drop_events|rx_drop_samples) &lt;0-65535&gt; (per-second|per-minute|per-hour|per-day)'>
<params>
<param name='no' doc='Negate a command or set its defaults' />
<param name='ctr-error-threshold' doc='Threshold rate for error counter' />
<param name='rx_underruns' doc='Set threshold value for rate_ctr device:rx_underruns' />
<param name='rx_overruns' doc='Set threshold value for rate_ctr device:rx_overruns' />
<param name='tx_underruns' doc='Set threshold value for rate_ctr device:tx_underruns' />
<param name='rx_drop_events' doc='Set threshold value for rate_ctr device:rx_drop_events' />
<param name='rx_drop_samples' doc='Set threshold value for rate_ctr device:rx_drop_samples' />
<param name='&lt;0-65535&gt;' doc='Value to set for threshold' />
<param name='per-second' doc='Threshold value sampled per-second' />
<param name='per-minute' doc='Threshold value sampled per-minute' />
<param name='per-hour' doc='Threshold value sampled per-hour' />
<param name='per-day' doc='Threshold value sampled per-day' />
</params>
</command>
<command id='chan &lt;0-100&gt;'>
<params>
<param name='chan' doc='Select a channel to configure' />