stats: new trackers for lchan life duration (v2)

This patch adds two stats which track cummulative lchan lifetime by
type TCH and SDCCH. These new counters will accomplish two things:

1) Provide a glanceable way to see if lchan durations look healthy. When
examining a site, short-lived (<5s) and long-lived (>30s) TCH lchans
are difficult to tell apart. If we only see short-lived TCH lchans,
there is most likely an RF or signaling problem to investigate. This
new counter will expose channel ages in the VTY output

2) Provide a more accurate count for Erlangs per site. Currently, we
are basing Erlangs on active TCH channel counts per stats period. This
method skews high very quickly. Each active TCH in that period
translates into the full 10s of activity. This counter should improve
accuracy by two orders of magnitude.

Change-Id: Ie3771233ecbd4bc24a24fb22c1064a18e7b8b2b0
This commit is contained in:
Michael Iedema 2022-05-19 11:00:14 -07:00
parent 80cd4555b0
commit a7628d9cf0
8 changed files with 140 additions and 0 deletions

View File

@ -60,6 +60,8 @@ enum bts_counter_id {
BTS_CTR_CHAN_ACT_SDCCH,
BTS_CTR_CHAN_ACT_TCH,
BTS_CTR_CHAN_ACT_NACK,
BTS_CTR_CHAN_TCH_ACTIVE_MILLISECONDS_TOTAL,
BTS_CTR_CHAN_SDCCH_ACTIVE_MILLISECONDS_TOTAL,
BTS_CTR_RSL_UNKNOWN,
BTS_CTR_RSL_IPA_NACK,
BTS_CTR_RSL_DELETE_IND,
@ -757,6 +759,9 @@ void bts_store_uptime(struct gsm_bts *bts);
unsigned long long bts_uptime(const struct gsm_bts *bts);
#define BTS_STORE_LCHAN_DURATIONS_INTERVAL 1 /* in seconds */
void bts_store_lchan_durations(struct gsm_bts *bts);
char *get_model_oml_status(const struct gsm_bts *bts);
/* reset the state of all MO in the BTS */
void gsm_bts_mo_reset(struct gsm_bts *bts);

View File

@ -889,6 +889,9 @@ struct gsm_lchan {
uint8_t interf_band;
/* MS power control state */
struct lchan_power_ctrl_state ms_power_ctrl;
/* Timestamps and markers to track active state duration. */
struct timespec active_start;
struct timespec active_stored;
};
/* One Timeslot in a TRX */
@ -1147,6 +1150,7 @@ const char *gsm_chreq_name(enum gsm_chreq_reason_t c);
char *gsm_ts_name(const struct gsm_bts_trx_ts *ts);
char *gsm_ts_and_pchan_name(const struct gsm_bts_trx_ts *ts);
void lchan_update_name(struct gsm_lchan *lchan);
uint64_t gsm_lchan_active_duration_ms(const struct gsm_lchan *lchan);
static inline char *gsm_lchan_name(const struct gsm_lchan *lchan)
{
@ -1278,6 +1282,9 @@ struct gsm_network {
/* Timer to write each BTS's uptime counter state to the stats system. */
struct osmo_timer_list bts_store_uptime_timer;
/* Timer to write each BTS's set of lchan duration counters' state to the stats system. */
struct osmo_timer_list bts_store_lchan_durations_timer;
struct {
/* Single MGCP client configuration under msc node (also required for
* MGCP proxy when sccp-lite is used) */

View File

@ -88,6 +88,18 @@ static void bsc_store_bts_uptime(void *data)
osmo_timer_schedule(&net->bts_store_uptime_timer, BTS_STORE_UPTIME_INTERVAL, 0);
}
static void bsc_store_bts_lchan_durations(void *data)
{
struct gsm_network *net = data;
struct gsm_bts *bts;
llist_for_each_entry(bts, &net->bts_list, list)
bts_store_lchan_durations(bts);
/* Keep this timer ticking. */
osmo_timer_schedule(&net->bts_store_lchan_durations_timer, BTS_STORE_LCHAN_DURATIONS_INTERVAL, 0);
}
static struct gsm_network *bsc_network_init(void *ctx)
{
struct gsm_network *net = gsm_network_init(ctx);
@ -180,6 +192,10 @@ static struct gsm_network *bsc_network_init(void *ctx)
osmo_timer_setup(&net->bts_store_uptime_timer, bsc_store_bts_uptime, net);
osmo_timer_schedule(&net->bts_store_uptime_timer, BTS_STORE_UPTIME_INTERVAL, 0);
/* Init lchan duration tracking timer. */
osmo_timer_setup(&net->bts_store_lchan_durations_timer, bsc_store_bts_lchan_durations, net);
osmo_timer_schedule(&net->bts_store_lchan_durations_timer, BTS_STORE_LCHAN_DURATIONS_INTERVAL, 0);
net->cbc->net = net;
net->cbc->mode = BSC_CBC_LINK_MODE_DISABLED;
net->cbc->server.local_addr = bsc_cbc_default_server_local_addr;

View File

@ -741,6 +741,61 @@ void bts_store_uptime(struct gsm_bts *bts)
osmo_stat_item_set(osmo_stat_item_group_get_item(bts->bts_statg, BTS_STAT_UPTIME_SECONDS), bts_uptime(bts));
}
void bts_store_lchan_durations(struct gsm_bts *bts)
{
struct gsm_bts_trx *trx;
int i, j;
struct timespec now, elapsed;
uint64_t elapsed_ms;
uint64_t elapsed_tch_ms = 0;
uint64_t elapsed_sdcch_ms = 0;
/* Ignore BTS that are not in operation. */
if (!trx_is_usable(bts->c0))
return;
/* Grab storage time to be used for all lchans. */
osmo_clock_gettime(CLOCK_MONOTONIC, &now);
/* Iterate over all lchans. */
llist_for_each_entry(trx, &bts->trx_list, list) {
for (i = 0; i < ARRAY_SIZE(trx->ts); i++) {
struct gsm_bts_trx_ts *ts = &trx->ts[i];
for (j = 0; j < ARRAY_SIZE(ts->lchan); j++) {
struct gsm_lchan *lchan = &ts->lchan[j];
/* Ignore lchans whose activation timestamps are not yet set. */
if (lchan->active_stored.tv_sec == 0 && lchan->active_stored.tv_nsec == 0)
continue;
/* Calculate elapsed time since last storage. */
timespecsub(&now, &lchan->active_stored, &elapsed);
elapsed_ms = elapsed.tv_sec * 1000 + elapsed.tv_nsec / 1000000;
/* Assign elapsed time to appropriate bucket. */
switch (lchan->type) {
case GSM_LCHAN_TCH_H:
case GSM_LCHAN_TCH_F:
elapsed_tch_ms += elapsed_ms;
break;
case GSM_LCHAN_SDCCH:
elapsed_sdcch_ms += elapsed_ms;
break;
default:
continue;
}
/* Update storage time. */
lchan->active_stored = now;
}
}
}
/* Export to rate counters. */
rate_ctr_add(rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_TCH_ACTIVE_MILLISECONDS_TOTAL), elapsed_tch_ms);
rate_ctr_add(rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_SDCCH_ACTIVE_MILLISECONDS_TOTAL), elapsed_sdcch_ms);
}
unsigned long long bts_uptime(const struct gsm_bts *bts)
{
struct timespec tp;
@ -1099,6 +1154,12 @@ const struct rate_ctr_desc bts_ctr_description[] = {
[BTS_CTR_CHAN_ACT_NACK] = \
{ "chan_act:nack",
"Number of Channel Activations that the BTS NACKed" },
[BTS_CTR_CHAN_TCH_ACTIVE_MILLISECONDS_TOTAL] = \
{ "chan_tch:active_milliseconds:total",
"Cumulative number of milliseconds of TCH channel activity" },
[BTS_CTR_CHAN_SDCCH_ACTIVE_MILLISECONDS_TOTAL] = \
{ "chan_sdcch:active_milliseconds:total",
"Cumulative number of milliseconds of SDCCH channel activity" },
[BTS_CTR_RSL_UNKNOWN] = \
{ "rsl:unknown",
"Number of unknown/unsupported RSL messages received from BTS" },

View File

@ -569,6 +569,13 @@ void lchan_dump_full_vty(struct vty *vty, struct gsm_lchan *lchan)
vty_out(vty, "BTS %u, TRX %u, Timeslot %u, Lchan %u: Type %s%s",
lchan->ts->trx->bts->nr, lchan->ts->trx->nr, lchan->ts->nr,
lchan->nr, gsm_lchant_name(lchan->type), VTY_NEWLINE);
if (lchan->activate.concluded) {
vty_out(vty, " Active for: %s seconds%s",
osmo_int_to_float_str_c(OTC_SELECT, gsm_lchan_active_duration_ms(lchan), 3),
VTY_NEWLINE);
}
vty_out_dyn_ts_details(vty, lchan->ts);
vty_out(vty, " Connection: %u, State: %s%s%s%s",
lchan->conn ? 1: 0, lchan_state_name(lchan),

View File

@ -3677,6 +3677,8 @@ void bts_dump_vty(struct vty *vty, struct gsm_bts *bts)
struct gsm_bts_trx *trx;
int ts_hopping_total;
int ts_non_hopping_total;
const struct rate_ctr *activations_tch;
const struct rate_ctr *activations_sdcch;
vty_out(vty, "BTS %u is of %s type in band %s, has CI %u LAC %u, "
"BSIC %u (NCC=%u, BCC=%u) and %u TRX%s",
@ -3851,6 +3853,29 @@ void bts_dump_vty(struct vty *vty, struct gsm_bts *bts)
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHREQ_TOTAL)->current,
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHREQ_NO_CHANNEL)->current,
VTY_NEWLINE);
vty_out(vty, " Channel Activations :%s", VTY_NEWLINE);
activations_tch = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_ACT_TCH);
vty_out(vty, " TCH %"PRIu64"", activations_tch->current);
if (activations_tch->intv[RATE_CTR_INTV_HOUR].rate > 0) {
const struct rate_ctr *active_time_tch_ms = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_TCH_ACTIVE_MILLISECONDS_TOTAL);
vty_out(vty, " (avg lifespan %s seconds in last hour)",
osmo_int_to_float_str_c(OTC_SELECT,
active_time_tch_ms->intv[RATE_CTR_INTV_HOUR].rate
/ activations_tch->intv[RATE_CTR_INTV_HOUR].rate, 3));
}
vty_out(vty, "%s", VTY_NEWLINE);
activations_sdcch = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_ACT_SDCCH);
vty_out(vty, " SDCCH %"PRIu64"", activations_sdcch->current);
if (activations_sdcch->intv[RATE_CTR_INTV_HOUR].rate > 0) {
const struct rate_ctr *active_time_sdcch_ms = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_SDCCH_ACTIVE_MILLISECONDS_TOTAL);
vty_out(vty, " (avg lifespan %s seconds in last hour)",
osmo_int_to_float_str_c(OTC_SELECT,
active_time_sdcch_ms->intv[RATE_CTR_INTV_HOUR].rate
/ activations_sdcch->intv[RATE_CTR_INTV_HOUR].rate, 3));
}
vty_out(vty, "%s", VTY_NEWLINE);
vty_out(vty, " Channel Failures : %"PRIu64" rf_failures, %"PRIu64" rll failures%s",
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_RF_FAIL)->current,
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_RLL_ERR)->current,

View File

@ -342,6 +342,21 @@ void lchan_update_name(struct gsm_lchan *lchan)
lchan->nr - (lchan->vamos.is_secondary ? ts->max_primary_lchans : 0));
}
/* If the lchan is currently active, return the duration since activation in milliseconds.
* Otherwise return 0. */
uint64_t gsm_lchan_active_duration_ms(const struct gsm_lchan *lchan)
{
struct timespec now, elapsed;
if (lchan->active_start.tv_sec == 0 && lchan->active_start.tv_nsec == 0)
return 0;
osmo_clock_gettime(CLOCK_MONOTONIC, &now);
timespecsub(&now, &lchan->active_start, &elapsed);
return elapsed.tv_sec * 1000 + elapsed.tv_nsec / 1000000;
}
/* obtain the MO structure for a given object instance */
static inline struct gsm_abis_mo *
gsm_objclass2mo(struct gsm_bts *bts, uint8_t obj_class,

View File

@ -204,6 +204,10 @@ static void lchan_on_fully_established(struct gsm_lchan *lchan)
return;
lchan->activate.concluded = true;
/* Set active state timekeeping markers. */
osmo_clock_gettime(CLOCK_MONOTONIC, &lchan->active_start);
lchan->active_stored = lchan->active_start;
switch (lchan->activate.info.activ_for) {
case ACTIVATE_FOR_MS_CHANNEL_REQUEST:
/* No signalling to do here, MS is free to use the channel, and should go on to connect