stats: new trackers for lchan life duration

This patch adds two stats which track cummulative lchan lifetime by
type TCH and SDCCH. These new counters will accomplish two things:

1) Provide a glanceable way to see if lchan durations look healthy. When
examining a site, short-lived (<5s) and long-lived (>30s) TCH lchans
are difficult to tell apart. If we only see short-lived TCH lchans,
there is most likely an RF or signaling problem to investigate. This
new counter will expose channel ages in the VTY output

2) Provide a more accurate count for Erlangs per site. Currently, we
are basing Erlangs on active TCH channel counts per stats period. This
method skews high very quickly. Each active TCH in that period
translates into the full 10s of activity. This counter should improve
accuracy by two orders of magnitude.

Change-Id: I1b0670c47cb5e0b7776eda89d1e71545ba0e3347
This commit is contained in:
Michael Iedema 2022-02-07 01:27:45 -08:00 committed by pespin
parent 7048fc66bf
commit 5e2ac29703
7 changed files with 91 additions and 0 deletions

View File

@ -59,6 +59,8 @@ enum bts_counter_id {
BTS_CTR_CHAN_ACT_SDCCH,
BTS_CTR_CHAN_ACT_TCH,
BTS_CTR_CHAN_ACT_NACK,
BTS_CTR_CHAN_TCH_ACTIVE_DECISECONDS_TOTAL,
BTS_CTR_CHAN_SDCCH_ACTIVE_DECISECONDS_TOTAL,
BTS_CTR_RSL_UNKNOWN,
BTS_CTR_RSL_IPA_NACK,
BTS_CTR_RSL_DELETE_IND,

View File

@ -888,6 +888,8 @@ struct gsm_lchan {
uint8_t interf_band;
/* MS power control state */
struct lchan_power_ctrl_state ms_power_ctrl;
/* Interval timing to capture duration per activation and cumulative active time */
struct osmo_time_cc active_cc;
};
/* One Timeslot in a TRX */
@ -1146,6 +1148,7 @@ const char *gsm_chreq_name(enum gsm_chreq_reason_t c);
char *gsm_ts_name(const struct gsm_bts_trx_ts *ts);
char *gsm_ts_and_pchan_name(const struct gsm_bts_trx_ts *ts);
void lchan_update_name(struct gsm_lchan *lchan);
uint64_t gsm_lchan_active_duration_ms(const struct gsm_lchan *lchan);
static inline char *gsm_lchan_name(const struct gsm_lchan *lchan)
{

View File

@ -1095,6 +1095,12 @@ const struct rate_ctr_desc bts_ctr_description[] = {
[BTS_CTR_CHAN_ACT_NACK] = \
{ "chan_act:nack",
"Number of Channel Activations that the BTS NACKed" },
[BTS_CTR_CHAN_TCH_ACTIVE_DECISECONDS_TOTAL] = \
{ "chan_tch:active_deciseconds:total",
"Cumulative number of deciseconds of TCH channel activity" },
[BTS_CTR_CHAN_SDCCH_ACTIVE_DECISECONDS_TOTAL] = \
{ "chan_sdcch:active_deciseconds:total",
"Cumulative number of deciseconds of SDCCH channel activity" },
[BTS_CTR_RSL_UNKNOWN] = \
{ "rsl:unknown",
"Number of unknown/unsupported RSL messages received from BTS" },

View File

@ -569,6 +569,13 @@ void lchan_dump_full_vty(struct vty *vty, struct gsm_lchan *lchan)
vty_out(vty, "BTS %u, TRX %u, Timeslot %u, Lchan %u: Type %s%s",
lchan->ts->trx->bts->nr, lchan->ts->trx->nr, lchan->ts->nr,
lchan->nr, gsm_lchant_name(lchan->type), VTY_NEWLINE);
if (lchan->activate.concluded) {
vty_out(vty, " Activated %s seconds ago%s",
osmo_int_to_float_str_c(OTC_SELECT, gsm_lchan_active_duration_ms(lchan), 3),
VTY_NEWLINE);
}
vty_out_dyn_ts_details(vty, lchan->ts);
vty_out(vty, " Connection: %u, State: %s%s%s%s",
lchan->conn ? 1: 0, lchan_state_name(lchan),

View File

@ -3677,6 +3677,8 @@ void bts_dump_vty(struct vty *vty, struct gsm_bts *bts)
struct gsm_bts_trx *trx;
int ts_hopping_total;
int ts_non_hopping_total;
const struct rate_ctr *activations_tch;
const struct rate_ctr *activations_sdcch;
vty_out(vty, "BTS %u is of %s type in band %s, has CI %u LAC %u, "
"BSIC %u (NCC=%u, BCC=%u) and %u TRX%s",
@ -3851,6 +3853,29 @@ void bts_dump_vty(struct vty *vty, struct gsm_bts *bts)
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHREQ_TOTAL)->current,
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHREQ_NO_CHANNEL)->current,
VTY_NEWLINE);
vty_out(vty, " Channel Activations :%s", VTY_NEWLINE);
activations_tch = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_ACT_TCH);
vty_out(vty, " TCH %"PRIu64"", activations_tch->current);
if (activations_tch->intv[RATE_CTR_INTV_HOUR].rate > 0) {
const struct rate_ctr *active_time_tch_ds = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_TCH_ACTIVE_DECISECONDS_TOTAL);
vty_out(vty, " (avg lifespan %s seconds in last hour)",
osmo_int_to_float_str_c(OTC_SELECT,
100 * active_time_tch_ds->intv[RATE_CTR_INTV_HOUR].rate
/ activations_tch->intv[RATE_CTR_INTV_HOUR].rate, 3));
}
vty_out(vty, "%s", VTY_NEWLINE);
activations_sdcch = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_ACT_SDCCH);
vty_out(vty, " SDCCH %"PRIu64"", activations_sdcch->current);
if (activations_sdcch->intv[RATE_CTR_INTV_HOUR].rate > 0) {
const struct rate_ctr *active_time_sdcch_ds = rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_SDCCH_ACTIVE_DECISECONDS_TOTAL);
vty_out(vty, " (avg lifespan %s seconds in last hour)",
osmo_int_to_float_str_c(OTC_SELECT,
100 * active_time_sdcch_ds->intv[RATE_CTR_INTV_HOUR].rate
/ activations_sdcch->intv[RATE_CTR_INTV_HOUR].rate, 3));
}
vty_out(vty, "%s", VTY_NEWLINE);
vty_out(vty, " Channel Failures : %"PRIu64" rf_failures, %"PRIu64" rll failures%s",
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_RF_FAIL)->current,
rate_ctr_group_get_ctr(bts->bts_ctrs, BTS_CTR_CHAN_RLL_ERR)->current,

View File

@ -342,6 +342,15 @@ void lchan_update_name(struct gsm_lchan *lchan)
lchan->nr - (lchan->vamos.is_secondary ? ts->max_primary_lchans : 0));
}
/* If the lchan is currently active, return the duration since activation in milliseconds.
* Otherwise return 0. */
uint64_t gsm_lchan_active_duration_ms(const struct gsm_lchan *lchan)
{
if (!lchan->activate.concluded)
return 0;
return lchan->active_cc.total_sum / 1000;
}
/* obtain the MO structure for a given object instance */
static inline struct gsm_abis_mo *
gsm_objclass2mo(struct gsm_bts *bts, uint8_t obj_class,

View File

@ -200,10 +200,33 @@ static void _lchan_on_activation_failure(struct gsm_lchan *lchan, enum lchan_act
static void lchan_on_fully_established(struct gsm_lchan *lchan)
{
struct rate_ctr *rate_ctr;
if (lchan->activate.concluded)
return;
lchan->activate.concluded = true;
/* Assign timekeeper to appropriate rate counter */
switch (lchan->type) {
case GSM_LCHAN_TCH_H:
case GSM_LCHAN_TCH_F:
rate_ctr = rate_ctr_group_get_ctr(lchan->ts->trx->bts->bts_ctrs, BTS_CTR_CHAN_TCH_ACTIVE_DECISECONDS_TOTAL);
break;
case GSM_LCHAN_SDCCH:
rate_ctr = rate_ctr_group_get_ctr(lchan->ts->trx->bts->bts_ctrs, BTS_CTR_CHAN_SDCCH_ACTIVE_DECISECONDS_TOTAL);
break;
default:
rate_ctr = NULL;
break;
}
lchan->active_cc.cfg.rate_ctr = rate_ctr;
/* This reset allows showing on vty how long ago a single active lchan was activated.
* This reset does not affect rate_ctr statistics. */
osmo_time_cc_cleanup(&lchan->active_cc);
/* Start the timekeeper */
osmo_time_cc_set_flag(&lchan->active_cc, true);
switch (lchan->activate.info.activ_for) {
case ACTIVATE_FOR_MS_CHANNEL_REQUEST:
/* No signalling to do here, MS is free to use the channel, and should go on to connect
@ -463,6 +486,16 @@ void lchan_fsm_alloc(struct gsm_lchan *lchan)
OSMO_ASSERT(lchan->fi);
lchan->fi->priv = lchan;
lchan_fsm_update_id(lchan);
/* Configure timekeeper to track this lchan's cumulative active milliseconds */
lchan->active_cc = (struct osmo_time_cc){
.cfg = {
.gran_usec = 1*100000,
.forget_sum_usec = 1,
.rate_ctr = NULL,
},
};
LOGPFSML(lchan->fi, LOGL_DEBUG, "new lchan\n");
lchan_reset(lchan);
}
@ -511,6 +544,8 @@ static void lchan_reset(struct gsm_lchan *lchan)
.tsc_set = 1,
.interf_dbm = INTERF_DBM_UNKNOWN,
.interf_band = INTERF_BAND_UNKNOWN,
.active_cc = lchan->active_cc,
};
}
@ -518,6 +553,10 @@ static void lchan_fsm_unused_onenter(struct osmo_fsm_inst *fi, uint32_t prev_sta
{
struct gsm_lchan *lchan = lchan_fi_lchan(fi);
struct gsm_bts *bts = lchan->ts->trx->bts;
/* Stop the timekeeper */
osmo_time_cc_set_flag(&lchan->active_cc, false);
lchan_reset(lchan);
osmo_fsm_inst_dispatch(lchan->ts->fi, TS_EV_LCHAN_UNUSED, lchan);