recover BORKEN lchans for missing ACK scenarios

We already recover broken lchans where an ACTIV ACK or REL ACK arrives
late. Now add a recovery path for lchans that are broken because no
ACTIV ACK or REL ACK arrives at all.

Add a timeout of X28 = 30s to the lchan BORKEN state.
On timeout, attempt both a Channel Activation and a Channel Release. If
either of them is ACKed, we have successfully synced BTS and BSC's state.

After successful recovery, place the lchan back in the UNUSED state,
available for servicing subscribers.

If recovery is unsuccessful, keep retrying recovery once every X28
seconds.

Patch-by: osmith, nhofmeyr
Related: osmo-ttcn3-hacks I9b4ddfc4a337808d9d5ec538c25fd390b1b2530f
Related: OS#5106
Related: SYS#6655
Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
This commit is contained in:
Oliver Smith 2022-11-21 10:25:26 +01:00 committed by laforge
parent a4fc35c3b2
commit 6766608231
5 changed files with 115 additions and 0 deletions

View File

@ -32,6 +32,7 @@ labelloc=t; label="lchan FSM"
WAIT_TS_READY -> UNUSED [label="error/timeout",style=dashed,constraint=false]
{WAIT_ACTIV_ACK,WAIT_RF_RELEASE_ACK} -> BORKEN [label="error/timeout",style=dashed]
BORKEN -> WAIT_AFTER_ERROR [label="late RF Release ACK"]
BORKEN -> WAIT_RF_RELEASE_ACK [label="late Activation ACK"]
WAIT_RLL_RTP_ESTABLISH -> WAIT_RLL_RTP_RELEASED [label=error,style=dashed]
WAIT_ACTIV_ACK -> rtp [label="LCHAN_RTP_EV_LCHAN_READY",style=dotted]
@ -44,4 +45,13 @@ labelloc=t; label="lchan FSM"
WAIT_RSL_CHAN_MODE_MODIFY_ACK -> ESTABLISHED [label="LCHAN_EV_RSL_CHAN_MODE_MODIFY_ACK\nno change to RTP"]
WAIT_RR_CHAN_MODE_MODIFY_ACK -> BORKEN [label="error/timeout",style=dashed]
WAIT_RSL_CHAN_MODE_MODIFY_ACK -> BORKEN [label="error/timeout",style=dashed]
BORKEN -> RECOVER_WAIT_ACTIV_ACK [label="X28"]
RECOVER_WAIT_ACTIV_ACK -> BORKEN [label="error/timeout",style=dashed]
RECOVER_WAIT_ACTIV_ACK -> UNUSED [label="rx ACK"]
RECOVER_WAIT_ACTIV_ACK -> RECOVER_WAIT_RF_RELEASE_ACK [label="rx NACK"]
RECOVER_WAIT_RF_RELEASE_ACK -> UNUSED [label="rx ACK"]
RECOVER_WAIT_RF_RELEASE_ACK -> BORKEN [label="error/timeout",style=dashed]
}

View File

@ -33,6 +33,8 @@ enum lchan_fsm_state {
LCHAN_ST_WAIT_RF_RELEASE_ACK,
LCHAN_ST_WAIT_AFTER_ERROR,
LCHAN_ST_BORKEN,
LCHAN_ST_RECOVER_WAIT_ACTIV_ACK, /*< Attempt to recover from BORKEN: first try to activate the lchan */
LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK, /*< Attempt to recover from BORKEN: then try to release it */
};
enum lchan_fsm_event {

View File

@ -334,6 +334,9 @@ struct osmo_tdef_state_timeout lchan_fsm_timeouts[32] = {
[LCHAN_ST_WAIT_AFTER_ERROR] = { .T = -3111 },
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = { .T = -13 },
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = { .T = -14 },
[LCHAN_ST_BORKEN] = { .T = -28 },
[LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = { .T = -6 },
[LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = { .T = -6 },
};
/* Transition to a state, using the T timer defined in lchan_fsm_timeouts.
@ -380,6 +383,8 @@ uint32_t lchan_fsm_on_error[32] = {
[LCHAN_ST_BORKEN] = LCHAN_ST_BORKEN,
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
[LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = LCHAN_ST_BORKEN,
[LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = LCHAN_ST_BORKEN,
};
#define lchan_fail(fmt, args...) lchan_fail_to(lchan_fsm_on_error[fi->state], fmt, ## args)
@ -1631,6 +1636,71 @@ static void lchan_fsm_borken(struct osmo_fsm_inst *fi, uint32_t event, void *dat
}
}
/* onenter callback of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK: first step of recovering a BORKEN lchan.
 *
 * Marks the lchan as an SDCCH with ta_known set, calls chan_counts_ts_update() on its timeslot and then sends an
 * RSL Channel Activation (RSL_ACT_INTRA_NORM_ASS). If sending fails, the error is handed to lchan_fail(), which
 * picks the follow-up state from lchan_fsm_on_error[].
 *
 * \param[in] fi  lchan FSM instance.
 * \param[in] prev_state  State the FSM was in before; not used here.
 */
static void lchan_fsm_recover_wait_activ_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
{
	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
	int tx_rc;

	LOG_LCHAN(lchan, LOGL_INFO, "attempting to recover from BORKEN lchan\n");

	/* Set up the lchan before updating the timeslot's channel counts and sending the activation. */
	lchan->activate.info.ta_known = true;
	lchan->type = GSM_LCHAN_SDCCH;
	chan_counts_ts_update(lchan->ts);

	tx_rc = rsl_tx_chan_activ(lchan, RSL_ACT_INTRA_NORM_ASS, 0);
	if (tx_rc == 0)
		return;

	lchan_fail("Tx Chan Activ failed: %s (%d)", strerror(-tx_rc), tx_rc);
}
/* Event handler of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK.
 *
 * Both possible answers to the recovery Channel Activation lead on to LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK; a NACK
 * is additionally logged. Any other event trips an assertion.
 *
 * \param[in] fi  lchan FSM instance.
 * \param[in] event  Received LCHAN_EV_* event.
 * \param[in] data  Event data; not used here.
 */
static void lchan_fsm_recover_wait_activ_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
{
	struct gsm_lchan *lchan = lchan_fi_lchan(fi);

	if (event == LCHAN_EV_RSL_CHAN_ACTIV_ACK) {
		lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
		return;
	}

	if (event == LCHAN_EV_RSL_CHAN_ACTIV_NACK) {
		/* An earlier activation may have reached the BTS while its ACK never made it back to the
		 * BSC. In that case the lchan may still be active on the BTS side, so go on to release
		 * it. */
		LOG_LCHAN(lchan, LOGL_NOTICE, "received NACK for activation of BORKEN lchan, assuming still active\n");
		lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
		return;
	}

	/* No other event is expected in this state. */
	OSMO_ASSERT(false);
}
/* onenter callback of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK: second step of recovering a BORKEN lchan.
 *
 * Sends an RSL RF Channel Release for the lchan. If sending fails, the error is handed to lchan_fail(), which
 * picks the follow-up state from lchan_fsm_on_error[].
 *
 * \param[in] fi  lchan FSM instance.
 * \param[in] prev_state  State the FSM was in before; not used here.
 */
static void lchan_fsm_recover_wait_rf_release_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
{
	struct gsm_lchan *lchan = lchan_fi_lchan(fi);
	int tx_rc = rsl_tx_rf_chan_release(lchan);

	if (tx_rc == 0)
		return;

	lchan_fail("Tx RSL RF Channel Release failed: %s (%d)\n", strerror(-tx_rc), tx_rc);
}
/* Event handler of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK.
 *
 * An RF Channel Release ACK completes the recovery: it is logged and the lchan goes to LCHAN_ST_UNUSED. Any other
 * event trips an assertion.
 *
 * \param[in] fi  lchan FSM instance.
 * \param[in] event  Received LCHAN_EV_* event.
 * \param[in] data  Event data; not used here.
 */
static void lchan_fsm_recover_wait_rf_release_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
{
	struct gsm_lchan *lchan = lchan_fi_lchan(fi);

	if (event != LCHAN_EV_RSL_RF_CHAN_REL_ACK) {
		/* No other event is expected in this state. */
		OSMO_ASSERT(false);
		return;
	}

	LOG_LCHAN(lchan, LOGL_NOTICE, "successfully recovered BORKEN lchan\n");
	lchan_fsm_state_chg(LCHAN_ST_UNUSED);
}
#define S(x) (1 << (x))
static const struct osmo_fsm_state lchan_fsm_states[] = {
@ -1820,6 +1890,32 @@ static const struct osmo_fsm_state lchan_fsm_states[] = {
| S(LCHAN_ST_WAIT_RF_RELEASE_ACK)
| S(LCHAN_ST_UNUSED)
| S(LCHAN_ST_WAIT_AFTER_ERROR)
| S(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK)
,
},
[LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] {
.name = "RECOVER_WAIT_ACTIV_ACK",
.onenter = lchan_fsm_recover_wait_activ_ack_onenter,
.action = lchan_fsm_recover_wait_activ_ack,
.in_event_mask = 0
| S(LCHAN_EV_RSL_CHAN_ACTIV_ACK)
| S(LCHAN_EV_RSL_CHAN_ACTIV_NACK)
,
.out_state_mask = 0
| S(LCHAN_ST_BORKEN)
| S(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK)
,
},
[LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] {
.name = "RECOVER_WAIT_RF_RELEASE_ACK",
.onenter = lchan_fsm_recover_wait_rf_release_ack_onenter,
.action = lchan_fsm_recover_wait_rf_release_ack,
.in_event_mask = 0
| S(LCHAN_EV_RSL_RF_CHAN_REL_ACK)
,
.out_state_mask = 0
| S(LCHAN_ST_BORKEN)
| S(LCHAN_ST_UNUSED)
,
},
};
@ -1893,6 +1989,10 @@ static int lchan_fsm_timer_cb(struct osmo_fsm_inst *fi)
lchan_fsm_state_chg(LCHAN_ST_UNUSED);
return 0;
case LCHAN_ST_BORKEN:
lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK);
return 0;
default:
lchan->release.in_error = true;
lchan->release.rsl_error_cause = RSL_ERR_INTERWORKING;

View File

@ -74,6 +74,7 @@ static struct osmo_tdef gsm_network_T_defs[] = {
" after this amount of idle time, forget internally cumulated time remainders. Zero to always"
" keep remainders. See also X16, X17." },
{ .T = -25, .default_val = 5, .desc = "Timeout for initial user data after an MSC initiated an SCCP connection to the BSS" },
{ .T = -28, .default_val = 30, .desc = "Interval at which to try to recover a BORKEN lchan" },
{ .T = -3105, .default_val = GSM_NY1_DEFAULT, .unit = OSMO_TDEF_CUSTOM,
.desc = "Ny1: Maximum number of Physical Information (re)transmissions" },
{ .T = -3111, .default_val = 4, .desc = "Wait time after lchan was released in error (should be T3111 + 2s)" },

View File

@ -34,6 +34,7 @@ net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of mill
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
net: X25 = 5 s Timeout for initial user data after an MSC initiated an SCCP connection to the BSS (default: 5 s)
net: X28 = 30 s Interval at which to try to recover a BORKEN lchan (default: 30 s)
net: X3105 = 17 Ny1: Maximum number of Physical Information (re)transmissions (default: 17)
net: X3111 = 4 s Wait time after lchan was released in error (should be T3111 + 2s) (default: 4 s)
net: X3113 = 60 s Maximum Paging Request Transmit Delay Threshold: If the estimated transmit delay of the messages in the paging queue surpasses this threshold, then new incoming paging requests will if possible replace a request in retransmission state from the queue or otherwise be discarded, hence limiting the size of the queue and maximum delay of its scheduled requests. X3113 also serves as the upper boundary for dynamic T3113 when estimating the expected maximum delay to get a response (default: 60 s)
@ -90,6 +91,7 @@ net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of mill
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
net: X25 = 5 s Timeout for initial user data after an MSC initiated an SCCP connection to the BSS (default: 5 s)
net: X28 = 30 s Interval at which to try to recover a BORKEN lchan (default: 30 s)
net: X3105 = 17 Ny1: Maximum number of Physical Information (re)transmissions (default: 17)
net: X3111 = 4 s Wait time after lchan was released in error (should be T3111 + 2s) (default: 4 s)
net: X3113 = 60 s Maximum Paging Request Transmit Delay Threshold: If the estimated transmit delay of the messages in the paging queue surpasses this threshold, then new incoming paging requests will if possible replace a request in retransmission state from the queue or otherwise be discarded, hence limiting the size of the queue and maximum delay of its scheduled requests. X3113 also serves as the upper boundary for dynamic T3113 when estimating the expected maximum delay to get a response (default: 60 s)