From 2a2a459eeeff48640dc557548ce576d666ab06ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Mar 2012 06:58:03 +0000 Subject: [PATCH 01/14] net: fix napi_reuse_skb() skb reserve napi->skb is allocated in napi_get_frags() using netdev_alloc_skb_ip_align(), with a reserve of NET_SKB_PAD + NET_IP_ALIGN bytes. However, when such skb is recycled in napi_reuse_skb(), it ends with a reserve of NET_IP_ALIGN which is suboptimal. Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0f3eb7d79a2..452db7090d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3560,7 +3560,8 @@ EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); - skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; From 5676cc7bfe1e388e87843f71daa229610385b41e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 21 Mar 2012 05:32:05 +0000 Subject: [PATCH 02/14] sky2: override for PCI legacy power management Some BIOS's don't setup power management correctly (what else is new) and don't allow use of PCI Express power control. Add a special exception module parameter to allow working around this issue. Based on slightly different patch by Knut Petersen. Reported-by: Arkadiusz Miskiewicz Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/sky2.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 82c2c86a195..423a1a2a702 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -95,6 +95,10 @@ static int disable_msi = 0; module_param(disable_msi, int, 0); MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)"); +static int legacy_pme = 0; +module_param(legacy_pme, int, 0); +MODULE_PARM_DESC(legacy_pme, "Legacy power management"); + static DEFINE_PCI_DEVICE_TABLE(sky2_id_table) = { { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, /* SK-9Sxx */ { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, /* SK-9Exx */ @@ -867,6 +871,13 @@ static void sky2_wol_init(struct sky2_port *sky2) /* Disable PiG firmware */ sky2_write16(hw, B0_CTST, Y2_HW_WOL_OFF); + /* Needed by some broken BIOSes, use PCI rather than PCI-e for WOL */ + if (legacy_pme) { + u32 reg1 = sky2_pci_read32(hw, PCI_DEV_REG1); + reg1 |= PCI_Y2_PME_LEGACY; + sky2_pci_write32(hw, PCI_DEV_REG1, reg1); + } + /* block receiver */ sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET); sky2_read32(hw, B0_CTST); From 7ae5289017e5ed5514b2603d157fb54c058a3c82 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 21 Mar 2012 15:38:33 +0000 Subject: [PATCH 03/14] tg3: Fix RSS ring refill race condition The RSS feature in tg3 hardware has only one rx producer ring for all RSS rings. NAPI vector 1 is special and handles the refilling of the rx producer ring on behalf of all RSS rings. There is a race condition between these RSS NAPIs and the NAPI[1]. If NAPI[1] finishes checking for refill and then another RSS ring empties the rx producer ring before NAPI[1] exits NAPI, the chip will be completely out of SKBs in the rx producer ring. We fix this by adding a flag tp->rx_refill and rely on napi_schedule()/ napi_complete() to help synchronize it to close the race condition. Update driver version to 3.123. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 25 ++++++++++++++++++++++--- drivers/net/ethernet/broadcom/tg3.h | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index b0657466041..7b71387cf93 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -89,10 +89,10 @@ static inline void _tg3_flag_clear(enum TG3_FLAGS flag, unsigned long *bits) #define DRV_MODULE_NAME "tg3" #define TG3_MAJ_NUM 3 -#define TG3_MIN_NUM 122 +#define TG3_MIN_NUM 123 #define DRV_MODULE_VERSION \ __stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM) -#define DRV_MODULE_RELDATE "December 7, 2011" +#define DRV_MODULE_RELDATE "March 21, 2012" #define RESET_KIND_SHUTDOWN 0 #define RESET_KIND_INIT 1 @@ -5953,8 +5953,10 @@ next_pkt_nopost: tpr->rx_std_prod_idx = std_prod_idx & tp->rx_std_ring_mask; tpr->rx_jmb_prod_idx = jmb_prod_idx & tp->rx_jmb_ring_mask; - if (tnapi != &tp->napi[1]) + if (tnapi != &tp->napi[1]) { + tp->rx_refill = true; napi_schedule(&tp->napi[1].napi); + } } return received; @@ -6134,6 +6136,7 @@ static int tg3_poll_work(struct tg3_napi *tnapi, int work_done, int budget) u32 std_prod_idx = dpr->rx_std_prod_idx; u32 jmb_prod_idx = dpr->rx_jmb_prod_idx; + tp->rx_refill = false; for (i = 1; i < tp->irq_cnt; i++) err |= tg3_rx_prodring_xfer(tp, dpr, &tp->napi[i].prodring); @@ -6197,9 +6200,25 @@ static int tg3_poll_msix(struct napi_struct *napi, int budget) /* check for RX/TX work to do */ if (likely(sblk->idx[0].tx_consumer == tnapi->tx_cons && *(tnapi->rx_rcb_prod_idx) == tnapi->rx_rcb_ptr)) { + + /* This test here is not race free, but will reduce + * the number of interrupts by looping again. + */ + if (tnapi == &tp->napi[1] && tp->rx_refill) + continue; + napi_complete(napi); /* Reenable interrupts. */ tw32_mailbox(tnapi->int_mbox, tnapi->last_tag << 24); + + /* This test here is synchronized by napi_schedule() + * and napi_complete() to close the race condition. + */ + if (unlikely(tnapi == &tp->napi[1] && tp->rx_refill)) { + tw32(HOSTCC_MODE, tp->coalesce_mode | + HOSTCC_MODE_ENABLE | + tnapi->coal_now); + } mmiowb(); break; } diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index 66bcfca5526..93865f899a4 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -3007,6 +3007,7 @@ struct tg3 { u32 rx_std_max_post; u32 rx_offset; u32 rx_pkt_map_sz; + bool rx_refill; /* begin "everything else" cacheline(s) section */ From 8ec3e70207486bbd3e2d3c0d6b809116ccd4f219 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 21 Mar 2012 15:38:34 +0000 Subject: [PATCH 04/14] cnic: Fix parity error code conflict The recently added parity error handling used an error code that was already defined for a different error. This could lead to bnx2x firmware assert. We need to fix this with new error codes that are defined for parity error only. Signed-off-by: Michael Chan Reviewed-by: Eddie Wai Reviewed-by: Bhanu Prakash Gollapudi Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/cnic.c | 12 ++++++---- drivers/net/ethernet/broadcom/cnic_defs.h | 28 +---------------------- drivers/net/ethernet/broadcom/cnic_if.h | 4 ++-- drivers/scsi/bnx2fc/bnx2fc_constants.h | 1 + drivers/scsi/bnx2i/57xx_iscsi_constants.h | 1 + 5 files changed, 12 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index 7b65716b873..c95e7b5e2b8 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -47,6 +47,7 @@ #include "bnx2x/bnx2x_hsi.h" #include "../../../scsi/bnx2i/57xx_iscsi_constants.h" #include "../../../scsi/bnx2i/57xx_iscsi_hsi.h" +#include "../../../scsi/bnx2fc/bnx2fc_constants.h" #include "cnic.h" #include "cnic_defs.h" @@ -2547,7 +2548,7 @@ static void cnic_bnx2x_kwqe_err(struct cnic_dev *dev, struct kwqe *kwqe) } kcqe.kcqe_op_flag = kcqe_op << KCQE_FLAGS_OPCODE_SHIFT; kcqe.kcqe_op_flag |= KCQE_FLAGS_LAYER_MASK_L5_FCOE; - kcqe.kcqe_info1 = FCOE_KCQE_COMPLETION_STATUS_NIC_ERROR; + kcqe.kcqe_info1 = FCOE_KCQE_COMPLETION_STATUS_PARITY_ERROR; kcqe.kcqe_info2 = cid; kcqe.kcqe_info0 = l5_cid; @@ -2558,7 +2559,7 @@ static void cnic_bnx2x_kwqe_err(struct cnic_dev *dev, struct kwqe *kwqe) kcqe.kcqe_op_flag = (opcode + 0x10) << KCQE_FLAGS_OPCODE_SHIFT; kcqe.kcqe_op_flag |= KCQE_FLAGS_LAYER_MASK_L5_ISCSI; - kcqe.kcqe_info1 = ISCSI_KCQE_COMPLETION_STATUS_NIC_ERROR; + kcqe.kcqe_info1 = ISCSI_KCQE_COMPLETION_STATUS_PARITY_ERR; kcqe.kcqe_info2 = cid; cnic_get_l5_cid(cp, BNX2X_SW_CID(cid), &kcqe.kcqe_info0); @@ -2577,7 +2578,7 @@ static void cnic_bnx2x_kwqe_err(struct cnic_dev *dev, struct kwqe *kwqe) kcqe.kcqe_op_flag = (kcqe_op << KCQE_FLAGS_OPCODE_SHIFT) | KCQE_FLAGS_LAYER_MASK_L4; - l4kcqe->status = L4_KCQE_COMPLETION_STATUS_NIC_ERROR; + l4kcqe->status = L4_KCQE_COMPLETION_STATUS_PARITY_ERROR; l4kcqe->cid = cid; cnic_get_l5_cid(cp, BNX2X_SW_CID(cid), &l4kcqe->conn_id); } else { @@ -3933,7 +3934,8 @@ static void cnic_cm_process_kcqe(struct cnic_dev *dev, struct kcqe *kcqe) case L4_KCQE_OPCODE_VALUE_CONNECT_COMPLETE: if (l4kcqe->status == 0) set_bit(SK_F_OFFLD_COMPLETE, &csk->flags); - else if (l4kcqe->status == L4_KCQE_COMPLETION_STATUS_NIC_ERROR) + else if (l4kcqe->status == + L4_KCQE_COMPLETION_STATUS_PARITY_ERROR) set_bit(SK_F_HW_ERR, &csk->flags); smp_mb__before_clear_bit(); @@ -3946,7 +3948,7 @@ static void cnic_cm_process_kcqe(struct cnic_dev *dev, struct kcqe *kcqe) case L4_KCQE_OPCODE_VALUE_RESET_COMP: case L5CM_RAMROD_CMD_ID_SEARCHER_DELETE: case L5CM_RAMROD_CMD_ID_TERMINATE_OFFLOAD: - if (l4kcqe->status == L4_KCQE_COMPLETION_STATUS_NIC_ERROR) + if (l4kcqe->status == L4_KCQE_COMPLETION_STATUS_PARITY_ERROR) set_bit(SK_F_HW_ERR, &csk->flags); cp->close_conn(csk, opcode); diff --git a/drivers/net/ethernet/broadcom/cnic_defs.h b/drivers/net/ethernet/broadcom/cnic_defs.h index 06ca00266d7..382c98b0cc0 100644 --- a/drivers/net/ethernet/broadcom/cnic_defs.h +++ b/drivers/net/ethernet/broadcom/cnic_defs.h @@ -35,16 +35,6 @@ #define L5CM_RAMROD_CMD_ID_SEARCHER_DELETE (L5CM_RAMROD_CMD_ID_BASE + 14) #define L5CM_RAMROD_CMD_ID_TERMINATE_OFFLOAD (L5CM_RAMROD_CMD_ID_BASE + 15) -#define FCOE_KCQE_OPCODE_INIT_FUNC (0x10) -#define FCOE_KCQE_OPCODE_DESTROY_FUNC (0x11) -#define FCOE_KCQE_OPCODE_STAT_FUNC (0x12) -#define FCOE_KCQE_OPCODE_OFFLOAD_CONN (0x15) -#define FCOE_KCQE_OPCODE_ENABLE_CONN (0x16) -#define FCOE_KCQE_OPCODE_DISABLE_CONN (0x17) -#define FCOE_KCQE_OPCODE_DESTROY_CONN (0x18) -#define FCOE_KCQE_OPCODE_CQ_EVENT_NOTIFICATION (0x20) -#define FCOE_KCQE_OPCODE_FCOE_ERROR (0x21) - #define FCOE_RAMROD_CMD_ID_INIT_FUNC (FCOE_KCQE_OPCODE_INIT_FUNC) #define FCOE_RAMROD_CMD_ID_DESTROY_FUNC (FCOE_KCQE_OPCODE_DESTROY_FUNC) #define FCOE_RAMROD_CMD_ID_STAT_FUNC (FCOE_KCQE_OPCODE_STAT_FUNC) @@ -54,23 +44,6 @@ #define FCOE_RAMROD_CMD_ID_DESTROY_CONN (FCOE_KCQE_OPCODE_DESTROY_CONN) #define FCOE_RAMROD_CMD_ID_TERMINATE_CONN (0x81) -#define FCOE_KWQE_OPCODE_INIT1 (0) -#define FCOE_KWQE_OPCODE_INIT2 (1) -#define FCOE_KWQE_OPCODE_INIT3 (2) -#define FCOE_KWQE_OPCODE_OFFLOAD_CONN1 (3) -#define FCOE_KWQE_OPCODE_OFFLOAD_CONN2 (4) -#define FCOE_KWQE_OPCODE_OFFLOAD_CONN3 (5) -#define FCOE_KWQE_OPCODE_OFFLOAD_CONN4 (6) -#define FCOE_KWQE_OPCODE_ENABLE_CONN (7) -#define FCOE_KWQE_OPCODE_DISABLE_CONN (8) -#define FCOE_KWQE_OPCODE_DESTROY_CONN (9) -#define FCOE_KWQE_OPCODE_DESTROY (10) -#define FCOE_KWQE_OPCODE_STAT (11) - -#define FCOE_KCQE_COMPLETION_STATUS_ERROR (0x1) -#define FCOE_KCQE_COMPLETION_STATUS_CTX_ALLOC_FAILURE (0x3) -#define FCOE_KCQE_COMPLETION_STATUS_NIC_ERROR (0x5) - /* KCQ (kernel completion queue) response op codes */ #define L4_KCQE_OPCODE_VALUE_CLOSE_COMP (53) #define L4_KCQE_OPCODE_VALUE_RESET_COMP (54) @@ -87,6 +60,7 @@ /* KCQ (kernel completion queue) completion status */ #define L4_KCQE_COMPLETION_STATUS_SUCCESS (0) #define L4_KCQE_COMPLETION_STATUS_NIC_ERROR (4) +#define L4_KCQE_COMPLETION_STATUS_PARITY_ERROR (0x81) #define L4_KCQE_COMPLETION_STATUS_TIMEOUT (0x93) #define L4_KCQE_COMPLETION_STATUS_CTX_ALLOC_FAIL (0x83) diff --git a/drivers/net/ethernet/broadcom/cnic_if.h b/drivers/net/ethernet/broadcom/cnic_if.h index 60deb84d36b..289274e546b 100644 --- a/drivers/net/ethernet/broadcom/cnic_if.h +++ b/drivers/net/ethernet/broadcom/cnic_if.h @@ -12,8 +12,8 @@ #ifndef CNIC_IF_H #define CNIC_IF_H -#define CNIC_MODULE_VERSION "2.5.9" -#define CNIC_MODULE_RELDATE "Feb 8, 2012" +#define CNIC_MODULE_VERSION "2.5.10" +#define CNIC_MODULE_RELDATE "March 21, 2012" #define CNIC_ULP_RDMA 0 #define CNIC_ULP_ISCSI 1 diff --git a/drivers/scsi/bnx2fc/bnx2fc_constants.h b/drivers/scsi/bnx2fc/bnx2fc_constants.h index c12702bb16d..dad9924abbb 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_constants.h +++ b/drivers/scsi/bnx2fc/bnx2fc_constants.h @@ -47,6 +47,7 @@ #define FCOE_KCQE_COMPLETION_STATUS_CTX_FREE_FAILURE (0x4) #define FCOE_KCQE_COMPLETION_STATUS_NIC_ERROR (0x5) #define FCOE_KCQE_COMPLETION_STATUS_WRONG_HSI_VERSION (0x6) +#define FCOE_KCQE_COMPLETION_STATUS_PARITY_ERROR (0x81) /* CQE type */ #define FCOE_PENDING_CQE_TYPE 0 diff --git a/drivers/scsi/bnx2i/57xx_iscsi_constants.h b/drivers/scsi/bnx2i/57xx_iscsi_constants.h index 57515f1f169..495a841645f 100644 --- a/drivers/scsi/bnx2i/57xx_iscsi_constants.h +++ b/drivers/scsi/bnx2i/57xx_iscsi_constants.h @@ -122,6 +122,7 @@ #define ISCSI_KCQE_COMPLETION_STATUS_LOM_ISCSI_NOT_ENABLED (0x51) #define ISCSI_KCQE_COMPLETION_STATUS_CID_BUSY (0x80) +#define ISCSI_KCQE_COMPLETION_STATUS_PARITY_ERR (0x81) /* SQ/RQ/CQ DB structure sizes */ #define ISCSI_SQ_DB_SIZE (16) From 9395a09d05a23bb313cd20c99fb234f308d948b3 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 20 Mar 2012 14:01:21 +0000 Subject: [PATCH 05/14] l2tp: enable automatic module loading for l2tp_ppp When L2TP is configured as a module, requests for L2TP sockets do not result in the l2tp_ppp module being loaded. Fix this by adding the appropriate MODULE_ALIAS to be recognized by pppox's request_module() call. Signed-off-by: Benjamin LaHaise Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 9b071910b4b..1addd9f3f40 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1845,3 +1845,4 @@ MODULE_AUTHOR("James Chapman "); MODULE_DESCRIPTION("PPP over L2TP over UDP"); MODULE_LICENSE("GPL"); MODULE_VERSION(PPPOL2TP_DRV_VERSION); +MODULE_ALIAS("pppox-proto-" __stringify(PX_PROTO_OL2TP)); From 64b5fad526f63e9b56752a7e8e153b99ec0ddecd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 21 Mar 2012 20:41:01 +0000 Subject: [PATCH 06/14] netlabel: use GFP flags from caller instead of GFP_ATOMIC This function takes a GFP flags as a parameter, but they are never used. We don't take a lock in this function so there is no reason to prefer GFP_ATOMIC over the caller's GFP flags. There is only one caller, cipso_v4_map_cat_rng_ntoh(), and it passes GFP_ATOMIC as the GFP flags so this doesn't change how the code works. It's just a cleanup. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/netlabel/netlabel_kapi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 2560e7b441c..7c94aedd091 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -597,7 +597,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap, iter = iter->next; iter_max_spot = iter->startbit + NETLBL_CATMAP_SIZE; } - ret_val = netlbl_secattr_catmap_setbit(iter, spot, GFP_ATOMIC); + ret_val = netlbl_secattr_catmap_setbit(iter, spot, flags); } return ret_val; From f0229eaaf3f82522e2b16b41b0f45bb84a88d1b0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 21 Mar 2012 20:44:09 +0000 Subject: [PATCH 07/14] RDS: use gfp flags from caller in conn_alloc() We should be using the gfp flags the caller specified here, instead of GFP_KERNEL. I think this might be a bugfix, depending on the value of "sock->sk->sk_allocation" when we call rds_conn_create_outgoing() in rds_sendmsg(). Otherwise, it's just a cleanup. Signed-off-by: Dan Carpenter Acked-by: Venkat Venkatsubra Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 2 +- net/rds/iw_cm.c | 2 +- net/rds/loop.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 51c868923f6..a1e11627747 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -749,7 +749,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) int ret; /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); + ic = kzalloc(sizeof(struct rds_ib_connection), gfp); if (!ic) return -ENOMEM; diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 9556d2895f7..a91e1db62ee 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -694,7 +694,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) unsigned long flags; /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); + ic = kzalloc(sizeof(struct rds_iw_connection), gfp); if (!ic) return -ENOMEM; diff --git a/net/rds/loop.c b/net/rds/loop.c index 87ff2a8a454..6b12b68541a 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -121,7 +121,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) struct rds_loop_connection *lc; unsigned long flags; - lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); + lc = kzalloc(sizeof(struct rds_loop_connection), gfp); if (!lc) return -ENOMEM; From 26b2072e7536e57995b2867d057fbb32ecfe498d Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 21 Mar 2012 23:32:39 +0000 Subject: [PATCH 08/14] xfrm: Remove unused xfrm_state from xfrm_state_check_space The xfrm_state argument is unused in this function, so remove it. Also the name xfrm_state_check_space does not really match what this function does. It actually checks if we have enough head and tailroom on the skb. So we rename the function to xfrm_skb_check_space. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 47bacd8c025..95a338c89f9 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -21,7 +21,7 @@ static int xfrm_output2(struct sk_buff *skb); -static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm_skb_check_space(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); int nhead = dst->header_len + LL_RESERVED_SPACE(dst->dev) @@ -48,7 +48,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto resume; do { - err = xfrm_state_check_space(x, skb); + err = xfrm_skb_check_space(skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); goto error_nolock; From 1265fd616782ef03b98fd19f65c2b47fcd4ea11f Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 21 Mar 2012 23:36:13 +0000 Subject: [PATCH 09/14] xfrm: Access the replay notify functions via the registered callbacks We call the wrong replay notify function when we use ESN replay handling. This leads to the fact that we don't send notifications if we use ESN. Fix this by calling the registered callbacks instead of xfrm_replay_notify(). Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_replay.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 39e02c54ed2..2f6d11d04a2 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -167,7 +167,7 @@ static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) } if (xfrm_aevent_is_on(xs_net(x))) - xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); + x->repl->notify(x, XFRM_REPLAY_UPDATE); } static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) @@ -279,7 +279,7 @@ static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq) replay_esn->bmp[nr] |= (1U << bitnr); if (xfrm_aevent_is_on(xs_net(x))) - xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); + x->repl->notify(x, XFRM_REPLAY_UPDATE); } static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event) @@ -473,7 +473,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) replay_esn->bmp[nr] |= (1U << bitnr); if (xfrm_aevent_is_on(xs_net(x))) - xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); + x->repl->notify(x, XFRM_REPLAY_UPDATE); } static struct xfrm_replay xfrm_replay_legacy = { From 0956a8c20b23d429e79ff86d4325583fc06f9eb4 Mon Sep 17 00:00:00 2001 From: "tom.leiming@gmail.com" Date: Thu, 22 Mar 2012 03:22:18 +0000 Subject: [PATCH 10/14] usbnet: increase URB reference count before usb_unlink_urb Commit 4231d47e6fe69f061f96c98c30eaf9fb4c14b96d(net/usbnet: avoid recursive locking in usbnet_stop()) fixes the recursive locking problem by releasing the skb queue lock, but it makes usb_unlink_urb racing with defer_bh, and the URB to being unlinked may be freed before or during calling usb_unlink_urb, so use-after-free problem may be triggerd inside usb_unlink_urb. The patch fixes the use-after-free problem by increasing URB reference count with skb queue lock held before calling usb_unlink_urb, so the URB won't be freed until return from usb_unlink_urb. Cc: stable@kernel.org Cc: Sebastian Andrzej Siewior Cc: Alan Stern Cc: Oliver Neukum Reported-by: Dave Jones Signed-off-by: Ming Lei Acked-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 4b8b52ca09d..febfdceeb9e 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -589,6 +589,14 @@ static int unlink_urbs (struct usbnet *dev, struct sk_buff_head *q) entry = (struct skb_data *) skb->cb; urb = entry->urb; + /* + * Get reference count of the URB to avoid it to be + * freed during usb_unlink_urb, which may trigger + * use-after-free problem inside usb_unlink_urb since + * usb_unlink_urb is always racing with .complete + * handler(include defer_bh). + */ + usb_get_urb(urb); spin_unlock_irqrestore(&q->lock, flags); // during some PM-driven resume scenarios, // these (async) unlinks complete immediately @@ -597,6 +605,7 @@ static int unlink_urbs (struct usbnet *dev, struct sk_buff_head *q) netdev_dbg(dev->net, "unlink urb err, %d\n", retval); else count++; + usb_put_urb(urb); spin_lock_irqsave(&q->lock, flags); } spin_unlock_irqrestore (&q->lock, flags); From 5d5440a835710d09f0ef18da5000541ec98b537a Mon Sep 17 00:00:00 2001 From: "tom.leiming@gmail.com" Date: Thu, 22 Mar 2012 03:22:38 +0000 Subject: [PATCH 11/14] usbnet: don't clear urb->dev in tx_complete URB unlinking is always racing with its completion and tx_complete may be called before or during running usb_unlink_urb, so tx_complete must not clear urb->dev since it will be used in unlink path, otherwise invalid memory accesses or usb device leak may be caused inside usb_unlink_urb. Cc: stable@kernel.org Cc: Alan Stern Cc: Oliver Neukum Signed-off-by: Ming Lei Acked-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index febfdceeb9e..62f8b5cfbb5 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1037,7 +1037,6 @@ static void tx_complete (struct urb *urb) } usb_autopm_put_interface_async(dev->intf); - urb->dev = NULL; entry->state = tx_done; defer_bh(dev, skb, &dev->txq); } From 523f610e1be2a4afca605962e137064378883c5f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 22 Mar 2012 12:27:06 +0000 Subject: [PATCH 12/14] netfilter: remove forward module param confusion. It used to be an int, and it got changed to a bool parameter at least 7 years ago. It happens that NF_ACCEPT and NF_DROP are 0 and 1, so this works, but it's unclear, and the check that it's in range is not required. Reported-by: Dan Carpenter Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- net/ipv4/netfilter/iptable_filter.c | 9 ++------- net/ipv6/netfilter/ip6table_filter.c | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 0e58f09e59f..851acec852d 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -52,7 +52,7 @@ iptable_filter_hook(unsigned int hook, struct sk_buff *skb, static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static bool forward = NF_ACCEPT; +static bool forward = true; module_param(forward, bool, 0000); static int __net_init iptable_filter_net_init(struct net *net) @@ -64,7 +64,7 @@ static int __net_init iptable_filter_net_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ipt_standard *)repl->entries)[1].target.verdict = - -forward - 1; + forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, repl); @@ -88,11 +88,6 @@ static int __init iptable_filter_init(void) { int ret; - if (forward < 0 || forward > NF_MAX_VERDICT) { - pr_err("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) return ret; diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index a8f6da97e3b..325e59a0224 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -44,7 +44,7 @@ ip6table_filter_hook(unsigned int hook, struct sk_buff *skb, static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static bool forward = NF_ACCEPT; +static bool forward = true; module_param(forward, bool, 0000); static int __net_init ip6table_filter_net_init(struct net *net) @@ -56,7 +56,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ip6t_standard *)repl->entries)[1].target.verdict = - -forward - 1; + forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; net->ipv6.ip6table_filter = ip6t_register_table(net, &packet_filter, repl); @@ -80,11 +80,6 @@ static int __init ip6table_filter_init(void) { int ret; - if (forward < 0 || forward > NF_MAX_VERDICT) { - pr_err("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) return ret; From eaddcd76903c28e84bb452a35835babb0800a2c4 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 22 Mar 2012 16:14:29 +0000 Subject: [PATCH 13/14] bonding: remove entries for master_ip and vlan_ip and query devices instead The following patch aimed to resolve an issue where secondary, tertiary, etc. addresses added to bond interfaces could overwrite the bond->master_ip and vlan_ip values. commit 917fbdb32f37e9a93b00bb12ee83532982982df3 Author: Henrik Saavedra Persson Date: Wed Nov 23 23:37:15 2011 +0000 bonding: only use primary address for ARP That patch was good because it prevented bonds using ARP monitoring from sending frames with an invalid source IP address. Unfortunately, it didn't always work as expected. When using an ioctl (like ifconfig does) to set the IP address and netmask, 2 separate ioctls are actually called to set the IP and netmask if the mask chosen doesn't match the standard mask for that class of address. The first ioctl did not have a mask that matched the one in the primary address and would still cause the device address to be overwritten. The second ioctl that was called to set the mask would then detect as secondary and ignored, but the damage was already done. This was not an issue when using an application that used netlink sockets as the setting of IP and netmask came down at once. The inconsistent behavior between those two interfaces was something that needed to be resolved. While I was thinking about how I wanted to resolve this, Ralf Zeidler came with a patch that resolved this on a RHEL kernel by keeping a full shadow of the entries in dev->ifa_list for the bonding device and vlan devices in the bonding driver. I didn't like the duplication of the list as I want to see the 'bonding' struct and code shrink rather than grow, but liked the general idea. As the Subject indicates this patch drops the master_ip and vlan_ip elements from the 'bonding' and 'vlan_entry' structs, respectively. This can be done because a device's address-list is now traversed to determine the optimal source IP address for ARP requests and for checks to see if the bonding device has a particular IP address. This code could have all be contained inside the bonding driver, but it made more sense to me to EXPORT and call inet_confirm_addr since it did exactly what was needed. I tested this and a backported patch and everything works as expected. Ralf also helped with verification of the backported patch. Thanks to Ralf for all his help on this. v2: Whitespace and organizational changes based on suggestions from Jay Vosburgh and Dave Miller. v3: Fixup incorrect usage of rcu_read_unlock based on Dave Miller's suggestion. Signed-off-by: Andy Gospodarek CC: Ralf Zeidler Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 82 ++++++--------------------------- drivers/net/bonding/bonding.h | 18 +++++++- net/ipv4/devinet.c | 1 + 3 files changed, 32 insertions(+), 69 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 0730203a19f..b920d829692 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2573,12 +2573,16 @@ re_arm: static int bond_has_this_ip(struct bonding *bond, __be32 ip) { struct vlan_entry *vlan; + struct net_device *vlan_dev; - if (ip == bond->master_ip) + if (ip == bond_confirm_addr(bond->dev, 0, ip)) return 1; list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { - if (ip == vlan->vlan_ip) + rcu_read_lock(); + vlan_dev = __vlan_find_dev_deep(bond->dev, vlan->vlan_id); + rcu_read_unlock(); + if (vlan_dev && ip == bond_confirm_addr(vlan_dev, 0, ip)) return 1; } @@ -2620,17 +2624,19 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) int i, vlan_id; __be32 *targets = bond->params.arp_targets; struct vlan_entry *vlan; - struct net_device *vlan_dev; + struct net_device *vlan_dev = NULL; struct rtable *rt; for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { + __be32 addr; if (!targets[i]) break; pr_debug("basa: target %x\n", targets[i]); if (!bond_vlan_used(bond)) { pr_debug("basa: empty vlan: arp_send\n"); + addr = bond_confirm_addr(bond->dev, targets[i], 0); bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - bond->master_ip, 0); + addr, 0); continue; } @@ -2655,8 +2661,9 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) if (rt->dst.dev == bond->dev) { ip_rt_put(rt); pr_debug("basa: rtdev == bond->dev: arp_send\n"); + addr = bond_confirm_addr(bond->dev, targets[i], 0); bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - bond->master_ip, 0); + addr, 0); continue; } @@ -2674,10 +2681,11 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) } } - if (vlan_id) { + if (vlan_id && vlan_dev) { ip_rt_put(rt); + addr = bond_confirm_addr(vlan_dev, targets[i], 0); bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - vlan->vlan_ip, vlan_id); + addr, vlan_id); continue; } @@ -3299,68 +3307,10 @@ static int bond_netdev_event(struct notifier_block *this, return NOTIFY_DONE; } -/* - * bond_inetaddr_event: handle inetaddr notifier chain events. - * - * We keep track of device IPs primarily to use as source addresses in - * ARP monitor probes (rather than spewing out broadcasts all the time). - * - * We track one IP for the main device (if it has one), plus one per VLAN. - */ -static int bond_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct in_ifaddr *ifa = ptr; - struct net_device *vlan_dev, *event_dev = ifa->ifa_dev->dev; - struct bond_net *bn = net_generic(dev_net(event_dev), bond_net_id); - struct bonding *bond; - struct vlan_entry *vlan; - - /* we only care about primary address */ - if(ifa->ifa_flags & IFA_F_SECONDARY) - return NOTIFY_DONE; - - list_for_each_entry(bond, &bn->dev_list, bond_list) { - if (bond->dev == event_dev) { - switch (event) { - case NETDEV_UP: - bond->master_ip = ifa->ifa_local; - return NOTIFY_OK; - case NETDEV_DOWN: - bond->master_ip = 0; - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } - } - - list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { - vlan_dev = __vlan_find_dev_deep(bond->dev, - vlan->vlan_id); - if (vlan_dev == event_dev) { - switch (event) { - case NETDEV_UP: - vlan->vlan_ip = ifa->ifa_local; - return NOTIFY_OK; - case NETDEV_DOWN: - vlan->vlan_ip = 0; - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } - } - } - } - return NOTIFY_DONE; -} - static struct notifier_block bond_netdev_notifier = { .notifier_call = bond_netdev_event, }; -static struct notifier_block bond_inetaddr_notifier = { - .notifier_call = bond_inetaddr_event, -}; - /*---------------------------- Hashing Policies -----------------------------*/ /* @@ -4929,7 +4879,6 @@ static int __init bonding_init(void) } register_netdevice_notifier(&bond_netdev_notifier); - register_inetaddr_notifier(&bond_inetaddr_notifier); out: return res; err: @@ -4943,7 +4892,6 @@ err_link: static void __exit bonding_exit(void) { unregister_netdevice_notifier(&bond_netdev_notifier); - unregister_inetaddr_notifier(&bond_inetaddr_notifier); bond_destroy_debugfs(); diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 1aecc37e5b4..9f2bae6616d 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "bond_3ad.h" #include "bond_alb.h" @@ -166,7 +167,6 @@ struct bond_parm_tbl { struct vlan_entry { struct list_head vlan_list; - __be32 vlan_ip; unsigned short vlan_id; }; @@ -232,7 +232,6 @@ struct bonding { struct list_head bond_list; struct netdev_hw_addr_list mc_list; int (*xmit_hash_policy)(struct sk_buff *, int); - __be32 master_ip; u16 rr_tx_counter; struct ad_bond_info ad_info; struct alb_bond_info alb_info; @@ -378,6 +377,21 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } +static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) +{ + struct in_device *in_dev; + __be32 addr = 0; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + + if (in_dev) + addr = inet_confirm_addr(in_dev, dst, local, RT_SCOPE_HOST); + + rcu_read_unlock(); + return addr; +} + struct bond_net; struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e41c40f48cf..d4fad5c7744 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1079,6 +1079,7 @@ __be32 inet_confirm_addr(struct in_device *in_dev, return addr; } +EXPORT_SYMBOL(inet_confirm_addr); /* * Device notifier From 8a78335442cea429afb2b964318b6e257448ea00 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Sat, 3 Mar 2012 18:45:07 +0100 Subject: [PATCH 14/14] usbnet: consider device busy at each recieved packet usbnet should centrally handle busy reporting in the rx path so subdrivers need not worry. This hurts use cases which do rx only or predominantly. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 62f8b5cfbb5..b7b3f5b0d40 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -493,6 +493,7 @@ block: if (netif_running (dev->net) && !test_bit (EVENT_RX_HALT, &dev->flags)) { rx_submit (dev, urb, GFP_ATOMIC); + usb_mark_last_busy(dev->udev); return; } usb_free_urb (urb);