176 lines
6.5 KiB
Diff
From: Bruce Allan <bruce.w.allan@intel.com>
|
|
Date: Fri, 24 Aug 2012 20:38:11 +0000
|
|
Subject: e1000e: DoS while TSO enabled caused by link partner with small MSS
|
|
|
|
commit d821a4c4d11ad160925dab2bb009b8444beff484 upstream.
|
|
|
|
With a low enough MSS on the link partner and TSO enabled locally, the
|
|
networking stack can periodically send a very large (e.g. 64KB) TCP
|
|
message for which the driver will attempt to use more Tx descriptors than
|
|
are available by default in the Tx ring. This is due to a workaround in
|
|
the code that imposes a limit of only 4 MSS-sized segments per descriptor
|
|
which appears to be a carry-over from the older e1000 driver and may be
|
|
applicable only to some older PCI or PCI-X parts which are not supported in
|
|
e1000e. When the driver gets a message that is too large to fit across the
|
|
configured number of Tx descriptors, it stops the upper stack from queueing
|
|
any more and gets stuck in this state. After a timeout, the upper stack
|
|
assumes the adapter is hung and calls the driver to reset it.
|
|
|
|
Remove the unnecessary limitation of using up to only 4 MSS-sized segments
|
|
per Tx descriptor, and add a hard failure test to catch any attempt to
|
|
check for space for a message larger than would fit in the whole Tx ring.
|
|
Refactor the remaining logic that limits the size of data per Tx descriptor
|
|
from a seemingly arbitrary 8KB to a limit based on the dynamic size of the
|
|
Tx packet buffer as described in the hardware specification.
|
|
|
|
Also, fix the logic in the check for space in the Tx ring for the next
|
|
largest possible packet after the current one has been successfully queued
|
|
for transmit, and use the appropriate defines for default ring sizes in
|
|
e1000_probe instead of magic values.
|
|
|
|
This issue goes back to the introduction of e1000e in 2.6.24 when it was
|
|
split off from e1000.
|
|
|
|
Reported-by: Ben Hutchings <bhutchings@solarflare.com>
|
|
Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
|
|
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
|
|
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
|
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
[bwh: Backported to 3.2:
|
|
- Adjust context
|
|
- Adjust for use of net_device vs e1000_ring parameter]
|
|
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
|
|
---
|
|
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
|
|
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
|
|
@@ -302,6 +302,7 @@ struct e1000_adapter {
|
|
*/
|
|
struct e1000_ring *tx_ring /* One per active queue */
|
|
____cacheline_aligned_in_smp;
|
|
+ u32 tx_fifo_limit;
|
|
|
|
struct napi_struct napi;
|
|
|
|
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
|
|
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
|
|
@@ -3386,6 +3386,15 @@ void e1000e_reset(struct e1000_adapter *
|
|
}
|
|
|
|
/*
|
|
+ * Alignment of Tx data is on an arbitrary byte boundary with the
|
|
+ * maximum size per Tx descriptor limited only to the transmit
|
|
+ * allocation of the packet buffer minus 96 bytes with an upper
|
|
+ * limit of 24KB due to receive synchronization limitations.
|
|
+ */
|
|
+ adapter->tx_fifo_limit = min_t(u32, ((er32(PBA) >> 16) << 10) - 96,
|
|
+ 24 << 10);
|
|
+
|
|
+ /*
|
|
* Disable Adaptive Interrupt Moderation if 2 full packets cannot
|
|
* fit in receive buffer and early-receive not supported.
|
|
*/
|
|
@@ -4647,13 +4656,9 @@ static bool e1000_tx_csum(struct e1000_a
|
|
return 1;
|
|
}
|
|
|
|
-#define E1000_MAX_PER_TXD 8192
|
|
-#define E1000_MAX_TXD_PWR 12
|
|
-
|
|
static int e1000_tx_map(struct e1000_adapter *adapter,
|
|
struct sk_buff *skb, unsigned int first,
|
|
- unsigned int max_per_txd, unsigned int nr_frags,
|
|
- unsigned int mss)
|
|
+ unsigned int max_per_txd, unsigned int nr_frags)
|
|
{
|
|
struct e1000_ring *tx_ring = adapter->tx_ring;
|
|
struct pci_dev *pdev = adapter->pdev;
|
|
@@ -4882,20 +4887,19 @@ static int e1000_maybe_stop_tx(struct ne
|
|
{
|
|
struct e1000_adapter *adapter = netdev_priv(netdev);
|
|
|
|
+ BUG_ON(size > adapter->tx_ring->count);
|
|
+
|
|
if (e1000_desc_unused(adapter->tx_ring) >= size)
|
|
return 0;
|
|
return __e1000_maybe_stop_tx(netdev, size);
|
|
}
|
|
|
|
-#define TXD_USE_COUNT(S, X) (((S) >> (X)) + 1 )
|
|
static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
|
|
struct net_device *netdev)
|
|
{
|
|
struct e1000_adapter *adapter = netdev_priv(netdev);
|
|
struct e1000_ring *tx_ring = adapter->tx_ring;
|
|
unsigned int first;
|
|
- unsigned int max_per_txd = E1000_MAX_PER_TXD;
|
|
- unsigned int max_txd_pwr = E1000_MAX_TXD_PWR;
|
|
unsigned int tx_flags = 0;
|
|
unsigned int len = skb_headlen(skb);
|
|
unsigned int nr_frags;
|
|
@@ -4915,18 +4919,8 @@ static netdev_tx_t e1000_xmit_frame(stru
|
|
}
|
|
|
|
mss = skb_shinfo(skb)->gso_size;
|
|
- /*
|
|
- * The controller does a simple calculation to
|
|
- * make sure there is enough room in the FIFO before
|
|
- * initiating the DMA for each buffer. The calc is:
|
|
- * 4 = ceil(buffer len/mss). To make sure we don't
|
|
- * overrun the FIFO, adjust the max buffer len if mss
|
|
- * drops.
|
|
- */
|
|
if (mss) {
|
|
u8 hdr_len;
|
|
- max_per_txd = min(mss << 2, max_per_txd);
|
|
- max_txd_pwr = fls(max_per_txd) - 1;
|
|
|
|
/*
|
|
* TSO Workaround for 82571/2/3 Controllers -- if skb->data
|
|
@@ -4956,12 +4950,12 @@ static netdev_tx_t e1000_xmit_frame(stru
|
|
count++;
|
|
count++;
|
|
|
|
- count += TXD_USE_COUNT(len, max_txd_pwr);
|
|
+ count += DIV_ROUND_UP(len, adapter->tx_fifo_limit);
|
|
|
|
nr_frags = skb_shinfo(skb)->nr_frags;
|
|
for (f = 0; f < nr_frags; f++)
|
|
- count += TXD_USE_COUNT(skb_frag_size(&skb_shinfo(skb)->frags[f]),
|
|
- max_txd_pwr);
|
|
+ count += DIV_ROUND_UP(skb_frag_size(&skb_shinfo(skb)->frags[f]),
|
|
+ adapter->tx_fifo_limit);
|
|
|
|
if (adapter->hw.mac.tx_pkt_filtering)
|
|
e1000_transfer_dhcp_info(adapter, skb);
|
|
@@ -5000,13 +4994,16 @@ static netdev_tx_t e1000_xmit_frame(stru
|
|
tx_flags |= E1000_TX_FLAGS_IPV4;
|
|
|
|
/* if count is 0 then mapping error has occurred */
|
|
- count = e1000_tx_map(adapter, skb, first, max_per_txd, nr_frags, mss);
|
|
+ count = e1000_tx_map(adapter, skb, first, adapter->tx_fifo_limit,
|
|
+ nr_frags);
|
|
if (count) {
|
|
netdev_sent_queue(netdev, skb->len);
|
|
e1000_tx_queue(adapter, tx_flags, count);
|
|
/* Make sure there is space in the ring for the next send. */
|
|
- e1000_maybe_stop_tx(netdev, MAX_SKB_FRAGS + 2);
|
|
-
|
|
+ e1000_maybe_stop_tx(netdev,
|
|
+ (MAX_SKB_FRAGS *
|
|
+ DIV_ROUND_UP(PAGE_SIZE,
|
|
+ adapter->tx_fifo_limit) + 2));
|
|
} else {
|
|
dev_kfree_skb_any(skb);
|
|
tx_ring->buffer_info[first].time_stamp = 0;
|
|
@@ -6150,8 +6147,8 @@ static int __devinit e1000_probe(struct
|
|
adapter->hw.phy.autoneg_advertised = 0x2f;
|
|
|
|
/* ring size defaults */
|
|
- adapter->rx_ring->count = 256;
|
|
- adapter->tx_ring->count = 256;
|
|
+ adapter->rx_ring->count = E1000_DEFAULT_RXD;
|
|
+ adapter->tx_ring->count = E1000_DEFAULT_TXD;
|
|
|
|
/*
|
|
* Initial Wake on LAN setting - If APM wake is enabled in
|