Apply some vlan driver enhancements from 2.6.33 and 2.6.34

vlan/macvlan: propagate transmission state to upper layers
macvlan: add GRO bit to features mask
macvlan: allow multiple driver backends
Add macvtap driver (Closes: #568755)

svn path=/dists/sid/linux-2.6/; revision=15752
This commit is contained in:
Ben Hutchings 2010-05-19 04:29:24 +00:00
parent 6cb557b2f1
commit b074367c42
11 changed files with 1959 additions and 0 deletions

4
debian/changelog vendored
View File

@ -4,6 +4,10 @@ linux-2.6 (2.6.32-14) UNRELEASED; urgency=low
* [ia64] Hardcode the output of the scripts under arch/ia64/scripts so
that we can build out-of-tree modules correctly (refresh and re-add
dropped patch) (Closes: #392592)
* vlan/macvlan: propagate transmission state to upper layers
* macvlan: add GRO bit to features mask
* macvlan: allow multiple driver backends
* Add macvtap driver (Closes: #568755)
-- Ben Hutchings <ben@decadent.org.uk> Tue, 18 May 2010 02:13:44 +0100

View File

@ -1361,6 +1361,7 @@ CONFIG_IFB=m
CONFIG_DUMMY=m
CONFIG_BONDING=m
CONFIG_MACVLAN=m
CONFIG_MACVTAP=m
CONFIG_EQUALIZER=m
CONFIG_TUN=m
CONFIG_VETH=m

View File

@ -1,6 +1,8 @@
[abi]
abiname: 5
ignore-changes: module:crypto/cryptd
module:drivers/net/macvlan
module:drivers/net/macvtap
[base]
arches:

View File

@ -0,0 +1,130 @@
From 564517e804c9c6d4e29c270bfc1517404d27107b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 11 Feb 2010 05:55:39 +0000
Subject: [PATCH 2/5] net/macvtap: fix reference counting
The RCU usage in the original code was broken because
there are cases where we possibly sleep with rcu_read_lock
held. As a fix, change the macvtap_file_get_queue to
get a reference on the socket and the netdev instead of
taking the full rcu_read_lock.
Also, change macvtap_file_get_queue failure case to
not require a subsequent macvtap_file_put_queue, as
pointed out by Ed Swierk.
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Ed Swierk <eswierk@aristanetworks.com>
Cc: Sridhar Samudrala <sri@us.ibm.com>
Acked-by: Sridhar Samudrala <sri@us.ibm.com>
Acked-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/macvtap.c | 35 ++++++++++++++++++++++-------------
1 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index ad1f6ef..fe7656b 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -70,7 +70,8 @@ static struct cdev macvtap_cdev;
* exists.
*
* The callbacks from macvlan are always done with rcu_read_lock held
- * already, while in the file_operations, we get it ourselves.
+ * already. For calls from file_operations, we use the rcu_read_lock_bh
+ * to get a reference count on the socket and the device.
*
* When destroying a queue, we remove the pointers from the file and
* from the dev and then synchronize_rcu to make sure no thread is
@@ -159,13 +160,21 @@ static void macvtap_del_queues(struct net_device *dev)
static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
{
+ struct macvtap_queue *q;
rcu_read_lock_bh();
- return rcu_dereference(file->private_data);
+ q = rcu_dereference(file->private_data);
+ if (q) {
+ sock_hold(&q->sk);
+ dev_hold(q->vlan->dev);
+ }
+ rcu_read_unlock_bh();
+ return q;
}
-static inline void macvtap_file_put_queue(void)
+static inline void macvtap_file_put_queue(struct macvtap_queue *q)
{
- rcu_read_unlock_bh();
+ sock_put(&q->sk);
+ dev_put(q->vlan->dev);
}
/*
@@ -314,8 +323,8 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
sock_writeable(&q->sk)))
mask |= POLLOUT | POLLWRNORM;
+ macvtap_file_put_queue(q);
out:
- macvtap_file_put_queue();
return mask;
}
@@ -366,8 +375,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
result = macvtap_get_user(q, iv, iov_length(iv, count),
file->f_flags & O_NONBLOCK);
+ macvtap_file_put_queue(q);
out:
- macvtap_file_put_queue();
return result;
}
@@ -398,10 +407,8 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
struct sk_buff *skb;
ssize_t len, ret = 0;
- if (!q) {
- ret = -ENOLINK;
- goto out;
- }
+ if (!q)
+ return -ENOLINK;
len = iov_length(iv, count);
if (len < 0) {
@@ -437,7 +444,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
remove_wait_queue(q->sk.sk_sleep, &wait);
out:
- macvtap_file_put_queue();
+ macvtap_file_put_queue(q);
return ret;
}
@@ -468,7 +475,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
if (!q)
return -ENOLINK;
memcpy(devname, q->vlan->dev->name, sizeof(devname));
- macvtap_file_put_queue();
+ macvtap_file_put_queue(q);
if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
@@ -485,8 +492,10 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
return -EFAULT;
q = macvtap_file_get_queue(file);
+ if (!q)
+ return -ENOLINK;
q->sk.sk_sndbuf = u;
- macvtap_file_put_queue();
+ macvtap_file_put_queue(q);
return 0;
case TUNSETOFFLOAD:
--
1.7.1

View File

@ -0,0 +1,390 @@
From 02df55d28c6001a3cdb7a997a34a0b01f01d015e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 18 Feb 2010 05:45:36 +0000
Subject: [PATCH 3/5] macvtap: rework object lifetime rules
This reworks the change done by the previous patch
in a more complete way.
The original macvtap code has a number of problems
resulting from the use of RCU for protecting the
access to struct macvtap_queue from open files.
This includes
- need for GFP_ATOMIC allocations for skbs
- potential deadlocks when copy_*_user sleeps
- inability to work with vhost-net
Changing the lifetime of macvtap_queue to always
depend on the open file solves all these. The
RCU reference simply moves one step down to
the reference on the macvlan_dev, which we
only need for nonblocking operations.
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/macvtap.c | 183 ++++++++++++++++++++++++-------------------------
1 files changed, 91 insertions(+), 92 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index fe7656b..7050997 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -60,30 +60,19 @@ static struct cdev macvtap_cdev;
/*
* RCU usage:
- * The macvtap_queue is referenced both from the chardev struct file
- * and from the struct macvlan_dev using rcu_read_lock.
+ * The macvtap_queue and the macvlan_dev are loosely coupled, the
+ * pointers from one to the other can only be read while rcu_read_lock
+ * or macvtap_lock is held.
*
- * We never actually update the contents of a macvtap_queue atomically
- * with RCU but it is used for race-free destruction of a queue when
- * either the file or the macvlan_dev goes away. Pointers back to
- * the dev and the file are implicitly valid as long as the queue
- * exists.
+ * Both the file and the macvlan_dev hold a reference on the macvtap_queue
+ * through sock_hold(&q->sk). When the macvlan_dev goes away first,
+ * q->vlan becomes inaccessible. When the files gets closed,
+ * macvtap_get_queue() fails.
*
- * The callbacks from macvlan are always done with rcu_read_lock held
- * already. For calls from file_operations, we use the rcu_read_lock_bh
- * to get a reference count on the socket and the device.
- *
- * When destroying a queue, we remove the pointers from the file and
- * from the dev and then synchronize_rcu to make sure no thread is
- * still using the queue. There may still be references to the struct
- * sock inside of the queue from outbound SKBs, but these never
- * reference back to the file or the dev. The data structure is freed
- * through __sk_free when both our references and any pending SKBs
- * are gone.
- *
- * macvtap_lock is only used to prevent multiple concurrent open()
- * calls to assign a new vlan->tap pointer. It could be moved into
- * the macvlan_dev itself but is extremely rarely used.
+ * There may still be references to the struct sock inside of the
+ * queue from outbound SKBs, but these never reference back to the
+ * file or the dev. The data structure is freed through __sk_free
+ * when both our references and any pending SKBs are gone.
*/
static DEFINE_SPINLOCK(macvtap_lock);
@@ -101,11 +90,12 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file,
goto out;
err = 0;
- q->vlan = vlan;
+ rcu_assign_pointer(q->vlan, vlan);
rcu_assign_pointer(vlan->tap, q);
+ sock_hold(&q->sk);
q->file = file;
- rcu_assign_pointer(file->private_data, q);
+ file->private_data = q;
out:
spin_unlock(&macvtap_lock);
@@ -113,28 +103,25 @@ out:
}
/*
- * We must destroy each queue exactly once, when either
- * the netdev or the file go away.
+ * The file owning the queue got closed, give up both
+ * the reference that the files holds as well as the
+ * one from the macvlan_dev if that still exists.
*
* Using the spinlock makes sure that we don't get
* to the queue again after destroying it.
- *
- * synchronize_rcu serializes with the packet flow
- * that uses rcu_read_lock.
*/
-static void macvtap_del_queue(struct macvtap_queue **qp)
+static void macvtap_put_queue(struct macvtap_queue *q)
{
- struct macvtap_queue *q;
+ struct macvlan_dev *vlan;
spin_lock(&macvtap_lock);
- q = rcu_dereference(*qp);
- if (!q) {
- spin_unlock(&macvtap_lock);
- return;
+ vlan = rcu_dereference(q->vlan);
+ if (vlan) {
+ rcu_assign_pointer(vlan->tap, NULL);
+ rcu_assign_pointer(q->vlan, NULL);
+ sock_put(&q->sk);
}
- rcu_assign_pointer(q->vlan->tap, NULL);
- rcu_assign_pointer(q->file->private_data, NULL);
spin_unlock(&macvtap_lock);
synchronize_rcu();
@@ -152,29 +139,29 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
return rcu_dereference(vlan->tap);
}
+/*
+ * The net_device is going away, give up the reference
+ * that it holds on the queue (all the queues one day)
+ * and safely set the pointer from the queues to NULL.
+ */
static void macvtap_del_queues(struct net_device *dev)
{
struct macvlan_dev *vlan = netdev_priv(dev);
- macvtap_del_queue(&vlan->tap);
-}
-
-static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
-{
struct macvtap_queue *q;
- rcu_read_lock_bh();
- q = rcu_dereference(file->private_data);
- if (q) {
- sock_hold(&q->sk);
- dev_hold(q->vlan->dev);
+
+ spin_lock(&macvtap_lock);
+ q = rcu_dereference(vlan->tap);
+ if (!q) {
+ spin_unlock(&macvtap_lock);
+ return;
}
- rcu_read_unlock_bh();
- return q;
-}
-static inline void macvtap_file_put_queue(struct macvtap_queue *q)
-{
+ rcu_assign_pointer(vlan->tap, NULL);
+ rcu_assign_pointer(q->vlan, NULL);
+ spin_unlock(&macvtap_lock);
+
+ synchronize_rcu();
sock_put(&q->sk);
- dev_put(q->vlan->dev);
}
/*
@@ -284,7 +271,6 @@ static int macvtap_open(struct inode *inode, struct file *file)
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
sock_init_data(&q->sock, &q->sk);
- q->sk.sk_allocation = GFP_ATOMIC; /* for now */
q->sk.sk_write_space = macvtap_sock_write_space;
err = macvtap_set_queue(dev, file, q);
@@ -300,13 +286,14 @@ out:
static int macvtap_release(struct inode *inode, struct file *file)
{
- macvtap_del_queue((struct macvtap_queue **)&file->private_data);
+ struct macvtap_queue *q = file->private_data;
+ macvtap_put_queue(q);
return 0;
}
static unsigned int macvtap_poll(struct file *file, poll_table * wait)
{
- struct macvtap_queue *q = macvtap_file_get_queue(file);
+ struct macvtap_queue *q = file->private_data;
unsigned int mask = POLLERR;
if (!q)
@@ -323,7 +310,6 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
sock_writeable(&q->sk)))
mask |= POLLOUT | POLLWRNORM;
- macvtap_file_put_queue(q);
out:
return mask;
}
@@ -334,6 +320,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
int noblock)
{
struct sk_buff *skb;
+ struct macvlan_dev *vlan;
size_t len = count;
int err;
@@ -341,26 +328,37 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
return -EINVAL;
skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
-
- if (!skb) {
- macvlan_count_rx(q->vlan, 0, false, false);
- return err;
- }
+ if (!skb)
+ goto err;
skb_reserve(skb, NET_IP_ALIGN);
skb_put(skb, count);
- if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
- macvlan_count_rx(q->vlan, 0, false, false);
- kfree_skb(skb);
- return -EFAULT;
- }
+ err = skb_copy_datagram_from_iovec(skb, 0, iv, 0, len);
+ if (err)
+ goto err;
skb_set_network_header(skb, ETH_HLEN);
-
- macvlan_start_xmit(skb, q->vlan->dev);
+ rcu_read_lock_bh();
+ vlan = rcu_dereference(q->vlan);
+ if (vlan)
+ macvlan_start_xmit(skb, vlan->dev);
+ else
+ kfree_skb(skb);
+ rcu_read_unlock_bh();
return count;
+
+err:
+ rcu_read_lock_bh();
+ vlan = rcu_dereference(q->vlan);
+ if (vlan)
+ macvlan_count_rx(q->vlan, 0, false, false);
+ rcu_read_unlock_bh();
+
+ kfree_skb(skb);
+
+ return err;
}
static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -368,15 +366,10 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
{
struct file *file = iocb->ki_filp;
ssize_t result = -ENOLINK;
- struct macvtap_queue *q = macvtap_file_get_queue(file);
-
- if (!q)
- goto out;
+ struct macvtap_queue *q = file->private_data;
result = macvtap_get_user(q, iv, iov_length(iv, count),
file->f_flags & O_NONBLOCK);
- macvtap_file_put_queue(q);
-out:
return result;
}
@@ -385,14 +378,17 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
const struct sk_buff *skb,
const struct iovec *iv, int len)
{
- struct macvlan_dev *vlan = q->vlan;
+ struct macvlan_dev *vlan;
int ret;
len = min_t(int, skb->len, len);
ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+ rcu_read_lock_bh();
+ vlan = rcu_dereference(q->vlan);
macvlan_count_rx(vlan, len, ret == 0, 0);
+ rcu_read_unlock_bh();
return ret ? ret : len;
}
@@ -401,14 +397,16 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
unsigned long count, loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct macvtap_queue *q = macvtap_file_get_queue(file);
+ struct macvtap_queue *q = file->private_data;
DECLARE_WAITQUEUE(wait, current);
struct sk_buff *skb;
ssize_t len, ret = 0;
- if (!q)
- return -ENOLINK;
+ if (!q) {
+ ret = -ENOLINK;
+ goto out;
+ }
len = iov_length(iv, count);
if (len < 0) {
@@ -444,7 +442,6 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
remove_wait_queue(q->sk.sk_sleep, &wait);
out:
- macvtap_file_put_queue(q);
return ret;
}
@@ -454,12 +451,13 @@ out:
static long macvtap_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- struct macvtap_queue *q;
+ struct macvtap_queue *q = file->private_data;
+ struct macvlan_dev *vlan;
void __user *argp = (void __user *)arg;
struct ifreq __user *ifr = argp;
unsigned int __user *up = argp;
unsigned int u;
- char devname[IFNAMSIZ];
+ int ret;
switch (cmd) {
case TUNSETIFF:
@@ -471,16 +469,21 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
return 0;
case TUNGETIFF:
- q = macvtap_file_get_queue(file);
- if (!q)
+ rcu_read_lock_bh();
+ vlan = rcu_dereference(q->vlan);
+ if (vlan)
+ dev_hold(vlan->dev);
+ rcu_read_unlock_bh();
+
+ if (!vlan)
return -ENOLINK;
- memcpy(devname, q->vlan->dev->name, sizeof(devname));
- macvtap_file_put_queue(q);
+ ret = 0;
if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
- return -EFAULT;
- return 0;
+ ret = -EFAULT;
+ dev_put(vlan->dev);
+ return ret;
case TUNGETFEATURES:
if (put_user((IFF_TAP | IFF_NO_PI), up))
@@ -491,11 +494,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
if (get_user(u, up))
return -EFAULT;
- q = macvtap_file_get_queue(file);
- if (!q)
- return -ENOLINK;
q->sk.sk_sndbuf = u;
- macvtap_file_put_queue(q);
return 0;
case TUNSETOFFLOAD:
--
1.7.1

View File

@ -0,0 +1,55 @@
From cbbef5e183079455763fc470ccf69008f92ab4b6 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 10 Nov 2009 06:14:24 +0000
Subject: [PATCH] vlan/macvlan: propagate transmission state to upper layers
Both vlan and macvlan devices usually don't use a qdisc and immediately
queue packets to the underlying device. Propagate transmission state of
the underlying device to the upper layers so they can react on congestion
and/or inform the sending process.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/macvlan.c | 2 +-
net/8021q/vlan_dev.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d7dba3f..271aa7e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -202,7 +202,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
} else
txq->tx_dropped++;
- return NETDEV_TX_OK;
+ return ret;
}
static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 790fd55..9159659 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -332,7 +332,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
} else
txq->tx_dropped++;
- return NETDEV_TX_OK;
+ return ret;
}
static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
@@ -358,7 +358,7 @@ static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
} else
txq->tx_dropped++;
- return NETDEV_TX_OK;
+ return ret;
}
static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
--
1.7.1

View File

@ -0,0 +1,30 @@
From 6eb3a8553345ba2b4efd5390709e158289b9ece4 Mon Sep 17 00:00:00 2001
From: Patrick Mullaney <pmullaney@novell.com>
Date: Sat, 16 Jan 2010 01:05:38 -0800
Subject: [PATCH] macvlan: add GRO bit to features mask
Allow macvlan devices to support GRO.
Signed-off-by: Patrick Mullaney <pmullaney@novell.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/macvlan.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 21a9c9a..fa0dc51 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -418,7 +418,7 @@ static struct lock_class_key macvlan_netdev_addr_lock_key;
#define MACVLAN_FEATURES \
(NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \
- NETIF_F_TSO_ECN | NETIF_F_TSO6)
+ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO)
#define MACVLAN_STATE_MASK \
((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))
--
1.7.1

View File

@ -0,0 +1,335 @@
From fc0663d6b5e6d8e9b57f872a644c0aafd82361b7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 30 Jan 2010 12:23:40 +0000
Subject: [PATCH] macvlan: allow multiple driver backends
This makes it possible to hook into the macvlan driver
from another kernel module. In particular, the goal is
to extend it with the macvtap backend that provides
a tun/tap compatible interface directly on the macvlan
device.
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
[bwh: Backport to Debian's 2.6.32]
---
drivers/net/macvlan.c | 113 +++++++++++++++++++-------------------------
include/linux/if_macvlan.h | 70 +++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 64 deletions(-)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d32e0bd..40faa36 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -39,30 +39,6 @@ struct macvlan_port {
struct list_head vlans;
};
-/**
- * struct macvlan_rx_stats - MACVLAN percpu rx stats
- * @rx_packets: number of received packets
- * @rx_bytes: number of received bytes
- * @multicast: number of received multicast packets
- * @rx_errors: number of errors
- */
-struct macvlan_rx_stats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long multicast;
- unsigned long rx_errors;
-};
-
-struct macvlan_dev {
- struct net_device *dev;
- struct list_head list;
- struct hlist_node hlist;
- struct macvlan_port *port;
- struct net_device *lowerdev;
- struct macvlan_rx_stats *rx_stats;
- enum macvlan_mode mode;
-};
-
/* From 2.6.33 net/core/dev.c */
static int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
@@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port,
return 0;
}
-static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
- unsigned int len, bool success,
- bool multicast)
-{
- struct macvlan_rx_stats *rx_stats;
-
- rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
- if (likely(success)) {
- rx_stats->rx_packets++;;
- rx_stats->rx_bytes += len;
- if (multicast)
- rx_stats->multicast++;
- } else {
- rx_stats->rx_errors++;
- }
-}
-static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
+static int macvlan_broadcast_one(struct sk_buff *skb,
+ const struct macvlan_dev *vlan,
const struct ethhdr *eth, bool local)
{
+ struct net_device *dev = vlan->dev;
if (!skb)
return NET_RX_DROP;
if (local)
- return dev_forward_skb(dev, skb);
+ return vlan->forward(dev, skb);
skb->dev = dev;
if (!compare_ether_addr_64bits(eth->h_dest,
@@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
else
skb->pkt_type = PACKET_MULTICAST;
- return netif_rx(skb);
+ return vlan->receive(skb);
}
static void macvlan_broadcast(struct sk_buff *skb,
@@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb,
continue;
nskb = skb_clone(skb, GFP_ATOMIC);
- err = macvlan_broadcast_one(nskb, vlan->dev, eth,
+ err = macvlan_broadcast_one(nskb, vlan, eth,
mode == MACVLAN_MODE_BRIDGE);
macvlan_count_rx(vlan, skb->len + ETH_HLEN,
err == NET_RX_SUCCESS, 1);
@@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
- netif_rx(skb);
+ vlan->receive(skb);
return NULL;
}
@@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
dest = macvlan_hash_lookup(port, eth->h_dest);
if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
unsigned int length = skb->len + ETH_HLEN;
- int ret = dev_forward_skb(dest->dev, skb);
+ int ret = dest->forward(dest->dev, skb);
macvlan_count_rx(dest, length,
ret == NET_RX_SUCCESS, 0);
@@ -273,8 +234,8 @@ xmit_world:
return dev_queue_xmit(skb);
}
-static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
- struct net_device *dev)
+netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
int i = skb_get_queue_mapping(skb);
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
@@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
return ret;
}
+EXPORT_SYMBOL_GPL(macvlan_start_xmit);
static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type, const void *daddr,
@@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net,
return 0;
}
-static int macvlan_newlink(struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+int macvlan_common_newlink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb))
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port;
@@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
vlan->lowerdev = lowerdev;
vlan->dev = dev;
vlan->port = port;
+ vlan->receive = receive;
+ vlan->forward = forward;
vlan->mode = MACVLAN_MODE_VEPA;
if (data && data[IFLA_MACVLAN_MODE])
@@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
netif_stacked_transfer_operstate(lowerdev, dev);
return 0;
}
+EXPORT_SYMBOL_GPL(macvlan_common_newlink);
-static void macvlan_dellink(struct net_device *dev)
+static int macvlan_newlink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ return macvlan_common_newlink(dev, tb, data,
+ netif_rx,
+ dev_forward_skb);
+}
+
+void macvlan_dellink(struct net_device *dev)
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port = vlan->port;
@@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head)
if (list_empty(&port->vlans))
macvlan_port_destroy(port->dev);
}
+EXPORT_SYMBOL_GPL(macvlan_dellink);
static int macvlan_changelink(struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
@@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
[IFLA_MACVLAN_MODE] = { .type = NLA_U32 },
};
-static struct rtnl_link_ops macvlan_link_ops __read_mostly = {
+int macvlan_link_register(struct rtnl_link_ops *ops)
+{
+ /* common fields */
+ ops->priv_size = sizeof(struct macvlan_dev);
+ ops->get_tx_queues = macvlan_get_tx_queues;
+ ops->setup = macvlan_setup;
+ ops->validate = macvlan_validate;
+ ops->maxtype = IFLA_MACVLAN_MAX;
+ ops->policy = macvlan_policy;
+ ops->changelink = macvlan_changelink;
+ ops->get_size = macvlan_get_size;
+ ops->fill_info = macvlan_fill_info;
+
+ return rtnl_link_register(ops);
+};
+EXPORT_SYMBOL_GPL(macvlan_link_register);
+
+static struct rtnl_link_ops macvlan_link_ops = {
.kind = "macvlan",
- .priv_size = sizeof(struct macvlan_dev),
- .get_tx_queues = macvlan_get_tx_queues,
- .setup = macvlan_setup,
- .validate = macvlan_validate,
.newlink = macvlan_newlink,
.dellink = macvlan_dellink,
- .maxtype = IFLA_MACVLAN_MAX,
- .policy = macvlan_policy,
- .changelink = macvlan_changelink,
- .get_size = macvlan_get_size,
- .fill_info = macvlan_fill_info,
};
static int macvlan_device_event(struct notifier_block *unused,
@@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused,
break;
case NETDEV_UNREGISTER:
list_for_each_entry_safe(vlan, next, &port->vlans, list)
- macvlan_dellink(vlan->dev);
+ vlan->dev->rtnl_link_ops->dellink(vlan->dev);
break;
}
return NOTIFY_DONE;
@@ -778,7 +763,7 @@ static int __init macvlan_init_module(void)
register_netdevice_notifier(&macvlan_notifier_block);
macvlan_handle_frame_hook = macvlan_handle_frame;
- err = rtnl_link_register(&macvlan_link_ops);
+ err = macvlan_link_register(&macvlan_link_ops);
if (err < 0)
goto err1;
return 0;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 5f200ba..9a11544 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -1,6 +1,76 @@
#ifndef _LINUX_IF_MACVLAN_H
#define _LINUX_IF_MACVLAN_H
+#include <linux/if_link.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/netlink.h>
+
+struct macvlan_port;
+struct macvtap_queue;
+
+/**
+ * struct macvlan_rx_stats - MACVLAN percpu rx stats
+ * @rx_packets: number of received packets
+ * @rx_bytes: number of received bytes
+ * @multicast: number of received multicast packets
+ * @rx_errors: number of errors
+ */
+struct macvlan_rx_stats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long multicast;
+ unsigned long rx_errors;
+};
+
+struct macvlan_dev {
+ struct net_device *dev;
+ struct list_head list;
+ struct hlist_node hlist;
+ struct macvlan_port *port;
+ struct net_device *lowerdev;
+ struct macvlan_rx_stats *rx_stats;
+ enum macvlan_mode mode;
+ int (*receive)(struct sk_buff *skb);
+ int (*forward)(struct net_device *dev, struct sk_buff *skb);
+};
+
+static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast)
+{
+ struct macvlan_rx_stats *rx_stats;
+
+ rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
+ if (likely(success)) {
+ rx_stats->rx_packets++;;
+ rx_stats->rx_bytes += len;
+ if (multicast)
+ rx_stats->multicast++;
+ } else {
+ rx_stats->rx_errors++;
+ }
+}
+
+extern int macvlan_common_newlink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb));
+
+extern void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast);
+
+extern void macvlan_dellink(struct net_device *dev);
+
+extern int macvlan_link_register(struct rtnl_link_ops *ops);
+
+extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+
+
extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *);
#endif /* _LINUX_IF_MACVLAN_H */
--
1.7.1

View File

@ -0,0 +1,333 @@
From b9fb9ee07e67fce0b7bfd517a48710465706c30a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 18 Feb 2010 05:48:17 +0000
Subject: [PATCH 5/5] macvtap: add GSO/csum offload support
Added flags field to macvtap_queue to enable/disable processing of
virtio_net_hdr via IFF_VNET_HDR. This flag is checked to prepend virtio_net_hdr
in the receive path and process/skip virtio_net_hdr in the send path.
Original patch by Sridhar, further changes by Arnd.
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/macvtap.c | 206 +++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 182 insertions(+), 24 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index e354501..55ceae0 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -17,6 +17,7 @@
#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
+#include <linux/virtio_net.h>
/*
* A macvtap queue is the central object of this driver, it connects
@@ -37,6 +38,7 @@ struct macvtap_queue {
struct socket sock;
struct macvlan_dev *vlan;
struct file *file;
+ unsigned int flags;
};
static struct proto macvtap_proto = {
@@ -276,6 +278,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
q->sock.ops = &macvtap_socket_ops;
sock_init_data(&q->sock, &q->sk);
q->sk.sk_write_space = macvtap_sock_write_space;
+ q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
err = macvtap_set_queue(dev, file, q);
if (err)
@@ -318,6 +321,111 @@ out:
return mask;
}
+static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
+ size_t len, size_t linear,
+ int noblock, int *err)
+{
+ struct sk_buff *skb;
+
+ /* Under a page? Don't bother with paged skb. */
+ if (prepad + len < PAGE_SIZE || !linear)
+ linear = len;
+
+ skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
+ err);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, prepad);
+ skb_put(skb, linear);
+ skb->data_len = len - linear;
+ skb->len += len - linear;
+
+ return skb;
+}
+
+/*
+ * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
+ * be shared with the tun/tap driver.
+ */
+static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
+ struct virtio_net_hdr *vnet_hdr)
+{
+ unsigned short gso_type = 0;
+ if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ gso_type = SKB_GSO_TCPV4;
+ break;
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ gso_type = SKB_GSO_TCPV6;
+ break;
+ case VIRTIO_NET_HDR_GSO_UDP:
+ gso_type = SKB_GSO_UDP;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+ gso_type |= SKB_GSO_TCP_ECN;
+
+ if (vnet_hdr->gso_size == 0)
+ return -EINVAL;
+ }
+
+ if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
+ vnet_hdr->csum_offset))
+ return -EINVAL;
+ }
+
+ if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
+ skb_shinfo(skb)->gso_type = gso_type;
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+ return 0;
+}
+
+static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
+ struct virtio_net_hdr *vnet_hdr)
+{
+ memset(vnet_hdr, 0, sizeof(*vnet_hdr));
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ /* This is a hint as to how much should be linear. */
+ vnet_hdr->hdr_len = skb_headlen(skb);
+ vnet_hdr->gso_size = sinfo->gso_size;
+ if (sinfo->gso_type & SKB_GSO_TCPV4)
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else if (sinfo->gso_type & SKB_GSO_TCPV6)
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ else if (sinfo->gso_type & SKB_GSO_UDP)
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+ else
+ BUG();
+ if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+ vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+ } else
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ vnet_hdr->csum_start = skb->csum_start -
+ skb_headroom(skb);
+ vnet_hdr->csum_offset = skb->csum_offset;
+ } /* else everything is zero */
+
+ return 0;
+}
+
+
/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q,
const struct iovec *iv, size_t count,
@@ -327,22 +435,53 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
struct macvlan_dev *vlan;
size_t len = count;
int err;
+ struct virtio_net_hdr vnet_hdr = { 0 };
+ int vnet_hdr_len = 0;
+
+ if (q->flags & IFF_VNET_HDR) {
+ vnet_hdr_len = sizeof(vnet_hdr);
+
+ err = -EINVAL;
+ if ((len -= vnet_hdr_len) < 0)
+ goto err;
+
+ err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
+ vnet_hdr_len);
+ if (err < 0)
+ goto err;
+ if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+ vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
+ vnet_hdr.hdr_len)
+ vnet_hdr.hdr_len = vnet_hdr.csum_start +
+ vnet_hdr.csum_offset + 2;
+ err = -EINVAL;
+ if (vnet_hdr.hdr_len > len)
+ goto err;
+ }
+ err = -EINVAL;
if (unlikely(len < ETH_HLEN))
- return -EINVAL;
+ goto err;
- skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+ skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
+ noblock, &err);
if (!skb)
goto err;
- skb_reserve(skb, NET_IP_ALIGN);
- skb_put(skb, count);
-
- err = skb_copy_datagram_from_iovec(skb, 0, iv, 0, len);
+ err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
if (err)
- goto err;
+ goto err_kfree;
skb_set_network_header(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth_hdr(skb)->h_proto;
+
+ if (vnet_hdr_len) {
+ err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
+ if (err)
+ goto err_kfree;
+ }
+
rcu_read_lock_bh();
vlan = rcu_dereference(q->vlan);
if (vlan)
@@ -353,15 +492,16 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
return count;
+err_kfree:
+ kfree_skb(skb);
+
err:
rcu_read_lock_bh();
vlan = rcu_dereference(q->vlan);
if (vlan)
- macvlan_count_rx(q->vlan, 0, false, false);
+ netdev_get_tx_queue(vlan->dev, 0)->tx_dropped++;
rcu_read_unlock_bh();
- kfree_skb(skb);
-
return err;
}
@@ -384,10 +524,25 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
{
struct macvlan_dev *vlan;
int ret;
+ int vnet_hdr_len = 0;
+
+ if (q->flags & IFF_VNET_HDR) {
+ struct virtio_net_hdr vnet_hdr;
+ vnet_hdr_len = sizeof (vnet_hdr);
+ if ((len -= vnet_hdr_len) < 0)
+ return -EINVAL;
+
+ ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
+ if (ret)
+ return ret;
+
+ if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, vnet_hdr_len))
+ return -EFAULT;
+ }
len = min_t(int, skb->len, len);
- ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+ ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
rcu_read_lock_bh();
vlan = rcu_dereference(q->vlan);
@@ -395,7 +550,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
macvlan_count_rx(vlan, len, ret == 0, 0);
rcu_read_unlock_bh();
- return ret ? ret : len;
+ return ret ? ret : (len + vnet_hdr_len);
}
static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
@@ -473,9 +628,14 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
/* ignore the name, just look at flags */
if (get_user(u, &ifr->ifr_flags))
return -EFAULT;
- if (u != (IFF_TAP | IFF_NO_PI))
- return -EINVAL;
- return 0;
+
+ ret = 0;
+ if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
+ ret = -EINVAL;
+ else
+ q->flags = u;
+
+ return ret;
case TUNGETIFF:
rcu_read_lock_bh();
@@ -489,13 +649,13 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
ret = 0;
if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
- put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
+ put_user(q->flags, &ifr->ifr_flags))
ret = -EFAULT;
dev_put(vlan->dev);
return ret;
case TUNGETFEATURES:
- if (put_user((IFF_TAP | IFF_NO_PI), up))
+ if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
return -EFAULT;
return 0;
@@ -509,15 +669,13 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
case TUNSETOFFLOAD:
/* let the user check for future flags */
if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
- TUN_F_TSO_ECN | TUN_F_UFO))
- return -EINVAL;
-
- /* TODO: add support for these, so far we don't
- support any offload */
- if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
- TUN_F_TSO_ECN | TUN_F_UFO))
+ TUN_F_TSO_ECN | TUN_F_UFO))
return -EINVAL;
+ /* TODO: only accept frames with the features that
+ got enabled for forwarded frames */
+ if (!(q->flags & IFF_VNET_HDR))
+ return -EINVAL;
return 0;
default:
--
1.7.1

View File

@ -0,0 +1,672 @@
From 20d29d7a916a47bf533b5709437fe735b6b5b79e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 30 Jan 2010 12:24:26 +0000
Subject: [PATCH 1/5] net: macvtap driver
In order to use macvlan with qemu and other tools that require
a tap file descriptor, the macvtap driver adds a small backend
with a character device with the same interface as the tun
driver, with a minimum set of features.
Macvtap interfaces are created in the same way as macvlan
interfaces using ip link, but the netif is just used as a
handle for configuration and accounting, while the data
goes through the chardev. Each macvtap interface has its
own character device, simplifying permission management
significantly over the generic tun/tap driver.
Cc: Patrick McHardy <kaber@trash.net>
Cc: Stephen Hemminger <shemminger@linux-foundation.org>
Cc: David S. Miller" <davem@davemloft.net>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Or Gerlitz <ogerlitz@voltaire.com>
Cc: netdev@vger.kernel.org
Cc: bridge@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
[bwh: Backport to Debian's 2.6.32]
---
drivers/net/Kconfig | 12 +
drivers/net/Makefile | 1 +
drivers/net/macvtap.c | 581 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/if_macvlan.h | 1 +
4 files changed, 595 insertions(+), 0 deletions(-)
create mode 100644 drivers/net/macvtap.c
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index cb0e534..411e207 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -90,6 +90,18 @@ config MACVLAN
To compile this driver as a module, choose M here: the module
will be called macvlan.
+config MACVTAP
+ tristate "MAC-VLAN based tap driver (EXPERIMENTAL)"
+ depends on MACVLAN
+ help
+ This adds a specialized tap character device driver that is based
+ on the MAC-VLAN network interface, called macvtap. A macvtap device
+ can be added in the same way as a macvlan device, using 'type
+ macvlan', and then be accessed through the tap user space interface.
+
+ To compile this driver as a module, choose M here: the module
+ will be called macvtap.
+
config EQUALIZER
tristate "EQL (serial line load balancing) support"
---help---
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 0b763cb..9595803 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
obj-$(CONFIG_MACVLAN) += macvlan.o
+obj-$(CONFIG_MACVTAP) += macvtap.o
obj-$(CONFIG_DE600) += de600.o
obj-$(CONFIG_DE620) += de620.o
obj-$(CONFIG_LANCE) += lance.o
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
new file mode 100644
index 0000000..ad1f6ef
--- /dev/null
+++ b/drivers/net/macvtap.c
@@ -0,0 +1,579 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+/*
+ * A macvtap queue is the central object of this driver, it connects
+ * an open character device to a macvlan interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * macvtap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ * TODO: multiqueue support is currently not implemented, even though
+ * macvtap is basically prepared for that. We will need to add this
+ * here as well as in virtio-net and qemu to get line rate on 10gbit
+ * adapters from a guest.
+ */
+struct macvtap_queue {
+ struct sock sk;
+ struct socket sock;
+ struct macvlan_dev *vlan;
+ struct file *file;
+};
+
+static struct proto macvtap_proto = {
+ .name = "macvtap",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof (struct macvtap_queue),
+};
+
+/*
+ * Minor number matches netdev->ifindex, so need a potentially
+ * large value. This also makes it possible to split the
+ * tap functionality out again in the future by offering it
+ * from other drivers besides macvtap. As long as every device
+ * only has one tap, the interface numbers assure that the
+ * device nodes are unique.
+ */
+static unsigned int macvtap_major;
+#define MACVTAP_NUM_DEVS 65536
+static struct class *macvtap_class;
+static struct cdev macvtap_cdev;
+
+/*
+ * RCU usage:
+ * The macvtap_queue is referenced both from the chardev struct file
+ * and from the struct macvlan_dev using rcu_read_lock.
+ *
+ * We never actually update the contents of a macvtap_queue atomically
+ * with RCU but it is used for race-free destruction of a queue when
+ * either the file or the macvlan_dev goes away. Pointers back to
+ * the dev and the file are implicitly valid as long as the queue
+ * exists.
+ *
+ * The callbacks from macvlan are always done with rcu_read_lock held
+ * already, while in the file_operations, we get it ourselves.
+ *
+ * When destroying a queue, we remove the pointers from the file and
+ * from the dev and then synchronize_rcu to make sure no thread is
+ * still using the queue. There may still be references to the struct
+ * sock inside of the queue from outbound SKBs, but these never
+ * reference back to the file or the dev. The data structure is freed
+ * through __sk_free when both our references and any pending SKBs
+ * are gone.
+ *
+ * macvtap_lock is only used to prevent multiple concurrent open()
+ * calls to assign a new vlan->tap pointer. It could be moved into
+ * the macvlan_dev itself but is extremely rarely used.
+ */
+static DEFINE_SPINLOCK(macvtap_lock);
+
+/*
+ * Choose the next free queue, for now there is only one
+ */
+static int macvtap_set_queue(struct net_device *dev, struct file *file,
+ struct macvtap_queue *q)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ int err = -EBUSY;
+
+ spin_lock(&macvtap_lock);
+ if (rcu_dereference(vlan->tap))
+ goto out;
+
+ err = 0;
+ q->vlan = vlan;
+ rcu_assign_pointer(vlan->tap, q);
+
+ q->file = file;
+ rcu_assign_pointer(file->private_data, q);
+
+out:
+ spin_unlock(&macvtap_lock);
+ return err;
+}
+
+/*
+ * We must destroy each queue exactly once, when either
+ * the netdev or the file go away.
+ *
+ * Using the spinlock makes sure that we don't get
+ * to the queue again after destroying it.
+ *
+ * synchronize_rcu serializes with the packet flow
+ * that uses rcu_read_lock.
+ */
+static void macvtap_del_queue(struct macvtap_queue **qp)
+{
+ struct macvtap_queue *q;
+
+ spin_lock(&macvtap_lock);
+ q = rcu_dereference(*qp);
+ if (!q) {
+ spin_unlock(&macvtap_lock);
+ return;
+ }
+
+ rcu_assign_pointer(q->vlan->tap, NULL);
+ rcu_assign_pointer(q->file->private_data, NULL);
+ spin_unlock(&macvtap_lock);
+
+ synchronize_rcu();
+ sock_put(&q->sk);
+}
+
+/*
+ * Since we only support one queue, just dereference the pointer.
+ */
+static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+
+ return rcu_dereference(vlan->tap);
+}
+
+static void macvtap_del_queues(struct net_device *dev)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ macvtap_del_queue(&vlan->tap);
+}
+
+static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
+{
+ rcu_read_lock_bh();
+ return rcu_dereference(file->private_data);
+}
+
+static inline void macvtap_file_put_queue(void)
+{
+ rcu_read_unlock_bh();
+}
+
+/*
+ * Forward happens for data that gets sent from one macvlan
+ * endpoint to another one in bridge mode. We just take
+ * the skb and put it into the receive queue.
+ */
+static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
+{
+ struct macvtap_queue *q = macvtap_get_queue(dev, skb);
+ if (!q)
+ return -ENOLINK;
+
+ skb_queue_tail(&q->sk.sk_receive_queue, skb);
+ wake_up(q->sk.sk_sleep);
+ return 0;
+}
+
+/*
+ * Receive is for data from the external interface (lowerdev),
+ * in case of macvtap, we can treat that the same way as
+ * forward, which macvlan cannot.
+ */
+static int macvtap_receive(struct sk_buff *skb)
+{
+ skb_push(skb, ETH_HLEN);
+ return macvtap_forward(skb->dev, skb);
+}
+
+static int macvtap_newlink(struct net_device *dev,
+ struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct device *classdev;
+ dev_t devt;
+ int err;
+
+ err = macvlan_common_newlink(dev, tb, data,
+ macvtap_receive, macvtap_forward);
+ if (err)
+ goto out;
+
+ devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+
+ classdev = device_create(macvtap_class, &dev->dev, devt,
+ dev, "tap%d", dev->ifindex);
+ if (IS_ERR(classdev)) {
+ err = PTR_ERR(classdev);
+ macvtap_del_queues(dev);
+ }
+
+out:
+ return err;
+}
+
+static void macvtap_dellink(struct net_device *dev)
+{
+ device_destroy(macvtap_class,
+ MKDEV(MAJOR(macvtap_major), dev->ifindex));
+
+ macvtap_del_queues(dev);
+ macvlan_dellink(dev);
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+ .kind = "macvtap",
+ .newlink = macvtap_newlink,
+ .dellink = macvtap_dellink,
+};
+
+
+static void macvtap_sock_write_space(struct sock *sk)
+{
+ if (!sock_writeable(sk) ||
+ !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
+ return;
+
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_sync(sk->sk_sleep);
+}
+
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvtap_queue *q;
+ int err;
+
+ err = -ENODEV;
+ if (!dev)
+ goto out;
+
+ /* check if this is a macvtap device */
+ err = -EINVAL;
+ if (dev->rtnl_link_ops != &macvtap_link_ops)
+ goto out;
+
+ err = -ENOMEM;
+ q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+ &macvtap_proto);
+ if (!q)
+ goto out;
+
+ init_waitqueue_head(&q->sock.wait);
+ q->sock.type = SOCK_RAW;
+ q->sock.state = SS_CONNECTED;
+ sock_init_data(&q->sock, &q->sk);
+ q->sk.sk_allocation = GFP_ATOMIC; /* for now */
+ q->sk.sk_write_space = macvtap_sock_write_space;
+
+ err = macvtap_set_queue(dev, file, q);
+ if (err)
+ sock_put(&q->sk);
+
+out:
+ if (dev)
+ dev_put(dev);
+
+ return err;
+}
+
+static int macvtap_release(struct inode *inode, struct file *file)
+{
+ macvtap_del_queue((struct macvtap_queue **)&file->private_data);
+ return 0;
+}
+
+static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+{
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+ unsigned int mask = POLLERR;
+
+ if (!q)
+ goto out;
+
+ mask = 0;
+ poll_wait(file, &q->sock.wait, wait);
+
+ if (!skb_queue_empty(&q->sk.sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(&q->sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
+ sock_writeable(&q->sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+out:
+ macvtap_file_put_queue();
+ return mask;
+}
+
+/* Get packet from user space buffer */
+static ssize_t macvtap_get_user(struct macvtap_queue *q,
+ const struct iovec *iv, size_t count,
+ int noblock)
+{
+ struct sk_buff *skb;
+ size_t len = count;
+ int err;
+
+ if (unlikely(len < ETH_HLEN))
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+
+ if (!skb) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ return err;
+ }
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, count);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ macvlan_start_xmit(skb, q->vlan->dev);
+
+ return count;
+}
+
+static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t result = -ENOLINK;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ if (!q)
+ goto out;
+
+ result = macvtap_get_user(q, iv, iov_length(iv, count),
+ file->f_flags & O_NONBLOCK);
+out:
+ macvtap_file_put_queue();
+ return result;
+}
+
+/* Put packet to the user space buffer */
+static ssize_t macvtap_put_user(struct macvtap_queue *q,
+ const struct sk_buff *skb,
+ const struct iovec *iv, int len)
+{
+ struct macvlan_dev *vlan = q->vlan;
+ int ret;
+
+ len = min_t(int, skb->len, len);
+
+ ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+
+ macvlan_count_rx(vlan, len, ret == 0, 0);
+
+ return ret ? ret : len;
+}
+
+static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb;
+ ssize_t len, ret = 0;
+
+ if (!q) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ len = iov_length(iv, count);
+ if (len < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ add_wait_queue(q->sk.sk_sleep, &wait);
+ while (len) {
+ current->state = TASK_INTERRUPTIBLE;
+
+ /* Read frames from the queue */
+ skb = skb_dequeue(&q->sk.sk_receive_queue);
+ if (!skb) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ /* Nothing to read, let's sleep */
+ schedule();
+ continue;
+ }
+ ret = macvtap_put_user(q, skb, iv, len);
+ kfree_skb(skb);
+ break;
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(q->sk.sk_sleep, &wait);
+
+out:
+ macvtap_file_put_queue();
+ return ret;
+}
+
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct macvtap_queue *q;
+ void __user *argp = (void __user *)arg;
+ struct ifreq __user *ifr = argp;
+ unsigned int __user *up = argp;
+ unsigned int u;
+ char devname[IFNAMSIZ];
+
+ switch (cmd) {
+ case TUNSETIFF:
+ /* ignore the name, just look at flags */
+ if (get_user(u, &ifr->ifr_flags))
+ return -EFAULT;
+ if (u != (IFF_TAP | IFF_NO_PI))
+ return -EINVAL;
+ return 0;
+
+ case TUNGETIFF:
+ q = macvtap_file_get_queue(file);
+ if (!q)
+ return -ENOLINK;
+ memcpy(devname, q->vlan->dev->name, sizeof(devname));
+ macvtap_file_put_queue();
+
+ if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
+ put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
+ return -EFAULT;
+ return 0;
+
+ case TUNGETFEATURES:
+ if (put_user((IFF_TAP | IFF_NO_PI), up))
+ return -EFAULT;
+ return 0;
+
+ case TUNSETSNDBUF:
+ if (get_user(u, up))
+ return -EFAULT;
+
+ q = macvtap_file_get_queue(file);
+ q->sk.sk_sndbuf = u;
+ macvtap_file_put_queue();
+ return 0;
+
+ case TUNSETOFFLOAD:
+ /* let the user check for future flags */
+ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ /* TODO: add support for these, so far we don't
+ support any offload */
+ if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations macvtap_fops = {
+ .owner = THIS_MODULE,
+ .open = macvtap_open,
+ .release = macvtap_release,
+ .aio_read = macvtap_aio_read,
+ .aio_write = macvtap_aio_write,
+ .poll = macvtap_poll,
+ .llseek = no_llseek,
+ .unlocked_ioctl = macvtap_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = macvtap_compat_ioctl,
+#endif
+};
+
+static int macvtap_init(void)
+{
+ int err;
+
+ err = alloc_chrdev_region(&macvtap_major, 0,
+ MACVTAP_NUM_DEVS, "macvtap");
+ if (err)
+ goto out1;
+
+ cdev_init(&macvtap_cdev, &macvtap_fops);
+ err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
+ if (err)
+ goto out2;
+
+ macvtap_class = class_create(THIS_MODULE, "macvtap");
+ if (IS_ERR(macvtap_class)) {
+ err = PTR_ERR(macvtap_class);
+ goto out3;
+ }
+
+ err = macvlan_link_register(&macvtap_link_ops);
+ if (err)
+ goto out4;
+
+ return 0;
+
+out4:
+ class_unregister(macvtap_class);
+out3:
+ cdev_del(&macvtap_cdev);
+out2:
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+out1:
+ return err;
+}
+module_init(macvtap_init);
+
+static void macvtap_exit(void)
+{
+ rtnl_link_unregister(&macvtap_link_ops);
+ class_unregister(macvtap_class);
+ cdev_del(&macvtap_cdev);
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 9a11544..51f1512 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -34,6 +34,7 @@ struct macvlan_dev {
enum macvlan_mode mode;
int (*receive)(struct sk_buff *skb);
int (*forward)(struct net_device *dev, struct sk_buff *skb);
+ struct macvtap_queue *tap;
};
static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
--
1.7.1

View File

@ -1 +1,8 @@
+ bugfix/ia64/hardcode-arch-script-output.patch
+ bugfix/all/vlan-macvlan-propagate-transmission-state-to-upper-layer.patch
+ features/all/macvlan-add-GRO-bit-to-features-mask.patch
+ features/all/macvlan-allow-multiple-driver-backends.patch
+ features/all/macvtap-driver.patch
+ bugfix/all/macvtap-fix-reference-counting.patch
+ bugfix/all/macvtap-rework-object-lifetime-rules.patch
+ features/all/macvtap-add-GSO-csum-offload-support.patch