Merge branch 'vhost-net-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Michael S. Tsirkin says: -------------------- These are mostly bugfixes. I hope to merge some more patches by 3.5; in particular, the vlan support fixes are waiting for Eric's ack, and a version of the tracepoint patch might be ready in time, but let's merge what's ready so it's testable. This includes a ton of zerocopy fixes by Jason - good stuff, but too intrusive for 3.4, and zerocopy is experimental anyway. virtio has supported delayed interrupts for a while now, so adding support for them to the virtio test tool made sense. -------------------- Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
6e06c0e234
|
@ -505,10 +505,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
|
||||||
if (copy > size) {
|
if (copy > size) {
|
||||||
++from;
|
++from;
|
||||||
--count;
|
--count;
|
||||||
}
|
offset = 0;
|
||||||
|
} else
|
||||||
|
offset += size;
|
||||||
copy -= size;
|
copy -= size;
|
||||||
offset1 += size;
|
offset1 += size;
|
||||||
offset = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (len == offset1)
|
if (len == offset1)
|
||||||
|
@ -518,24 +519,29 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
|
||||||
struct page *page[MAX_SKB_FRAGS];
|
struct page *page[MAX_SKB_FRAGS];
|
||||||
int num_pages;
|
int num_pages;
|
||||||
unsigned long base;
|
unsigned long base;
|
||||||
|
unsigned long truesize;
|
||||||
|
|
||||||
len = from->iov_len - offset1;
|
len = from->iov_len - offset;
|
||||||
if (!len) {
|
if (!len) {
|
||||||
offset1 = 0;
|
offset = 0;
|
||||||
++from;
|
++from;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
base = (unsigned long)from->iov_base + offset1;
|
base = (unsigned long)from->iov_base + offset;
|
||||||
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
|
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
|
||||||
|
if (i + size > MAX_SKB_FRAGS)
|
||||||
|
return -EMSGSIZE;
|
||||||
num_pages = get_user_pages_fast(base, size, 0, &page[i]);
|
num_pages = get_user_pages_fast(base, size, 0, &page[i]);
|
||||||
if ((num_pages != size) ||
|
if (num_pages != size) {
|
||||||
(num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
|
for (i = 0; i < num_pages; i++)
|
||||||
/* put_page is in skb free */
|
put_page(page[i]);
|
||||||
return -EFAULT;
|
return -EFAULT;
|
||||||
|
}
|
||||||
|
truesize = size * PAGE_SIZE;
|
||||||
skb->data_len += len;
|
skb->data_len += len;
|
||||||
skb->len += len;
|
skb->len += len;
|
||||||
skb->truesize += len;
|
skb->truesize += truesize;
|
||||||
atomic_add(len, &skb->sk->sk_wmem_alloc);
|
atomic_add(truesize, &skb->sk->sk_wmem_alloc);
|
||||||
while (len) {
|
while (len) {
|
||||||
int off = base & ~PAGE_MASK;
|
int off = base & ~PAGE_MASK;
|
||||||
int size = min_t(int, len, PAGE_SIZE - off);
|
int size = min_t(int, len, PAGE_SIZE - off);
|
||||||
|
@ -546,7 +552,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
|
||||||
len -= size;
|
len -= size;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
offset1 = 0;
|
offset = 0;
|
||||||
++from;
|
++from;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -646,7 +652,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
|
||||||
int err;
|
int err;
|
||||||
struct virtio_net_hdr vnet_hdr = { 0 };
|
struct virtio_net_hdr vnet_hdr = { 0 };
|
||||||
int vnet_hdr_len = 0;
|
int vnet_hdr_len = 0;
|
||||||
int copylen;
|
int copylen = 0;
|
||||||
bool zerocopy = false;
|
bool zerocopy = false;
|
||||||
|
|
||||||
if (q->flags & IFF_VNET_HDR) {
|
if (q->flags & IFF_VNET_HDR) {
|
||||||
|
@ -675,15 +681,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
|
||||||
if (unlikely(len < ETH_HLEN))
|
if (unlikely(len < ETH_HLEN))
|
||||||
goto err;
|
goto err;
|
||||||
|
|
||||||
|
err = -EMSGSIZE;
|
||||||
|
if (unlikely(count > UIO_MAXIOV))
|
||||||
|
goto err;
|
||||||
|
|
||||||
if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
|
if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
|
||||||
zerocopy = true;
|
zerocopy = true;
|
||||||
|
|
||||||
if (zerocopy) {
|
if (zerocopy) {
|
||||||
|
/* Userspace may produce vectors with count greater than
|
||||||
|
* MAX_SKB_FRAGS, so we need to linearize parts of the skb
|
||||||
|
* to let the rest of data to be fit in the frags.
|
||||||
|
*/
|
||||||
|
if (count > MAX_SKB_FRAGS) {
|
||||||
|
copylen = iov_length(iv, count - MAX_SKB_FRAGS);
|
||||||
|
if (copylen < vnet_hdr_len)
|
||||||
|
copylen = 0;
|
||||||
|
else
|
||||||
|
copylen -= vnet_hdr_len;
|
||||||
|
}
|
||||||
/* There are 256 bytes to be copied in skb, so there is enough
|
/* There are 256 bytes to be copied in skb, so there is enough
|
||||||
* room for skb expand head in case it is used.
|
* room for skb expand head in case it is used.
|
||||||
* The rest buffer is mapped from userspace.
|
* The rest buffer is mapped from userspace.
|
||||||
*/
|
*/
|
||||||
copylen = vnet_hdr.hdr_len;
|
if (copylen < vnet_hdr.hdr_len)
|
||||||
|
copylen = vnet_hdr.hdr_len;
|
||||||
if (!copylen)
|
if (!copylen)
|
||||||
copylen = GOODCOPY_LEN;
|
copylen = GOODCOPY_LEN;
|
||||||
} else
|
} else
|
||||||
|
@ -694,10 +716,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
|
||||||
if (!skb)
|
if (!skb)
|
||||||
goto err;
|
goto err;
|
||||||
|
|
||||||
if (zerocopy) {
|
if (zerocopy)
|
||||||
err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
|
err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
|
||||||
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
|
else
|
||||||
} else
|
|
||||||
err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
|
err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
|
||||||
len);
|
len);
|
||||||
if (err)
|
if (err)
|
||||||
|
@ -716,8 +737,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
|
||||||
rcu_read_lock_bh();
|
rcu_read_lock_bh();
|
||||||
vlan = rcu_dereference_bh(q->vlan);
|
vlan = rcu_dereference_bh(q->vlan);
|
||||||
/* copy skb_ubuf_info for callback when skb has no error */
|
/* copy skb_ubuf_info for callback when skb has no error */
|
||||||
if (zerocopy)
|
if (zerocopy) {
|
||||||
skb_shinfo(skb)->destructor_arg = m->msg_control;
|
skb_shinfo(skb)->destructor_arg = m->msg_control;
|
||||||
|
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
|
||||||
|
}
|
||||||
if (vlan)
|
if (vlan)
|
||||||
macvlan_start_xmit(skb, vlan->dev);
|
macvlan_start_xmit(skb, vlan->dev);
|
||||||
else
|
else
|
||||||
|
|
|
@ -166,7 +166,7 @@ static void handle_tx(struct vhost_net *net)
|
||||||
if (wmem < sock->sk->sk_sndbuf / 2)
|
if (wmem < sock->sk->sk_sndbuf / 2)
|
||||||
tx_poll_stop(net);
|
tx_poll_stop(net);
|
||||||
hdr_size = vq->vhost_hlen;
|
hdr_size = vq->vhost_hlen;
|
||||||
zcopy = vhost_sock_zcopy(sock);
|
zcopy = vq->ubufs;
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
/* Release DMAs done buffers first */
|
/* Release DMAs done buffers first */
|
||||||
|
@ -257,7 +257,8 @@ static void handle_tx(struct vhost_net *net)
|
||||||
UIO_MAXIOV;
|
UIO_MAXIOV;
|
||||||
}
|
}
|
||||||
vhost_discard_vq_desc(vq, 1);
|
vhost_discard_vq_desc(vq, 1);
|
||||||
tx_poll_start(net, sock);
|
if (err == -EAGAIN || err == -ENOBUFS)
|
||||||
|
tx_poll_start(net, sock);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (err != len)
|
if (err != len)
|
||||||
|
@ -265,6 +266,8 @@ static void handle_tx(struct vhost_net *net)
|
||||||
" len %d != %zd\n", err, len);
|
" len %d != %zd\n", err, len);
|
||||||
if (!zcopy)
|
if (!zcopy)
|
||||||
vhost_add_used_and_signal(&net->dev, vq, head, 0);
|
vhost_add_used_and_signal(&net->dev, vq, head, 0);
|
||||||
|
else
|
||||||
|
vhost_zerocopy_signal_used(vq);
|
||||||
total_len += len;
|
total_len += len;
|
||||||
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
|
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
|
||||||
vhost_poll_queue(&vq->poll);
|
vhost_poll_queue(&vq->poll);
|
||||||
|
|
|
@ -1603,6 +1603,7 @@ void vhost_zerocopy_callback(struct ubuf_info *ubuf)
|
||||||
struct vhost_ubuf_ref *ubufs = ubuf->ctx;
|
struct vhost_ubuf_ref *ubufs = ubuf->ctx;
|
||||||
struct vhost_virtqueue *vq = ubufs->vq;
|
struct vhost_virtqueue *vq = ubufs->vq;
|
||||||
|
|
||||||
|
vhost_poll_queue(&vq->poll);
|
||||||
/* set len = 1 to mark this desc buffers done DMA */
|
/* set len = 1 to mark this desc buffers done DMA */
|
||||||
vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
|
vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
|
||||||
kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
|
kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
|
||||||
|
|
|
@ -203,6 +203,7 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
|
||||||
void virtqueue_disable_cb(struct virtqueue *vq);
|
void virtqueue_disable_cb(struct virtqueue *vq);
|
||||||
|
|
||||||
bool virtqueue_enable_cb(struct virtqueue *vq);
|
bool virtqueue_enable_cb(struct virtqueue *vq);
|
||||||
|
bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
|
||||||
|
|
||||||
void *virtqueue_detach_unused_buf(struct virtqueue *vq);
|
void *virtqueue_detach_unused_buf(struct virtqueue *vq);
|
||||||
struct virtqueue *vring_new_virtqueue(unsigned int num,
|
struct virtqueue *vring_new_virtqueue(unsigned int num,
|
||||||
|
|
|
@ -144,7 +144,8 @@ static void wait_for_interrupt(struct vdev_info *dev)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs)
|
static void run_test(struct vdev_info *dev, struct vq_info *vq,
|
||||||
|
bool delayed, int bufs)
|
||||||
{
|
{
|
||||||
struct scatterlist sl;
|
struct scatterlist sl;
|
||||||
long started = 0, completed = 0;
|
long started = 0, completed = 0;
|
||||||
|
@ -183,8 +184,12 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs)
|
||||||
assert(started <= bufs);
|
assert(started <= bufs);
|
||||||
if (completed == bufs)
|
if (completed == bufs)
|
||||||
break;
|
break;
|
||||||
if (virtqueue_enable_cb(vq->vq)) {
|
if (delayed) {
|
||||||
wait_for_interrupt(dev);
|
if (virtqueue_enable_cb_delayed(vq->vq))
|
||||||
|
wait_for_interrupt(dev);
|
||||||
|
} else {
|
||||||
|
if (virtqueue_enable_cb(vq->vq))
|
||||||
|
wait_for_interrupt(dev);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
test = 0;
|
test = 0;
|
||||||
|
@ -215,6 +220,14 @@ const struct option longopts[] = {
|
||||||
.name = "no-indirect",
|
.name = "no-indirect",
|
||||||
.val = 'i',
|
.val = 'i',
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.name = "delayed-interrupt",
|
||||||
|
.val = 'D',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.name = "no-delayed-interrupt",
|
||||||
|
.val = 'd',
|
||||||
|
},
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -224,6 +237,7 @@ static void help()
|
||||||
fprintf(stderr, "Usage: virtio_test [--help]"
|
fprintf(stderr, "Usage: virtio_test [--help]"
|
||||||
" [--no-indirect]"
|
" [--no-indirect]"
|
||||||
" [--no-event-idx]"
|
" [--no-event-idx]"
|
||||||
|
" [--delayed-interrupt]"
|
||||||
"\n");
|
"\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -233,6 +247,7 @@ int main(int argc, char **argv)
|
||||||
unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
|
unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
|
||||||
(1ULL << VIRTIO_RING_F_EVENT_IDX);
|
(1ULL << VIRTIO_RING_F_EVENT_IDX);
|
||||||
int o;
|
int o;
|
||||||
|
bool delayed = false;
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
o = getopt_long(argc, argv, optstring, longopts, NULL);
|
o = getopt_long(argc, argv, optstring, longopts, NULL);
|
||||||
|
@ -251,6 +266,9 @@ int main(int argc, char **argv)
|
||||||
case 'i':
|
case 'i':
|
||||||
features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC);
|
features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC);
|
||||||
break;
|
break;
|
||||||
|
case 'D':
|
||||||
|
delayed = true;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
break;
|
||||||
|
@ -260,6 +278,6 @@ int main(int argc, char **argv)
|
||||||
done:
|
done:
|
||||||
vdev_info_init(&dev, features);
|
vdev_info_init(&dev, features);
|
||||||
vq_info_add(&dev, 256);
|
vq_info_add(&dev, 256);
|
||||||
run_test(&dev, &dev.vqs[0], 0x100000);
|
run_test(&dev, &dev.vqs[0], delayed, 0x100000);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Reference in New Issue