diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0427c6561c8..163559c1698 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -505,10 +505,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, if (copy > size) { ++from; --count; - } + offset = 0; + } else + offset += size; copy -= size; offset1 += size; - offset = 0; } if (len == offset1) @@ -518,24 +519,29 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, struct page *page[MAX_SKB_FRAGS]; int num_pages; unsigned long base; + unsigned long truesize; - len = from->iov_len - offset1; + len = from->iov_len - offset; if (!len) { - offset1 = 0; + offset = 0; ++from; continue; } - base = (unsigned long)from->iov_base + offset1; + base = (unsigned long)from->iov_base + offset; size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if ((num_pages != size) || - (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) - /* put_page is in skb free */ + if (num_pages != size) { + for (i = 0; i < num_pages; i++) + put_page(page[i]); return -EFAULT; + } + truesize = size * PAGE_SIZE; skb->data_len += len; skb->len += len; - skb->truesize += len; - atomic_add(len, &skb->sk->sk_wmem_alloc); + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; int size = min_t(int, len, PAGE_SIZE - off); @@ -546,7 +552,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, len -= size; i++; } - offset1 = 0; + offset = 0; ++from; } return 0; @@ -646,7 +652,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; - int copylen; + int copylen = 0; bool zerocopy = false; if (q->flags & IFF_VNET_HDR) { @@ -675,15 +681,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (unlikely(len < ETH_HLEN)) goto err; + err = -EMSGSIZE; + if (unlikely(count > UIO_MAXIOV)) + goto err; + if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) zerocopy = true; if (zerocopy) { + /* Userspace may produce vectors with count greater than + * MAX_SKB_FRAGS, so we need to linearize parts of the skb + * to let the rest of data to be fit in the frags. + */ + if (count > MAX_SKB_FRAGS) { + copylen = iov_length(iv, count - MAX_SKB_FRAGS); + if (copylen < vnet_hdr_len) + copylen = 0; + else + copylen -= vnet_hdr_len; + } /* There are 256 bytes to be copied in skb, so there is enough * room for skb expand head in case it is used. * The rest buffer is mapped from userspace. */ - copylen = vnet_hdr.hdr_len; + if (copylen < vnet_hdr.hdr_len) + copylen = vnet_hdr.hdr_len; if (!copylen) copylen = GOODCOPY_LEN; } else @@ -694,10 +716,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (!skb) goto err; - if (zerocopy) { + if (zerocopy) err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; - } else + else err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len); if (err) @@ -716,8 +737,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, rcu_read_lock_bh(); vlan = rcu_dereference_bh(q->vlan); /* copy skb_ubuf_info for callback when skb has no error */ - if (zerocopy) + if (zerocopy) { skb_shinfo(skb)->destructor_arg = m->msg_control; + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + } if (vlan) macvlan_start_xmit(skb, vlan->dev); else diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 1f21d2a1e52..853db7a08a2 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -166,7 +166,7 @@ static void handle_tx(struct vhost_net *net) if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); hdr_size = vq->vhost_hlen; - zcopy = vhost_sock_zcopy(sock); + zcopy = vq->ubufs; for (;;) { /* Release DMAs done buffers first */ @@ -257,7 +257,8 @@ static void handle_tx(struct vhost_net *net) UIO_MAXIOV; } vhost_discard_vq_desc(vq, 1); - tx_poll_start(net, sock); + if (err == -EAGAIN || err == -ENOBUFS) + tx_poll_start(net, sock); break; } if (err != len) @@ -265,6 +266,8 @@ static void handle_tx(struct vhost_net *net) " len %d != %zd\n", err, len); if (!zcopy) vhost_add_used_and_signal(&net->dev, vq, head, 0); + else + vhost_zerocopy_signal_used(vq); total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 51e4c1eeec4..94dbd25caa3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1603,6 +1603,7 @@ void vhost_zerocopy_callback(struct ubuf_info *ubuf) struct vhost_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; + vhost_poll_queue(&vq->poll); /* set len = 1 to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; kref_put(&ubufs->kref, vhost_zerocopy_done_signal); diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 7579f19e61e..81847dd08bd 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -203,6 +203,7 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); +bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); struct virtqueue *vring_new_virtqueue(unsigned int num, diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 6bf95f99536..e626fa553c5 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -144,7 +144,8 @@ static void wait_for_interrupt(struct vdev_info *dev) } } -static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) +static void run_test(struct vdev_info *dev, struct vq_info *vq, + bool delayed, int bufs) { struct scatterlist sl; long started = 0, completed = 0; @@ -183,8 +184,12 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) assert(started <= bufs); if (completed == bufs) break; - if (virtqueue_enable_cb(vq->vq)) { - wait_for_interrupt(dev); + if (delayed) { + if (virtqueue_enable_cb_delayed(vq->vq)) + wait_for_interrupt(dev); + } else { + if (virtqueue_enable_cb(vq->vq)) + wait_for_interrupt(dev); } } test = 0; @@ -215,6 +220,14 @@ const struct option longopts[] = { .name = "no-indirect", .val = 'i', }, + { + .name = "delayed-interrupt", + .val = 'D', + }, + { + .name = "no-delayed-interrupt", + .val = 'd', + }, { } }; @@ -224,6 +237,7 @@ static void help() fprintf(stderr, "Usage: virtio_test [--help]" " [--no-indirect]" " [--no-event-idx]" + " [--delayed-interrupt]" "\n"); } @@ -233,6 +247,7 @@ int main(int argc, char **argv) unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | (1ULL << VIRTIO_RING_F_EVENT_IDX); int o; + bool delayed = false; for (;;) { o = getopt_long(argc, argv, optstring, longopts, NULL); @@ -251,6 +266,9 @@ int main(int argc, char **argv) case 'i': features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); break; + case 'D': + delayed = true; + break; default: assert(0); break; @@ -260,6 +278,6 @@ int main(int argc, char **argv) done: vdev_info_init(&dev, features); vq_info_add(&dev, 256); - run_test(&dev, &dev.vqs[0], 0x100000); + run_test(&dev, &dev.vqs[0], delayed, 0x100000); return 0; }