From 6e43fc04a6bc357d260583b8440882f28069207f Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 11 Sep 2013 14:52:48 +0100 Subject: xen-netback: count number required slots for an skb more carefully When a VM is providing an iSCSI target and the LUN is used by the backend domain, the generated skbs for direct I/O writes to the disk have large, multi-page skb->data but no frags. With some lengths and starting offsets, xen_netbk_count_skb_slots() would be one short because the simple calculation of DIV_ROUND_UP(skb_headlen(), PAGE_SIZE) was not accounting for the decisions made by start_new_rx_buffer() which does not guarantee responses are fully packed. For example, a skb with length < 2 pages but which spans 3 pages would be counted as requiring 2 slots but would actually use 3 slots. skb->data: | 1111|222222222222|3333 | Fully packed, this would need 2 slots: |111122222222|22223333 | But because the 2nd page wholy fits into a slot it is not split across slots and goes into a slot of its own: |1111 |222222222222|3333 | Miscounting the number of slots means netback may push more responses than the number of available requests. This will cause the frontend to get very confused and report "Too many frags/slots". The frontend never recovers and will eventually BUG. Fix this by counting the number of required slots more carefully. In xen_netbk_count_skb_slots(), more closely follow the algorithm used by xen_netbk_gop_skb() by introducing xen_netbk_count_frag_slots() which is the dry-run equivalent of netbk_gop_frag_copy(). Signed-off-by: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 94 ++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 30 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 956130c70036..f3e591c611de 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -212,6 +212,49 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) return false; } +struct xenvif_count_slot_state { + unsigned long copy_off; + bool head; +}; + +unsigned int xenvif_count_frag_slots(struct xenvif *vif, + unsigned long offset, unsigned long size, + struct xenvif_count_slot_state *state) +{ + unsigned count = 0; + + offset &= ~PAGE_MASK; + + while (size > 0) { + unsigned long bytes; + + bytes = PAGE_SIZE - offset; + + if (bytes > size) + bytes = size; + + if (start_new_rx_buffer(state->copy_off, bytes, state->head)) { + count++; + state->copy_off = 0; + } + + if (state->copy_off + bytes > MAX_BUFFER_OFFSET) + bytes = MAX_BUFFER_OFFSET - state->copy_off; + + state->copy_off += bytes; + + offset += bytes; + size -= bytes; + + if (offset == PAGE_SIZE) + offset = 0; + + state->head = false; + } + + return count; +} + /* * Figure out how many ring slots we're going to need to send @skb to * the guest. This function is essentially a dry run of @@ -219,48 +262,39 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) */ unsigned int xenvif_count_skb_slots(struct xenvif *vif, struct sk_buff *skb) { + struct xenvif_count_slot_state state; unsigned int count; - int i, copy_off; + unsigned char *data; + unsigned i; - count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE); + state.head = true; + state.copy_off = 0; - copy_off = skb_headlen(skb) % PAGE_SIZE; + /* Slot for the first (partial) page of data. */ + count = 1; + /* Need a slot for the GSO prefix for GSO extra data? */ if (skb_shinfo(skb)->gso_size) count++; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); - unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; - unsigned long bytes; - - offset &= ~PAGE_MASK; - - while (size > 0) { - BUG_ON(offset >= PAGE_SIZE); - BUG_ON(copy_off > MAX_BUFFER_OFFSET); - - bytes = PAGE_SIZE - offset; - - if (bytes > size) - bytes = size; + data = skb->data; + while (data < skb_tail_pointer(skb)) { + unsigned long offset = offset_in_page(data); + unsigned long size = PAGE_SIZE - offset; - if (start_new_rx_buffer(copy_off, bytes, 0)) { - count++; - copy_off = 0; - } + if (data + size > skb_tail_pointer(skb)) + size = skb_tail_pointer(skb) - data; - if (copy_off + bytes > MAX_BUFFER_OFFSET) - bytes = MAX_BUFFER_OFFSET - copy_off; + count += xenvif_count_frag_slots(vif, offset, size, &state); - copy_off += bytes; + data += size; + } - offset += bytes; - size -= bytes; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; - if (offset == PAGE_SIZE) - offset = 0; - } + count += xenvif_count_frag_slots(vif, offset, size, &state); } return count; } -- cgit v1.2.3 From 4f0581d25827d5e864bcf07b05d73d0d12a20a5c Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Sun, 22 Sep 2013 19:03:44 +0100 Subject: xen-netback: improve ring effeciency for guest RX There was a bug that netback routines netbk/xenvif_skb_count_slots and netbk/xenvif_gop_frag_copy disagreed with each other, which caused netback to push wrong number of responses to netfront, which caused netfront to eventually crash. The bug was fixed in 6e43fc04a ("xen-netback: count number required slots for an skb more carefully"). Commit 6e43fc04a focused on backport-ability. The drawback with the existing packing scheme is that the ring is not used effeciently, as stated in 6e43fc04a. skb->data like: | 1111|222222222222|3333 | is arranged as: |1111 |222222222222|3333 | If we can do this: |111122222222|22223333 | That would save one ring slot, which improves ring effeciency. This patch effectively reverts 6e43fc04a. That patch made count_slots agree with gop_frag_copy, while this patch goes the other way around -- make gop_frag_copy agree with count_slots. The end result is that they still agree with each other, and the ring is now arranged like: |111122222222|22223333 | The patch that improves packing was first posted by Xi Xong and Matt Wilson. I only rebase it on top of net-next and rewrite commit message, so I retain all their SoBs. For more infomation about the original bug please refer to email listed below and commit message of 6e43fc04a. Original patch: http://lists.xen.org/archives/html/xen-devel/2013-07/msg00760.html Signed-off-by: Xi Xiong Reviewed-by: Matt Wilson [ msw: minor code cleanups, rewrote commit message, adjusted code to count RX slots instead of meta structures ] Signed-off-by: Matt Wilson Cc: Annie Li Cc: Wei Liu Cc: Ian Campbell [ liuw: rebased on top of net-next tree, rewrote commit message, coding style cleanup. ] Signed-off-by: Wei Liu Cc: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 144 ++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 83 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f3e591c611de..d0b0feb035fb 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -47,6 +47,14 @@ #include #include +/* SKB control block overlay is used to store useful information when + * doing guest RX. + */ +struct skb_cb_overlay { + int meta_slots_used; + int peek_slots_count; +}; + /* Provide an option to disable split event channels at load time as * event channels are limited resource. Split event channels are * enabled by default. @@ -212,49 +220,6 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) return false; } -struct xenvif_count_slot_state { - unsigned long copy_off; - bool head; -}; - -unsigned int xenvif_count_frag_slots(struct xenvif *vif, - unsigned long offset, unsigned long size, - struct xenvif_count_slot_state *state) -{ - unsigned count = 0; - - offset &= ~PAGE_MASK; - - while (size > 0) { - unsigned long bytes; - - bytes = PAGE_SIZE - offset; - - if (bytes > size) - bytes = size; - - if (start_new_rx_buffer(state->copy_off, bytes, state->head)) { - count++; - state->copy_off = 0; - } - - if (state->copy_off + bytes > MAX_BUFFER_OFFSET) - bytes = MAX_BUFFER_OFFSET - state->copy_off; - - state->copy_off += bytes; - - offset += bytes; - size -= bytes; - - if (offset == PAGE_SIZE) - offset = 0; - - state->head = false; - } - - return count; -} - /* * Figure out how many ring slots we're going to need to send @skb to * the guest. This function is essentially a dry run of @@ -262,40 +227,53 @@ unsigned int xenvif_count_frag_slots(struct xenvif *vif, */ unsigned int xenvif_count_skb_slots(struct xenvif *vif, struct sk_buff *skb) { - struct xenvif_count_slot_state state; unsigned int count; - unsigned char *data; - unsigned i; + int i, copy_off; + struct skb_cb_overlay *sco; - state.head = true; - state.copy_off = 0; + count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE); - /* Slot for the first (partial) page of data. */ - count = 1; + copy_off = skb_headlen(skb) % PAGE_SIZE; - /* Need a slot for the GSO prefix for GSO extra data? */ if (skb_shinfo(skb)->gso_size) count++; - data = skb->data; - while (data < skb_tail_pointer(skb)) { - unsigned long offset = offset_in_page(data); - unsigned long size = PAGE_SIZE - offset; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; + unsigned long bytes; - if (data + size > skb_tail_pointer(skb)) - size = skb_tail_pointer(skb) - data; + offset &= ~PAGE_MASK; - count += xenvif_count_frag_slots(vif, offset, size, &state); + while (size > 0) { + BUG_ON(offset >= PAGE_SIZE); + BUG_ON(copy_off > MAX_BUFFER_OFFSET); - data += size; - } + bytes = PAGE_SIZE - offset; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); - unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; + if (bytes > size) + bytes = size; + + if (start_new_rx_buffer(copy_off, bytes, 0)) { + count++; + copy_off = 0; + } - count += xenvif_count_frag_slots(vif, offset, size, &state); + if (copy_off + bytes > MAX_BUFFER_OFFSET) + bytes = MAX_BUFFER_OFFSET - copy_off; + + copy_off += bytes; + + offset += bytes; + size -= bytes; + + if (offset == PAGE_SIZE) + offset = 0; + } } + + sco = (struct skb_cb_overlay *)skb->cb; + sco->peek_slots_count = count; return count; } @@ -327,14 +305,11 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, return meta; } -/* - * Set up the grant operations for this fragment. If it's a flipping - * interface, we also set up the unmap request from here. - */ +/* Set up the grant operations for this fragment. */ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, struct netrx_pending_operations *npo, struct page *page, unsigned long size, - unsigned long offset, int *head) + unsigned long offset, int head, int *first) { struct gnttab_copy *copy_gop; struct xenvif_rx_meta *meta; @@ -358,12 +333,12 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, if (bytes > size) bytes = size; - if (start_new_rx_buffer(npo->copy_off, bytes, *head)) { + if (start_new_rx_buffer(npo->copy_off, bytes, head)) { /* * Netfront requires there to be some data in the head * buffer. */ - BUG_ON(*head); + BUG_ON(*first); meta = get_next_rx_buffer(vif, npo); } @@ -397,10 +372,10 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, } /* Leave a gap for the GSO descriptor. */ - if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix) + if (*first && skb_shinfo(skb)->gso_size && !vif->gso_prefix) vif->rx.req_cons++; - *head = 0; /* There must be something in this buffer now. */ + *first = 0; /* There must be something in this buffer now. */ } } @@ -426,7 +401,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, struct xen_netif_rx_request *req; struct xenvif_rx_meta *meta; unsigned char *data; - int head = 1; + int first = 1; int old_meta_prod; old_meta_prod = npo->meta_prod; @@ -462,7 +437,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, len = skb_tail_pointer(skb) - data; xenvif_gop_frag_copy(vif, skb, npo, - virt_to_page(data), len, offset, &head); + virt_to_page(data), len, offset, 1, &first); data += len; } @@ -471,7 +446,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, skb_frag_page(&skb_shinfo(skb)->frags[i]), skb_frag_size(&skb_shinfo(skb)->frags[i]), skb_shinfo(skb)->frags[i].page_offset, - &head); + 0, &first); } return npo->meta_prod - old_meta_prod; @@ -529,10 +504,6 @@ static void xenvif_add_frag_responses(struct xenvif *vif, int status, } } -struct skb_cb_overlay { - int meta_slots_used; -}; - static void xenvif_kick_thread(struct xenvif *vif) { wake_up(&vif->wq); @@ -563,19 +534,26 @@ void xenvif_rx_action(struct xenvif *vif) count = 0; while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) { + RING_IDX old_rx_req_cons; + vif = netdev_priv(skb->dev); nr_frags = skb_shinfo(skb)->nr_frags; + old_rx_req_cons = vif->rx.req_cons; sco = (struct skb_cb_overlay *)skb->cb; sco->meta_slots_used = xenvif_gop_skb(skb, &npo); - count += nr_frags + 1; + count += vif->rx.req_cons - old_rx_req_cons; __skb_queue_tail(&rxq, skb); + skb = skb_peek(&vif->rx_queue); + if (skb == NULL) + break; + sco = (struct skb_cb_overlay *)skb->cb; + /* Filled the batch queue? */ - /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ - if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE) + if (count + sco->peek_slots_count >= XEN_NETIF_RX_RING_SIZE) break; } -- cgit v1.2.3 From 33bc801dddc14f0f96b79e453ec51cecfe5ed612 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 8 Oct 2013 10:54:21 +0100 Subject: Revert "xen-netback: improve ring effeciency for guest RX" This reverts commit 4f0581d25827d5e864bcf07b05d73d0d12a20a5c. The named changeset is causing problem. Let's aim to make this part less fragile before trying to improve things. Signed-off-by: Wei Liu Cc: Ian Campbell Cc: Annie Li Cc: Matt Wilson Cc: Xi Xiong Cc: David Vrabel Cc: Paul Durrant Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 144 ++++++++++++++++++++++---------------- 1 file changed, 83 insertions(+), 61 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index d0b0feb035fb..f3e591c611de 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -47,14 +47,6 @@ #include #include -/* SKB control block overlay is used to store useful information when - * doing guest RX. - */ -struct skb_cb_overlay { - int meta_slots_used; - int peek_slots_count; -}; - /* Provide an option to disable split event channels at load time as * event channels are limited resource. Split event channels are * enabled by default. @@ -220,6 +212,49 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) return false; } +struct xenvif_count_slot_state { + unsigned long copy_off; + bool head; +}; + +unsigned int xenvif_count_frag_slots(struct xenvif *vif, + unsigned long offset, unsigned long size, + struct xenvif_count_slot_state *state) +{ + unsigned count = 0; + + offset &= ~PAGE_MASK; + + while (size > 0) { + unsigned long bytes; + + bytes = PAGE_SIZE - offset; + + if (bytes > size) + bytes = size; + + if (start_new_rx_buffer(state->copy_off, bytes, state->head)) { + count++; + state->copy_off = 0; + } + + if (state->copy_off + bytes > MAX_BUFFER_OFFSET) + bytes = MAX_BUFFER_OFFSET - state->copy_off; + + state->copy_off += bytes; + + offset += bytes; + size -= bytes; + + if (offset == PAGE_SIZE) + offset = 0; + + state->head = false; + } + + return count; +} + /* * Figure out how many ring slots we're going to need to send @skb to * the guest. This function is essentially a dry run of @@ -227,53 +262,40 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) */ unsigned int xenvif_count_skb_slots(struct xenvif *vif, struct sk_buff *skb) { + struct xenvif_count_slot_state state; unsigned int count; - int i, copy_off; - struct skb_cb_overlay *sco; + unsigned char *data; + unsigned i; - count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE); + state.head = true; + state.copy_off = 0; - copy_off = skb_headlen(skb) % PAGE_SIZE; + /* Slot for the first (partial) page of data. */ + count = 1; + /* Need a slot for the GSO prefix for GSO extra data? */ if (skb_shinfo(skb)->gso_size) count++; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); - unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; - unsigned long bytes; - - offset &= ~PAGE_MASK; - - while (size > 0) { - BUG_ON(offset >= PAGE_SIZE); - BUG_ON(copy_off > MAX_BUFFER_OFFSET); - - bytes = PAGE_SIZE - offset; - - if (bytes > size) - bytes = size; + data = skb->data; + while (data < skb_tail_pointer(skb)) { + unsigned long offset = offset_in_page(data); + unsigned long size = PAGE_SIZE - offset; - if (start_new_rx_buffer(copy_off, bytes, 0)) { - count++; - copy_off = 0; - } + if (data + size > skb_tail_pointer(skb)) + size = skb_tail_pointer(skb) - data; - if (copy_off + bytes > MAX_BUFFER_OFFSET) - bytes = MAX_BUFFER_OFFSET - copy_off; + count += xenvif_count_frag_slots(vif, offset, size, &state); - copy_off += bytes; + data += size; + } - offset += bytes; - size -= bytes; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; - if (offset == PAGE_SIZE) - offset = 0; - } + count += xenvif_count_frag_slots(vif, offset, size, &state); } - - sco = (struct skb_cb_overlay *)skb->cb; - sco->peek_slots_count = count; return count; } @@ -305,11 +327,14 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, return meta; } -/* Set up the grant operations for this fragment. */ +/* + * Set up the grant operations for this fragment. If it's a flipping + * interface, we also set up the unmap request from here. + */ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, struct netrx_pending_operations *npo, struct page *page, unsigned long size, - unsigned long offset, int head, int *first) + unsigned long offset, int *head) { struct gnttab_copy *copy_gop; struct xenvif_rx_meta *meta; @@ -333,12 +358,12 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, if (bytes > size) bytes = size; - if (start_new_rx_buffer(npo->copy_off, bytes, head)) { + if (start_new_rx_buffer(npo->copy_off, bytes, *head)) { /* * Netfront requires there to be some data in the head * buffer. */ - BUG_ON(*first); + BUG_ON(*head); meta = get_next_rx_buffer(vif, npo); } @@ -372,10 +397,10 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, } /* Leave a gap for the GSO descriptor. */ - if (*first && skb_shinfo(skb)->gso_size && !vif->gso_prefix) + if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix) vif->rx.req_cons++; - *first = 0; /* There must be something in this buffer now. */ + *head = 0; /* There must be something in this buffer now. */ } } @@ -401,7 +426,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, struct xen_netif_rx_request *req; struct xenvif_rx_meta *meta; unsigned char *data; - int first = 1; + int head = 1; int old_meta_prod; old_meta_prod = npo->meta_prod; @@ -437,7 +462,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, len = skb_tail_pointer(skb) - data; xenvif_gop_frag_copy(vif, skb, npo, - virt_to_page(data), len, offset, 1, &first); + virt_to_page(data), len, offset, &head); data += len; } @@ -446,7 +471,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, skb_frag_page(&skb_shinfo(skb)->frags[i]), skb_frag_size(&skb_shinfo(skb)->frags[i]), skb_shinfo(skb)->frags[i].page_offset, - 0, &first); + &head); } return npo->meta_prod - old_meta_prod; @@ -504,6 +529,10 @@ static void xenvif_add_frag_responses(struct xenvif *vif, int status, } } +struct skb_cb_overlay { + int meta_slots_used; +}; + static void xenvif_kick_thread(struct xenvif *vif) { wake_up(&vif->wq); @@ -534,26 +563,19 @@ void xenvif_rx_action(struct xenvif *vif) count = 0; while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) { - RING_IDX old_rx_req_cons; - vif = netdev_priv(skb->dev); nr_frags = skb_shinfo(skb)->nr_frags; - old_rx_req_cons = vif->rx.req_cons; sco = (struct skb_cb_overlay *)skb->cb; sco->meta_slots_used = xenvif_gop_skb(skb, &npo); - count += vif->rx.req_cons - old_rx_req_cons; + count += nr_frags + 1; __skb_queue_tail(&rxq, skb); - skb = skb_peek(&vif->rx_queue); - if (skb == NULL) - break; - sco = (struct skb_cb_overlay *)skb->cb; - /* Filled the batch queue? */ - if (count + sco->peek_slots_count >= XEN_NETIF_RX_RING_SIZE) + /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ + if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE) break; } -- cgit v1.2.3 From 2eba61d55e5104d0bf08ba4a9cc609613f52b4c9 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:29 +0100 Subject: xen-netback: add support for IPv6 checksum offload from guest For performance of VM to VM traffic on a single host it is better to avoid calculation of TCP/UDP checksum in the sending frontend. To allow this this patch adds the code necessary to set up partial checksum for IPv6 packets and xenstore flag feature-ipv6-csum-offload to advertise that fact to frontends. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Cc: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 235 +++++++++++++++++++++++++++++++------- 1 file changed, 196 insertions(+), 39 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f3e591c611de..4271f8d1da7a 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -109,15 +109,12 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif, return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx)); } -/* - * This is the amount of packet we copy rather than map, so that the - * guest can't fiddle with the contents of the headers while we do - * packet processing on them (netfilter, routing, etc). +/* This is a miniumum size for the linear area to avoid lots of + * calls to __pskb_pull_tail() as we set up checksum offsets. The + * value 128 was chosen as it covers all IPv4 and most likely + * IPv6 headers. */ -#define PKT_PROT_LEN (ETH_HLEN + \ - VLAN_HLEN + \ - sizeof(struct iphdr) + MAX_IPOPTLEN + \ - sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE) +#define PKT_PROT_LEN 128 static u16 frag_get_pending_idx(skb_frag_t *frag) { @@ -1118,61 +1115,74 @@ static int xenvif_set_skb_gso(struct xenvif *vif, return 0; } -static int checksum_setup(struct xenvif *vif, struct sk_buff *skb) +static inline void maybe_pull_tail(struct sk_buff *skb, unsigned int len) +{ + if (skb_is_nonlinear(skb) && skb_headlen(skb) < len) { + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + int target = min_t(int, skb->len, MAX_TCP_HEADER); + __pskb_pull_tail(skb, target - skb_headlen(skb)); + } +} + +static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, + int recalculate_partial_csum) { - struct iphdr *iph; + struct iphdr *iph = (void *)skb->data; + unsigned int header_size; + unsigned int off; int err = -EPROTO; - int recalculate_partial_csum = 0; - /* - * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy - * peers can fail to set NETRXF_csum_blank when sending a GSO - * frame. In this case force the SKB to CHECKSUM_PARTIAL and - * recalculate the partial checksum. - */ - if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) { - vif->rx_gso_checksum_fixup++; - skb->ip_summed = CHECKSUM_PARTIAL; - recalculate_partial_csum = 1; - } + off = sizeof(struct iphdr); - /* A non-CHECKSUM_PARTIAL SKB does not require setup. */ - if (skb->ip_summed != CHECKSUM_PARTIAL) - return 0; + header_size = skb->network_header + off + MAX_IPOPTLEN; + maybe_pull_tail(skb, header_size); - if (skb->protocol != htons(ETH_P_IP)) - goto out; + off = iph->ihl * 4; - iph = (void *)skb->data; switch (iph->protocol) { case IPPROTO_TCP: - if (!skb_partial_csum_set(skb, 4 * iph->ihl, + if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; if (recalculate_partial_csum) { struct tcphdr *tcph = tcp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct tcphdr); + maybe_pull_tail(skb, header_size); + tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - iph->ihl*4, + skb->len - off, IPPROTO_TCP, 0); } break; case IPPROTO_UDP: - if (!skb_partial_csum_set(skb, 4 * iph->ihl, + if (!skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) goto out; if (recalculate_partial_csum) { struct udphdr *udph = udp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct udphdr); + maybe_pull_tail(skb, header_size); + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - iph->ihl*4, + skb->len - off, IPPROTO_UDP, 0); } break; default: if (net_ratelimit()) netdev_err(vif->dev, - "Attempting to checksum a non-TCP/UDP packet, dropping a protocol %d packet\n", + "Attempting to checksum a non-TCP/UDP packet, " + "dropping a protocol %d packet\n", iph->protocol); goto out; } @@ -1183,6 +1193,158 @@ out: return err; } +static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, + int recalculate_partial_csum) +{ + int err = -EPROTO; + struct ipv6hdr *ipv6h = (void *)skb->data; + u8 nexthdr; + unsigned int header_size; + unsigned int off; + bool fragment; + bool done; + + done = false; + + off = sizeof(struct ipv6hdr); + + header_size = skb->network_header + off; + maybe_pull_tail(skb, header_size); + + nexthdr = ipv6h->nexthdr; + + while ((off <= sizeof(struct ipv6hdr) + ntohs(ipv6h->payload_len)) && + !done) { + switch (nexthdr) { + case IPPROTO_DSTOPTS: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: { + struct ipv6_opt_hdr *hp = (void *)(skb->data + off); + + header_size = skb->network_header + + off + + sizeof(struct ipv6_opt_hdr); + maybe_pull_tail(skb, header_size); + + nexthdr = hp->nexthdr; + off += ipv6_optlen(hp); + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr *hp = (void *)(skb->data + off); + + header_size = skb->network_header + + off + + sizeof(struct ip_auth_hdr); + maybe_pull_tail(skb, header_size); + + nexthdr = hp->nexthdr; + off += (hp->hdrlen+2)<<2; + break; + } + case IPPROTO_FRAGMENT: + fragment = true; + /* fall through */ + default: + done = true; + break; + } + } + + if (!done) { + if (net_ratelimit()) + netdev_err(vif->dev, "Failed to parse packet header\n"); + goto out; + } + + if (fragment) { + if (net_ratelimit()) + netdev_err(vif->dev, "Packet is a fragment!\n"); + goto out; + } + + switch (nexthdr) { + case IPPROTO_TCP: + if (!skb_partial_csum_set(skb, off, + offsetof(struct tcphdr, check))) + goto out; + + if (recalculate_partial_csum) { + struct tcphdr *tcph = tcp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct tcphdr); + maybe_pull_tail(skb, header_size); + + tcph->check = ~csum_ipv6_magic(&ipv6h->saddr, + &ipv6h->daddr, + skb->len - off, + IPPROTO_TCP, 0); + } + break; + case IPPROTO_UDP: + if (!skb_partial_csum_set(skb, off, + offsetof(struct udphdr, check))) + goto out; + + if (recalculate_partial_csum) { + struct udphdr *udph = udp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct udphdr); + maybe_pull_tail(skb, header_size); + + udph->check = ~csum_ipv6_magic(&ipv6h->saddr, + &ipv6h->daddr, + skb->len - off, + IPPROTO_UDP, 0); + } + break; + default: + if (net_ratelimit()) + netdev_err(vif->dev, + "Attempting to checksum a non-TCP/UDP packet, " + "dropping a protocol %d packet\n", + nexthdr); + goto out; + } + + err = 0; + +out: + return err; +} + +static int checksum_setup(struct xenvif *vif, struct sk_buff *skb) +{ + int err = -EPROTO; + int recalculate_partial_csum = 0; + + /* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy + * peers can fail to set NETRXF_csum_blank when sending a GSO + * frame. In this case force the SKB to CHECKSUM_PARTIAL and + * recalculate the partial checksum. + */ + if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) { + vif->rx_gso_checksum_fixup++; + skb->ip_summed = CHECKSUM_PARTIAL; + recalculate_partial_csum = 1; + } + + /* A non-CHECKSUM_PARTIAL SKB does not require setup. */ + if (skb->ip_summed != CHECKSUM_PARTIAL) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) + err = checksum_setup_ip(vif, skb, recalculate_partial_csum); + else if (skb->protocol == htons(ETH_P_IPV6)) + err = checksum_setup_ipv6(vif, skb, recalculate_partial_csum); + + return err; +} + static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) { unsigned long now = jiffies; @@ -1428,12 +1590,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget) xenvif_fill_frags(vif, skb); - /* - * If the initial fragment was < PKT_PROT_LEN then - * pull through some bytes from the other fragments to - * increase the linear region to PKT_PROT_LEN bytes. - */ - if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) { + if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) { int target = min_t(int, skb->len, PKT_PROT_LEN); __pskb_pull_tail(skb, target - skb_headlen(skb)); } -- cgit v1.2.3 From a94685876859be30446357db6d6c4a9c951305b4 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:31 +0100 Subject: xen-netback: handle IPv6 TCP GSO packets from the guest This patch adds a xenstore feature flag, festure-gso-tcpv6, to advertise that netback can handle IPv6 TCP GSO packets. It creates SKB_GSO_TCPV6 skbs if the frontend passes an extra segment with the new type XEN_NETIF_GSO_TYPE_TCPV6 added to netif.h. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 4271f8d1da7a..0e327d46a139 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1098,15 +1098,20 @@ static int xenvif_set_skb_gso(struct xenvif *vif, return -EINVAL; } - /* Currently only TCPv4 S.O. is supported. */ - if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { + switch (gso->u.gso.type) { + case XEN_NETIF_GSO_TYPE_TCPV4: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + break; + case XEN_NETIF_GSO_TYPE_TCPV6: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + break; + default: netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type); xenvif_fatal_tx_err(vif); return -EINVAL; } skb_shinfo(skb)->gso_size = gso->u.gso.size; - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; -- cgit v1.2.3 From 82cada22a0bbec6a7afb573ef5fb6c512aaa2739 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:32 +0100 Subject: xen-netback: enable IPv6 TCP GSO to the guest This patch adds code to handle SKB_GSO_TCPV6 skbs and construct appropriate extra or prefix segments to pass the large packet to the frontend. New xenstore flags, feature-gso-tcpv6 and feature-gso-tcpv6-prefix, are sampled to determine if the frontend is capable of handling such packets. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Cc: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 48 +++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 0e327d46a139..828fdab4f1a4 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -142,7 +142,7 @@ static int max_required_rx_slots(struct xenvif *vif) int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE); /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ - if (vif->can_sg || vif->gso || vif->gso_prefix) + if (vif->can_sg || vif->gso_mask || vif->gso_prefix_mask) max += MAX_SKB_FRAGS + 1; /* extra_info + frags */ return max; @@ -314,6 +314,7 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; + meta->gso_type = XEN_NETIF_GSO_TYPE_NONE; meta->gso_size = 0; meta->size = 0; meta->id = req->id; @@ -336,6 +337,7 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, struct gnttab_copy *copy_gop; struct xenvif_rx_meta *meta; unsigned long bytes; + int gso_type; /* Data must not cross a page boundary. */ BUG_ON(size + offset > PAGE_SIZE<gso_size && !vif->gso_prefix) + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + gso_type = XEN_NETIF_GSO_TYPE_TCPV4; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) + gso_type = XEN_NETIF_GSO_TYPE_TCPV6; + else + gso_type = XEN_NETIF_GSO_TYPE_NONE; + + if (*head && ((1 << gso_type) & vif->gso_mask)) vif->rx.req_cons++; *head = 0; /* There must be something in this buffer now. */ @@ -425,14 +434,28 @@ static int xenvif_gop_skb(struct sk_buff *skb, unsigned char *data; int head = 1; int old_meta_prod; + int gso_type; + int gso_size; old_meta_prod = npo->meta_prod; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + gso_type = XEN_NETIF_GSO_TYPE_TCPV4; + gso_size = skb_shinfo(skb)->gso_size; + } else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { + gso_type = XEN_NETIF_GSO_TYPE_TCPV6; + gso_size = skb_shinfo(skb)->gso_size; + } else { + gso_type = XEN_NETIF_GSO_TYPE_NONE; + gso_size = 0; + } + /* Set up a GSO prefix descriptor, if necessary */ - if (skb_shinfo(skb)->gso_size && vif->gso_prefix) { + if ((1 << skb_shinfo(skb)->gso_type) & vif->gso_prefix_mask) { req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; - meta->gso_size = skb_shinfo(skb)->gso_size; + meta->gso_type = gso_type; + meta->gso_size = gso_size; meta->size = 0; meta->id = req->id; } @@ -440,10 +463,13 @@ static int xenvif_gop_skb(struct sk_buff *skb, req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; - if (!vif->gso_prefix) - meta->gso_size = skb_shinfo(skb)->gso_size; - else + if ((1 << gso_type) & vif->gso_mask) { + meta->gso_type = gso_type; + meta->gso_size = gso_size; + } else { + meta->gso_type = XEN_NETIF_GSO_TYPE_NONE; meta->gso_size = 0; + } meta->size = 0; meta->id = req->id; @@ -589,7 +615,8 @@ void xenvif_rx_action(struct xenvif *vif) vif = netdev_priv(skb->dev); - if (vif->meta[npo.meta_cons].gso_size && vif->gso_prefix) { + if ((1 << vif->meta[npo.meta_cons].gso_type) & + vif->gso_prefix_mask) { resp = RING_GET_RESPONSE(&vif->rx, vif->rx.rsp_prod_pvt++); @@ -626,7 +653,8 @@ void xenvif_rx_action(struct xenvif *vif) vif->meta[npo.meta_cons].size, flags); - if (vif->meta[npo.meta_cons].gso_size && !vif->gso_prefix) { + if ((1 << vif->meta[npo.meta_cons].gso_type) & + vif->gso_mask) { struct xen_netif_extra_info *gso = (struct xen_netif_extra_info *) RING_GET_RESPONSE(&vif->rx, @@ -634,8 +662,8 @@ void xenvif_rx_action(struct xenvif *vif) resp->flags |= XEN_NETRXF_extra_info; + gso->u.gso.type = vif->meta[npo.meta_cons].gso_type; gso->u.gso.size = vif->meta[npo.meta_cons].gso_size; - gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; gso->u.gso.pad = 0; gso->u.gso.features = 0; -- cgit v1.2.3 From 059dfa6a93b779516321e5112db9d7621b1367ba Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 28 Oct 2013 12:07:57 +0000 Subject: xen-netback: use jiffies_64 value to calculate credit timeout time_after_eq() only works if the delta is < MAX_ULONG/2. For a 32bit Dom0, if netfront sends packets at a very low rate, the time between subsequent calls to tx_credit_exceeded() may exceed MAX_ULONG/2 and the test for timer_after_eq() will be incorrect. Credit will not be replenished and the guest may become unable to send packets (e.g., if prior to the long gap, all credit was exhausted). Use jiffies_64 variant to mitigate this problem for 32bit Dom0. Suggested-by: Jan Beulich Signed-off-by: Wei Liu Reviewed-by: David Vrabel Cc: Ian Campbell Cc: Jason Luan Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f3e591c611de..900da4b243ad 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1185,9 +1185,8 @@ out: static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) { - unsigned long now = jiffies; - unsigned long next_credit = - vif->credit_timeout.expires + + u64 now = get_jiffies_64(); + u64 next_credit = vif->credit_window_start + msecs_to_jiffies(vif->credit_usec / 1000); /* Timer could already be pending in rare cases. */ @@ -1195,8 +1194,8 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) return true; /* Passed the point where we can replenish credit? */ - if (time_after_eq(now, next_credit)) { - vif->credit_timeout.expires = now; + if (time_after_eq64(now, next_credit)) { + vif->credit_window_start = now; tx_add_credit(vif); } @@ -1208,6 +1207,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) tx_credit_callback; mod_timer(&vif->credit_timeout, next_credit); + vif->credit_window_start = next_credit; return true; } -- cgit v1.2.3 From ae5e8127b712313ec1b99356019ce9226fea8b88 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 25 Nov 2013 16:52:34 +0000 Subject: xen-netback: include definition of csum_ipv6_magic We are now using csum_ipv6_magic, include the appropriate header. Avoids the following error: drivers/net/xen-netback/netback.c:1313:4: error: implicit declaration of function 'csum_ipv6_magic' [-Werror=implicit-function-declaration] tcph->check = ~csum_ipv6_magic(&ipv6h->saddr, Signed-off-by: Andy Whitcroft Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 919b6509455c..64f0e0d18b81 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -39,6 +39,7 @@ #include #include +#include #include #include -- cgit v1.2.3 From 1431fb31ecbaba1b5718006128f0f2ed0b94e1c3 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Tue, 3 Dec 2013 17:39:29 +0000 Subject: xen-netback: fix fragment detection in checksum setup The code to detect fragments in checksum_setup() was missing for IPv4 and too eager for IPv6. (It transpires that Windows seems to send IPv6 packets with a fragment header even if they are not a fragment - i.e. offset is zero, and M bit is not set). This patch also incorporates a fix to callers of maybe_pull_tail() where skb->network_header was being erroneously added to the length argument. Signed-off-by: Paul Durrant Signed-off-by: Zoltan Kiss Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel cc: David Miller Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 236 ++++++++++++++++++++++---------------- 1 file changed, 137 insertions(+), 99 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 64f0e0d18b81..acf13920e6d1 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1149,49 +1149,72 @@ static int xenvif_set_skb_gso(struct xenvif *vif, return 0; } -static inline void maybe_pull_tail(struct sk_buff *skb, unsigned int len) +static inline int maybe_pull_tail(struct sk_buff *skb, unsigned int len, + unsigned int max) { - if (skb_is_nonlinear(skb) && skb_headlen(skb) < len) { - /* If we need to pullup then pullup to the max, so we - * won't need to do it again. - */ - int target = min_t(int, skb->len, MAX_TCP_HEADER); - __pskb_pull_tail(skb, target - skb_headlen(skb)); - } + if (skb_headlen(skb) >= len) + return 0; + + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + if (max > skb->len) + max = skb->len; + + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) + return -ENOMEM; + + if (skb_headlen(skb) < len) + return -EPROTO; + + return 0; } +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, int recalculate_partial_csum) { - struct iphdr *iph = (void *)skb->data; - unsigned int header_size; unsigned int off; - int err = -EPROTO; + bool fragment; + int err; + + fragment = false; + + err = maybe_pull_tail(skb, + sizeof(struct iphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; - off = sizeof(struct iphdr); + if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) + fragment = true; - header_size = skb->network_header + off + MAX_IPOPTLEN; - maybe_pull_tail(skb, header_size); + off = ip_hdrlen(skb); - off = iph->ihl * 4; + err = -EPROTO; - switch (iph->protocol) { + switch (ip_hdr(skb)->protocol) { case IPPROTO_TCP: if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; if (recalculate_partial_csum) { - struct tcphdr *tcph = tcp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct tcphdr); - maybe_pull_tail(skb, header_size); - - tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - off, - IPPROTO_TCP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + tcp_hdr(skb)->check = + ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + IPPROTO_TCP, 0); } break; case IPPROTO_UDP: @@ -1200,24 +1223,20 @@ static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, goto out; if (recalculate_partial_csum) { - struct udphdr *udph = udp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct udphdr); - maybe_pull_tail(skb, header_size); - - udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - off, - IPPROTO_UDP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + udp_hdr(skb)->check = + ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + IPPROTO_UDP, 0); } break; default: - if (net_ratelimit()) - netdev_err(vif->dev, - "Attempting to checksum a non-TCP/UDP packet, " - "dropping a protocol %d packet\n", - iph->protocol); goto out; } @@ -1227,75 +1246,99 @@ out: return err; } +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. + */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ + (type *)(skb_network_header(skb) + (off)) + static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, int recalculate_partial_csum) { - int err = -EPROTO; - struct ipv6hdr *ipv6h = (void *)skb->data; + int err; u8 nexthdr; - unsigned int header_size; unsigned int off; + unsigned int len; bool fragment; bool done; + fragment = false; done = false; off = sizeof(struct ipv6hdr); - header_size = skb->network_header + off; - maybe_pull_tail(skb, header_size); + err = maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; - nexthdr = ipv6h->nexthdr; + nexthdr = ipv6_hdr(skb)->nexthdr; - while ((off <= sizeof(struct ipv6hdr) + ntohs(ipv6h->payload_len)) && - !done) { + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + while (off <= len && !done) { switch (nexthdr) { case IPPROTO_DSTOPTS: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: { - struct ipv6_opt_hdr *hp = (void *)(skb->data + off); + struct ipv6_opt_hdr *hp; - header_size = skb->network_header + - off + - sizeof(struct ipv6_opt_hdr); - maybe_pull_tail(skb, header_size); + err = maybe_pull_tail(skb, + off + + sizeof(struct ipv6_opt_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_optlen(hp); break; } case IPPROTO_AH: { - struct ip_auth_hdr *hp = (void *)(skb->data + off); + struct ip_auth_hdr *hp; + + err = maybe_pull_tail(skb, + off + + sizeof(struct ip_auth_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ip_auth_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_authlen(hp); + break; + } + case IPPROTO_FRAGMENT: { + struct frag_hdr *hp; - header_size = skb->network_header + - off + - sizeof(struct ip_auth_hdr); - maybe_pull_tail(skb, header_size); + err = maybe_pull_tail(skb, + off + + sizeof(struct frag_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct frag_hdr, skb, off); + + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) + fragment = true; nexthdr = hp->nexthdr; - off += (hp->hdrlen+2)<<2; + off += sizeof(struct frag_hdr); break; } - case IPPROTO_FRAGMENT: - fragment = true; - /* fall through */ default: done = true; break; } } - if (!done) { - if (net_ratelimit()) - netdev_err(vif->dev, "Failed to parse packet header\n"); - goto out; - } + err = -EPROTO; - if (fragment) { - if (net_ratelimit()) - netdev_err(vif->dev, "Packet is a fragment!\n"); + if (!done || fragment) goto out; - } switch (nexthdr) { case IPPROTO_TCP: @@ -1304,17 +1347,17 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, goto out; if (recalculate_partial_csum) { - struct tcphdr *tcph = tcp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct tcphdr); - maybe_pull_tail(skb, header_size); - - tcph->check = ~csum_ipv6_magic(&ipv6h->saddr, - &ipv6h->daddr, - skb->len - off, - IPPROTO_TCP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + tcp_hdr(skb)->check = + ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, + IPPROTO_TCP, 0); } break; case IPPROTO_UDP: @@ -1323,25 +1366,20 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, goto out; if (recalculate_partial_csum) { - struct udphdr *udph = udp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct udphdr); - maybe_pull_tail(skb, header_size); - - udph->check = ~csum_ipv6_magic(&ipv6h->saddr, - &ipv6h->daddr, - skb->len - off, - IPPROTO_UDP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + udp_hdr(skb)->check = + ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, + IPPROTO_UDP, 0); } break; default: - if (net_ratelimit()) - netdev_err(vif->dev, - "Attempting to checksum a non-TCP/UDP packet, " - "dropping a protocol %d packet\n", - nexthdr); goto out; } -- cgit v1.2.3 From d52eb0d46f3606b9de9965cebb2beb2202a0dc62 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 11 Dec 2013 16:37:40 +0000 Subject: xen-netback: make sure skb linear area covers checksum field skb_partial_csum_set requires that the linear area of the skb covers the checksum field. The checksum setup code in netback was only doing that pullup in the case when the pseudo header checksum was being recalculated though. This patch makes that pullup unconditional. (I pullup the whole transport header just for simplicity; the requirement is only for the check field but in the case of UDP this is the last field in the header and in the case of TCP it's the last but one). The lack of pullup manifested as failures running Microsoft HCK network tests on a pair of Windows 8 VMs and it has been verified that this patch fixes the problem. Suggested-by: Jan Beulich Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Reviewed-by: Jan Beulich Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 60 ++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 32 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index acf13920e6d1..d158fc40cff2 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1199,42 +1199,40 @@ static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, switch (ip_hdr(skb)->protocol) { case IPPROTO_TCP: + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct tcphdr), - MAX_IP_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) tcp_hdr(skb)->check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - off, IPPROTO_TCP, 0); - } break; case IPPROTO_UDP: + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct udphdr), - MAX_IP_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) udp_hdr(skb)->check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - off, IPPROTO_UDP, 0); - } break; default: goto out; @@ -1342,42 +1340,40 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, switch (nexthdr) { case IPPROTO_TCP: + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct tcphdr), - MAX_IPV6_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - off, IPPROTO_TCP, 0); - } break; case IPPROTO_UDP: + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct udphdr), - MAX_IPV6_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) udp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - off, IPPROTO_UDP, 0); - } break; default: goto out; -- cgit v1.2.3 From 10574059ce0451c6572c85329c772aa15085f8eb Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 11 Dec 2013 10:57:15 +0000 Subject: xen-netback: napi: fix abuse of budget netback seems to be somewhat confused about the napi budget parameter. The parameter is supposed to limit the number of skbs processed in each poll, but netback has this confused with grant operations. This patch fixes that, properly limiting the work done in each poll. Note that this limit makes sure we do not process any more data from the shared ring than we intend to pass back from the poll. This is important to prevent tx_queue potentially growing without bound. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index d158fc40cff2..db79e29b3d09 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1445,14 +1445,15 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) return false; } -static unsigned xenvif_tx_build_gops(struct xenvif *vif) +static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) { struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop; struct sk_buff *skb; int ret; while ((nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX - < MAX_PENDING_REQS)) { + < MAX_PENDING_REQS) && + (skb_queue_len(&vif->tx_queue) < budget)) { struct xen_netif_tx_request txreq; struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX]; struct page *page; @@ -1614,14 +1615,13 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif) } -static int xenvif_tx_submit(struct xenvif *vif, int budget) +static int xenvif_tx_submit(struct xenvif *vif) { struct gnttab_copy *gop = vif->tx_copy_ops; struct sk_buff *skb; int work_done = 0; - while (work_done < budget && - (skb = __skb_dequeue(&vif->tx_queue)) != NULL) { + while ((skb = __skb_dequeue(&vif->tx_queue)) != NULL) { struct xen_netif_tx_request *txp; u16 pending_idx; unsigned data_len; @@ -1696,14 +1696,14 @@ int xenvif_tx_action(struct xenvif *vif, int budget) if (unlikely(!tx_work_todo(vif))) return 0; - nr_gops = xenvif_tx_build_gops(vif); + nr_gops = xenvif_tx_build_gops(vif, budget); if (nr_gops == 0) return 0; gnttab_batch_copy(vif->tx_copy_ops, nr_gops); - work_done = xenvif_tx_submit(vif, nr_gops); + work_done = xenvif_tx_submit(vif); return work_done; } -- cgit v1.2.3 From d9601a36ffdb5c142697bef1228afb5ba4ee4003 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 11 Dec 2013 10:57:16 +0000 Subject: xen-netback: napi: don't prematurely request a tx event This patch changes the RING_FINAL_CHECK_FOR_REQUESTS in xenvif_build_tx_gops to a check for RING_HAS_UNCONSUMED_REQUESTS as the former call has the side effect of advancing the ring event pointer and therefore inviting another interrupt from the frontend before the napi poll has actually finished, thereby defeating the point of napi. The event pointer is updated by RING_FINAL_CHECK_FOR_REQUESTS in xenvif_poll, the napi poll function, if the work done is less than the budget i.e. when actually transitioning back to interrupt mode. Reported-by: Malcolm Crossley Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index db79e29b3d09..33b8aa612d13 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1475,7 +1475,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) continue; } - RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, work_to_do); + work_to_do = RING_HAS_UNCONSUMED_REQUESTS(&vif->tx); if (!work_to_do) break; -- cgit v1.2.3 From a3314f3d40215349ab2427800c1e10676691389f Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 12 Dec 2013 14:20:13 +0000 Subject: xen-netback: fix gso_prefix check There is a mistake in checking the gso_prefix mask when passing large packets to a guest. The wrong shift is applied to the bit - the raw skb gso type is used rather then the translated one. This leads to large packets being handed to the guest without the GSO metadata. This patch fixes the check. The mistake manifested as errors whilst running Microsoft HCK large packet offload tests between a pair of Windows 8 VMs. I have verified this patch fixes those errors. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/xen-netback/netback.c') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 33b8aa612d13..e884ee1fe7ed 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -452,7 +452,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, } /* Set up a GSO prefix descriptor, if necessary */ - if ((1 << skb_shinfo(skb)->gso_type) & vif->gso_prefix_mask) { + if ((1 << gso_type) & vif->gso_prefix_mask) { req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; meta->gso_type = gso_type; -- cgit v1.2.3