From d346a3fae3ff1d99f5d0c819bf86edf9094a26a1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 6 Dec 2013 11:36:17 +0100 Subject: packet: introduce PACKET_QDISC_BYPASS socket option This patch introduces a PACKET_QDISC_BYPASS socket option that allows for using a similar xmit() function as in pktgen instead of taking the dev_queue_xmit() path. This can be very useful when PF_PACKET applications are required to be used in a similar scenario as pktgen, but with full, flexible packet payload that needs to be provided, for example. By default, nothing changes in behaviour for normal PF_PACKET TX users, so everything stays as is for applications. New users, however, can now set PACKET_QDISC_BYPASS if needed to i) prevent own packets from reentering packet_rcv() and ii) directly push the frame to the driver. In doing so we can increase pps (here 64 byte packets) for PF_PACKET a bit: # CPUs -- QDISC_BYPASS -- qdisc path -- qdisc path[**] 1 CPU == 1,509,628 pps -- 1,208,708 -- 1,247,436 2 CPUs == 3,198,659 pps -- 2,536,012 -- 1,605,779 3 CPUs == 4,787,992 pps -- 3,788,740 -- 1,735,610 4 CPUs == 6,173,956 pps -- 4,907,799 -- 1,909,114 5 CPUs == 7,495,676 pps -- 5,956,499 -- 2,014,422 6 CPUs == 9,001,496 pps -- 7,145,064 -- 2,155,261 7 CPUs == 10,229,776 pps -- 8,190,596 -- 2,220,619 8 CPUs == 11,040,732 pps -- 9,188,544 -- 2,241,879 9 CPUs == 12,009,076 pps -- 10,275,936 -- 2,068,447 10 CPUs == 11,380,052 pps -- 11,265,337 -- 1,578,689 11 CPUs == 11,672,676 pps -- 11,845,344 -- 1,297,412 [...] 20 CPUs == 11,363,192 pps -- 11,014,933 -- 1,245,081 [**]: qdisc path with packet_rcv(), which is probably how most people use it (hopefully not anymore if not needed) The test was done using a modified trafgen, sending a simple static 64-byte packet, on all CPUs. 
The trick in the fast "qdisc path" case is to avoid reentering packet_rcv() by setting the RAW socket protocol to zero, like: socket(PF_PACKET, SOCK_RAW, 0); Tradeoffs are documented as well in this patch: clearly, if queues are busy, we will drop more packets, tc disciplines are ignored, and these packets are not visible to taps anymore. For a pktgen-like scenario, we argue that this is acceptable. The pointer to the xmit function has been placed in the packet socket structure hole between cached_dev and prot_hook that is hot anyway as we're working on cached_dev in each send path. Done in joint work together with Jesper Dangaard Brouer. Signed-off-by: Daniel Borkmann Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/if_packet.h') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index dbf06667394b..1e24aa701cbd 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -51,6 +51,7 @@ struct sockaddr_ll { #define PACKET_TIMESTAMP 17 #define PACKET_FANOUT 18 #define PACKET_TX_HAS_OFF 19 +#define PACKET_QDISC_BYPASS 20 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 -- cgit v1.2.3 From e4d26f4b080f55e9577b45e6b51a04971eb459e9 Mon Sep 17 00:00:00 2001 From: Atzm Watanabe Date: Tue, 17 Dec 2013 22:53:36 +0900 Subject: packet: fill the gap of TPACKET_ALIGNMENT with zeros struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. Explicitly defining and zeroing this gap makes additional changes easier. Signed-off-by: Atzm Watanabe Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/uapi/linux/if_packet.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/if_packet.h') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 1e24aa701cbd..9185dc9a4485 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -133,7 +133,7 @@ struct tpacket2_hdr { __u32 tp_sec; __u32 tp_nsec; __u16 tp_vlan_tci; - __u16 tp_padding; + __u8 tp_padding[6]; }; struct tpacket_hdr_variant1 { @@ -154,6 +154,7 @@ struct tpacket3_hdr { union { struct tpacket_hdr_variant1 hv1; }; + __u8 tp_padding[12]; }; struct tpacket_bd_ts { -- cgit v1.2.3 From a0cdfcf39362410d5ea983f4daf67b38de129408 Mon Sep 17 00:00:00 2001 From: Atzm Watanabe Date: Tue, 17 Dec 2013 22:53:40 +0900 Subject: packet: deliver VLAN TPID to userspace This enables userspace to get VLAN TPID as well as the VLAN TCI. Signed-off-by: Atzm Watanabe Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux/if_packet.h') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 9185dc9a4485..e9d844c80c11 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -84,17 +84,18 @@ struct tpacket_auxdata { __u16 tp_mac; __u16 tp_net; __u16 tp_vlan_tci; - __u16 tp_padding; + __u16 tp_vlan_tpid; }; /* Rx ring - header status */ -#define TP_STATUS_KERNEL 0 -#define TP_STATUS_USER (1 << 0) -#define TP_STATUS_COPY (1 << 1) -#define TP_STATUS_LOSING (1 << 2) -#define TP_STATUS_CSUMNOTREADY (1 << 3) -#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ -#define TP_STATUS_BLK_TMO (1 << 5) +#define TP_STATUS_KERNEL 0 +#define TP_STATUS_USER (1 << 0) +#define TP_STATUS_COPY (1 << 1) +#define TP_STATUS_LOSING (1 << 2) +#define TP_STATUS_CSUMNOTREADY (1 << 3) +#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata 
has valid tp_vlan_tci */ +#define TP_STATUS_BLK_TMO (1 << 5) +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ /* Tx ring - header status */ #define TP_STATUS_AVAILABLE 0 @@ -133,12 +134,15 @@ struct tpacket2_hdr { __u32 tp_sec; __u32 tp_nsec; __u16 tp_vlan_tci; - __u8 tp_padding[6]; + __u16 tp_vlan_tpid; + __u8 tp_padding[4]; }; struct tpacket_hdr_variant1 { __u32 tp_rxhash; __u32 tp_vlan_tci; + __u16 tp_vlan_tpid; + __u16 tp_padding; }; struct tpacket3_hdr { @@ -154,7 +158,7 @@ struct tpacket3_hdr { union { struct tpacket_hdr_variant1 hv1; }; - __u8 tp_padding[12]; + __u8 tp_padding[8]; }; struct tpacket_bd_ts { -- cgit v1.2.3 From 604d13c97f0d863e41da3f5835c62e3cf899962b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 23 Dec 2013 14:35:56 +0100 Subject: netlink: specify netlink packet direction for nlmon In order to facilitate development for netlink protocol dissector, fill the unused field skb->pkt_type of the cloned skb with a hint of the address space of the new owner (receiver) socket in the notion of "to kernel" resp. "to user". At the time we invoke __netlink_deliver_tap_skb(), we already have set the new skb owner via netlink_skb_set_owner_r(), so we can use that for netlink_is_kernel() probing. In normal PF_PACKET network traffic, this field denotes if the packet is destined for us (PACKET_HOST), if it's broadcast (PACKET_BROADCAST), etc. As we only have 3 bit reserved, we can use the value (= 6) of PACKET_FASTROUTE as it's _not used_ anywhere in the whole kernel and not supported anywhere, and packets of such type were never exposed to user space, so there are no overlapping users of such kind. Thus, as wished, that seems the only way to make both PACKET_* values non-overlapping and therefore device agnostic. By using those two flags for netlink skbs on nlmon devices, they can be made available and picked up via sll_pkttype (previously unused in netlink context) in struct sockaddr_ll. 
We now have these two directions: - PACKET_USER (= 6) -> to user space - PACKET_KERNEL (= 7) -> to kernel space Partial `ip a` example strace for sa_family=AF_NETLINK with detected nl msg direction: syscall: direction: sendto(3, ...) = 40 /* to kernel */ recvmsg(3, ...) = 3404 /* to user */ recvmsg(3, ...) = 1120 /* to user */ recvmsg(3, ...) = 20 /* to user */ sendto(3, ...) = 40 /* to kernel */ recvmsg(3, ...) = 168 /* to user */ recvmsg(3, ...) = 144 /* to user */ recvmsg(3, ...) = 20 /* to user */ Signed-off-by: Daniel Borkmann Signed-off-by: Jakub Zawadzki Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/if_packet.h') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index e9d844c80c11..1988a02842cc 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -26,8 +26,10 @@ struct sockaddr_ll { #define PACKET_MULTICAST 2 /* To group */ #define PACKET_OTHERHOST 3 /* To someone else */ #define PACKET_OUTGOING 4 /* Outgoing of any type */ -/* These ones are invisible by user level */ #define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */ +#define PACKET_USER 6 /* To user space */ +#define PACKET_KERNEL 7 /* To kernel space */ +/* Unused, PACKET_FASTROUTE and PACKET_LOOPBACK are invisible to user space */ #define PACKET_FASTROUTE 6 /* Fastrouted frame */ /* Packet socket options */ -- cgit v1.2.3 From 2d36097d26b5991d71a2cf4a20c1a158f0f1bfcd Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 22 Jan 2014 16:01:44 -0500 Subject: af_packet: Add Queue mapping mode to af_packet fanout operation This patch adds a queue mapping mode to the fanout operation of af_packet sockets. This allows user space af_packet users to better filter on flows ingressing and egressing via a specific hardware queue, and avoids the potential packet reordering that can occur when FANOUT_CPU is being used and irq affinity varies. 
Tested successfully by myself. Applies to net-next. Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/if_packet.h') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 1988a02842cc..bac27fa05f5b 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -60,6 +60,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_CPU 2 #define PACKET_FANOUT_ROLLOVER 3 #define PACKET_FANOUT_RND 4 +#define PACKET_FANOUT_QM 5 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 -- cgit v1.2.3