diff options
Diffstat (limited to 'net/ipv4')
37 files changed, 587 insertions, 634 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c68196cc56ab..010fbb2d45e9 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -43,11 +43,11 @@ config IP_ADVANCED_ROUTER asymmetric routing (packets from you to a host take a different path than packets from that host to you) or if you operate a non-routing host which has several IP addresses on different interfaces. To turn - rp_filter off use: + rp_filter on use: - echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter + echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter or - echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter + echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter If unsure, say N here. @@ -577,6 +577,7 @@ config TCP_CONG_VENO config TCP_CONG_YEAH tristate "YeAH TCP" depends on EXPERIMENTAL + select TCP_CONG_VEGAS default n ---help--- YeAH-TCP is a sender-side high-speed enabled TCP congestion control diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7110779a0244..e00767e8ebd9 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -877,7 +877,7 @@ static int arp_process(struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (ipv4_devconf.arp_accept) { + if (IPV4_DEVCONF_ALL(ARP_ACCEPT)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) @@ -987,11 +987,11 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev) return 0; } if (dev == NULL) { - ipv4_devconf.proxy_arp = 1; + IPV4_DEVCONF_ALL(PROXY_ARP) = 1; return 0; } if (__in_dev_get_rtnl(dev)) { - __in_dev_get_rtnl(dev)->cnf.proxy_arp = 1; + IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, 1); return 0; } return -ENXIO; @@ -1093,11 +1093,12 @@ static int arp_req_delete(struct arpreq *r, struct net_device * dev) return pneigh_delete(&arp_tbl, &ip, dev); if (mask == 0) { if (dev == NULL) { - ipv4_devconf.proxy_arp = 0; + IPV4_DEVCONF_ALL(PROXY_ARP) = 0; return 0; } if (__in_dev_get_rtnl(dev)) { - __in_dev_get_rtnl(dev)->cnf.proxy_arp = 0; + IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), + PROXY_ARP, 0); return 0; } return -ENXIO; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 86a2b52aad38..ab56a052ce31 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -45,6 +45,7 @@ #include <net/cipso_ipv4.h> #include <asm/atomic.h> #include <asm/bug.h> +#include <asm/unaligned.h> struct cipso_v4_domhsh_entry { char *domain; @@ -1000,7 +1001,7 @@ static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def, return -EFAULT; for (iter = 0; iter < enumcat_len; iter += 2) { - cat = ntohs(*((__be16 *)&enumcat[iter])); + cat = ntohs(get_unaligned((__be16 *)&enumcat[iter])); if (cat <= cat_prev) return -EFAULT; cat_prev = cat; @@ -1068,8 +1069,8 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, for (iter = 0; iter < net_cat_len; iter += 2) { ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, - ntohs(*((__be16 *)&net_cat[iter])), - GFP_ATOMIC); + ntohs(get_unaligned((__be16 *)&net_cat[iter])), + GFP_ATOMIC); if (ret_val != 0) return ret_val; } @@ -1102,9 +1103,10 @@ static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def, return -EFAULT; for (iter = 0; iter < rngcat_len; iter += 4) { - cat_high = ntohs(*((__be16 *)&rngcat[iter])); + cat_high = ntohs(get_unaligned((__be16 *)&rngcat[iter])); if ((iter + 4) <= rngcat_len) - cat_low = ntohs(*((__be16 *)&rngcat[iter + 2])); + cat_low = ntohs( + get_unaligned((__be16 *)&rngcat[iter + 2])); else cat_low = 0; @@ -1201,9 +1203,10 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, u16 cat_high; for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { - cat_high = ntohs(*((__be16 *)&net_cat[net_iter])); + cat_high = ntohs(get_unaligned((__be16 *)&net_cat[net_iter])); if ((net_iter + 4) <= net_cat_len) - cat_low = ntohs(*((__be16 *)&net_cat[net_iter + 2])); + cat_low = ntohs( + get_unaligned((__be16 *)&net_cat[net_iter + 2])); else cat_low = 0; @@ -1565,7 +1568,7 @@ int cipso_v4_validate(unsigned char **option) } rcu_read_lock(); - doi_def = cipso_v4_doi_search(ntohl(*((__be32 *)&opt[2]))); + doi_def = cipso_v4_doi_search(ntohl(get_unaligned((__be32 *)&opt[2]))); if (doi_def == NULL) { err_offset = 2; goto validate_return_locked; @@ -1709,22 +1712,22 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) } /** - * cipso_v4_socket_setattr - Add a CIPSO option to a socket - * @sock: the socket + * cipso_v4_sock_setattr - Add a CIPSO option to a socket + * @sk: the socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires - * exclusive access to @sock->sk, which means it either needs to be in the - * process of being created or locked via lock_sock(sock->sk). Returns zero on - * success and negative values on failure. + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. * */ -int cipso_v4_socket_setattr(const struct socket *sock, - const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr) +int cipso_v4_sock_setattr(struct sock *sk, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; u32 iter; @@ -1732,7 +1735,6 @@ int cipso_v4_socket_setattr(const struct socket *sock, u32 buf_len = 0; u32 opt_len; struct ip_options *opt = NULL; - struct sock *sk; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; @@ -1740,7 +1742,6 @@ int cipso_v4_socket_setattr(const struct socket *sock, * defined yet but it is not a problem as the only users of these * "lite" PF_INET sockets are functions which do an accept() call * afterwards so we will label the socket as part of the accept(). */ - sk = sock->sk; if (sk == NULL) return 0; @@ -1858,7 +1859,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) if (ret_val == 0) return ret_val; - doi = ntohl(*(__be32 *)&cipso_ptr[2]); + doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2])); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (doi_def == NULL) { @@ -1892,29 +1893,6 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) } /** - * cipso_v4_socket_getattr - Get the security attributes from a socket - * @sock: the socket - * @secattr: the security attributes - * - * Description: - * Query @sock to see if there is a CIPSO option attached to the socket and if - * there is return the CIPSO security attributes in @secattr. Returns zero on - * success and negative values on failure. - * - */ -int cipso_v4_socket_getattr(const struct socket *sock, - struct netlbl_lsm_secattr *secattr) -{ - int ret_val; - - lock_sock(sock->sk); - ret_val = cipso_v4_sock_getattr(sock->sk, secattr); - release_sock(sock->sk); - - return ret_val; -} - -/** * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option * @skb: the packet * @secattr: the security attributes @@ -1936,7 +1914,7 @@ int cipso_v4_skbuff_getattr(const struct sk_buff *skb, if (cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr) == 0) return 0; - doi = ntohl(*(__be32 *)&cipso_ptr[2]); + doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2])); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (doi_def == NULL) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index dd02a45d0f67..0301dd468cf4 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -50,8 +50,12 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) RT_CONN_FLAGS(sk), oif, sk->sk_protocol, inet->sport, usin->sin_port, sk, 1); - if (err) + if (err) { + if (err == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return err; + } + if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { ip_rt_put(rt); return -EACCES; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7f95e6e9beeb..abf6352f990f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -64,21 +64,27 @@ #include <net/rtnetlink.h> struct ipv4_devconf ipv4_devconf = { - .accept_redirects = 1, - .send_redirects = 1, - .secure_redirects = 1, - .shared_media = 1, + .data = { + [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, + [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, + [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, + [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, + }, }; static struct ipv4_devconf ipv4_devconf_dflt = { - .accept_redirects = 1, - .send_redirects = 1, - .secure_redirects = 1, - .shared_media = 1, - .accept_source_route = 1, + .data = { + [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, + [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, + [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, + [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, + [NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1, + }, }; -static struct nla_policy ifa_ipv4_policy[IFA_MAX+1] __read_mostly = { +#define IPV4_DEVCONF_DFLT(attr) IPV4_DEVCONF(ipv4_devconf_dflt, attr) + +static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, @@ -141,7 +147,7 @@ void in_dev_finish_destroy(struct in_device *idev) } } -struct in_device *inetdev_init(struct net_device *dev) +static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; @@ -321,12 +327,8 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } } - if (destroy) { + if (destroy) inet_free_ifa(ifa1); - - if (!in_dev->ifa_list) - inetdev_destroy(in_dev); - } } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, @@ -399,12 +401,10 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) ASSERT_RTNL(); if (!in_dev) { - in_dev = inetdev_init(dev); - if (!in_dev) { - inet_free_ifa(ifa); - return -ENOBUFS; - } + inet_free_ifa(ifa); + return -ENOBUFS; } + ipv4_devconf_setall(in_dev); if (ifa->ifa_dev != in_dev) { BUG_TRAP(!ifa->ifa_dev); in_dev_hold(in_dev); @@ -514,13 +514,12 @@ static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh) in_dev = __in_dev_get_rtnl(dev); if (in_dev == NULL) { - in_dev = inetdev_init(dev); - if (in_dev == NULL) { - err = -ENOBUFS; - goto errout; - } + err = -ENOBUFS; + goto errout; } + ipv4_devconf_setall(in_dev); + ifa = inet_alloc_ifa(); if (ifa == NULL) { /* @@ -1057,11 +1056,12 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); - if (!in_dev) - panic("devinet: Failed to create loopback\n"); if (dev == &loopback_dev) { - in_dev->cnf.no_xfrm = 1; - in_dev->cnf.no_policy = 1; + if (!in_dev) + panic("devinet: " + "Failed to create loopback\n"); + IN_DEV_CONF_SET(in_dev, NOXFRM, 1); + IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } goto out; @@ -1237,13 +1237,98 @@ errout: #ifdef CONFIG_SYSCTL +static void devinet_copy_dflt_conf(int i) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + for_each_netdev(dev) { + struct in_device *in_dev; + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (in_dev && !test_bit(i, in_dev->cnf.state)) + in_dev->cnf.data[i] = ipv4_devconf_dflt.data[i]; + rcu_read_unlock(); + } + read_unlock(&dev_base_lock); +} + +static int devinet_conf_proc(ctl_table *ctl, int write, + struct file* filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write) { + struct ipv4_devconf *cnf = ctl->extra1; + int i = (int *)ctl->data - cnf->data; + + set_bit(i, cnf->state); + + if (cnf == &ipv4_devconf_dflt) + devinet_copy_dflt_conf(i); + } + + return ret; +} + +static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct ipv4_devconf *cnf; + int *valp = table->data; + int new; + int i; + + if (!newval || !newlen) + return 0; + + if (newlen != sizeof(int)) + return -EINVAL; + + if (get_user(new, (int __user *)newval)) + return -EFAULT; + + if (new == *valp) + return 0; + + if (oldval && oldlenp) { + size_t len; + + if (get_user(len, oldlenp)) + return -EFAULT; + + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + *valp = new; + + cnf = table->extra1; + i = (int *)table->data - cnf->data; + + set_bit(i, cnf->state); + + if (cnf == &ipv4_devconf_dflt) + devinet_copy_dflt_conf(i); + + return 1; +} + void inet_forward_change(void) { struct net_device *dev; - int on = ipv4_devconf.forwarding; + int on = IPV4_DEVCONF_ALL(FORWARDING); - ipv4_devconf.accept_redirects = !on; - ipv4_devconf_dflt.forwarding = on; + IPV4_DEVCONF_ALL(ACCEPT_REDIRECTS) = !on; + IPV4_DEVCONF_DFLT(FORWARDING) = on; read_lock(&dev_base_lock); for_each_netdev(dev) { @@ -1251,7 +1336,7 @@ void inet_forward_change(void) rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) - in_dev->cnf.forwarding = on; + IN_DEV_CONF_SET(in_dev, FORWARDING, on); rcu_read_unlock(); } read_unlock(&dev_base_lock); @@ -1268,9 +1353,9 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && *valp != val) { - if (valp == &ipv4_devconf.forwarding) + if (valp == &IPV4_DEVCONF_ALL(FORWARDING)) inet_forward_change(); - else if (valp != &ipv4_devconf_dflt.forwarding) + else if (valp != &IPV4_DEVCONF_DFLT(FORWARDING)) rt_cache_flush(0); } @@ -1295,42 +1380,43 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - int *valp = table->data; - int new; + int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp, + newval, newlen); - if (!newval || !newlen) - return 0; + if (ret == 1) + rt_cache_flush(0); - if (newlen != sizeof(int)) - return -EINVAL; + return ret; +} - if (get_user(new, (int __user *)newval)) - return -EFAULT; - if (new == *valp) - return 0; +#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc, sysctl) \ + { \ + .ctl_name = NET_IPV4_CONF_ ## attr, \ + .procname = name, \ + .data = ipv4_devconf.data + \ + NET_IPV4_CONF_ ## attr - 1, \ + .maxlen = sizeof(int), \ + .mode = mval, \ + .proc_handler = proc, \ + .strategy = sysctl, \ + .extra1 = &ipv4_devconf, \ + } - if (oldval && oldlenp) { - size_t len; +#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc, \ + devinet_conf_sysctl) - if (get_user(len, oldlenp)) - return -EFAULT; +#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc, \ + devinet_conf_sysctl) - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, valp, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - *valp = new; - rt_cache_flush(0); - return 1; -} +#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc, sysctl) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc, sysctl) +#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ + DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush, \ + ipv4_doint_and_flush_strategy) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; @@ -1341,178 +1427,34 @@ static struct devinet_sysctl_table { ctl_table devinet_root_dir[2]; } devinet_sysctl = { .devinet_vars = { - { - .ctl_name = NET_IPV4_CONF_FORWARDING, - .procname = "forwarding", - .data = &ipv4_devconf.forwarding, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &devinet_sysctl_forward, - }, - { - .ctl_name = NET_IPV4_CONF_MC_FORWARDING, - .procname = "mc_forwarding", - .data = &ipv4_devconf.mc_forwarding, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS, - .procname = "accept_redirects", - .data = &ipv4_devconf.accept_redirects, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS, - .procname = "secure_redirects", - .data = &ipv4_devconf.secure_redirects, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_SHARED_MEDIA, - .procname = "shared_media", - .data = &ipv4_devconf.shared_media, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_RP_FILTER, - .procname = "rp_filter", - .data = &ipv4_devconf.rp_filter, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS, - .procname = "send_redirects", - .data = &ipv4_devconf.send_redirects, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, - .procname = "accept_source_route", - .data = &ipv4_devconf.accept_source_route, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_PROXY_ARP, - .procname = "proxy_arp", - .data = &ipv4_devconf.proxy_arp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_MEDIUM_ID, - .procname = "medium_id", - .data = &ipv4_devconf.medium_id, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_BOOTP_RELAY, - .procname = "bootp_relay", - .data = &ipv4_devconf.bootp_relay, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_LOG_MARTIANS, - .procname = "log_martians", - .data = &ipv4_devconf.log_martians, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_TAG, - .procname = "tag", - .data = &ipv4_devconf.tag, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_ARPFILTER, - .procname = "arp_filter", - .data = &ipv4_devconf.arp_filter, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE, - .procname = "arp_announce", - .data = &ipv4_devconf.arp_announce, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_ARP_IGNORE, - .procname = "arp_ignore", - .data = &ipv4_devconf.arp_ignore, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_ARP_ACCEPT, - .procname = "arp_accept", - .data = &ipv4_devconf.arp_accept, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_CONF_NOXFRM, - .procname = "disable_xfrm", - .data = &ipv4_devconf.no_xfrm, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &ipv4_doint_and_flush, - .strategy = &ipv4_doint_and_flush_strategy, - }, - { - .ctl_name = NET_IPV4_CONF_NOPOLICY, - .procname = "disable_policy", - .data = &ipv4_devconf.no_policy, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &ipv4_doint_and_flush, - .strategy = &ipv4_doint_and_flush_strategy, - }, - { - .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION, - .procname = "force_igmp_version", - .data = &ipv4_devconf.force_igmp_version, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &ipv4_doint_and_flush, - .strategy = &ipv4_doint_and_flush_strategy, - }, - { - .ctl_name = NET_IPV4_CONF_PROMOTE_SECONDARIES, - .procname = "promote_secondaries", - .data = &ipv4_devconf.promote_secondaries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &ipv4_doint_and_flush, - .strategy = &ipv4_doint_and_flush_strategy, - }, + DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", + devinet_sysctl_forward, + devinet_conf_sysctl), + DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), + + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), + DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), + DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), + DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), + DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, + "accept_source_route"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), + DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), + DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), + DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), + DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), + DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), + DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), + DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), + DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), + + DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), + DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), + DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, + "force_igmp_version"), + DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, + "promote_secondaries"), }, .devinet_dev = { { @@ -1561,6 +1503,7 @@ static void devinet_sysctl_register(struct in_device *in_dev, return; for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; + t->devinet_vars[i].extra1 = p; } if (dev) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 837f2957fa83..311d633f7f39 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -250,8 +250,6 @@ e_inval: return -EINVAL; } -#ifndef CONFIG_IP_NOSIOCRT - static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; @@ -443,16 +441,7 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg) return -EINVAL; } -#else - -int ip_rt_ioctl(unsigned int cmd, void *arg) -{ - return -EINVAL; -} - -#endif - -struct nla_policy rtm_ipv4_policy[RTA_MAX+1] __read_mostly = { +const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 9cfecf1215c9..07e843a47dde 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -456,6 +456,8 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(-1); + rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, + &cfg->fc_nlinfo, NLM_F_REPLACE); return 0; } @@ -523,7 +525,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) rt_cache_flush(-1); rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo); + &cfg->fc_nlinfo, 0); return 0; out_free_new_fa: @@ -589,7 +591,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) fa = fa_to_delete; rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo); + tb->tb_id, &cfg->fc_nlinfo, 0); kill_fn = 0; write_lock_bh(&fib_hash_lock); diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 0e8b70bad4e1..eef9eec17e0c 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -30,7 +30,8 @@ extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int dst_len, u8 tos, struct fib_info *fi, unsigned int); extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, - int dst_len, u32 tb_id, struct nl_info *info); + int dst_len, u32 tb_id, struct nl_info *info, + unsigned int nlm_flags); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); extern int fib_detect_death(struct fib_info *fi, int order, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 33083ad52e9f..2a947840210e 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -169,7 +169,7 @@ static struct fib_table *fib_empty_table(void) return NULL; } -static struct nla_policy fib4_rule_policy[FRA_MAX+1] __read_mostly = { +static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { FRA_GENERIC_POLICY, [FRA_FLOW] = { .type = NLA_U32 }, }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 406ea7050aed..bb94550d95c3 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -301,7 +301,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, - int dst_len, u32 tb_id, struct nl_info *info) + int dst_len, u32 tb_id, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; @@ -313,7 +314,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, err = fib_dump_info(skb, info->pid, seq, event, tb_id, fa->fa_type, fa->fa_scope, key, dst_len, - fa->fa_tos, fa->fa_info, 0); + fa->fa_tos, fa->fa_info, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9be7da7c3a8f..30e332ade61b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1226,6 +1226,8 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(-1); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, + tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); goto succeeded; } @@ -1278,7 +1280,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) rt_cache_flush(-1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, - &cfg->fc_nlinfo); + &cfg->fc_nlinfo, 0); succeeded: return 0; @@ -1624,7 +1626,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) fa = fa_to_delete; rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, - &cfg->fc_nlinfo); + &cfg->fc_nlinfo, 0); l = fib_find_node(t, key); li = find_leaf_info(l, plen); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d38cbba92a4d..02a899bec196 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -514,9 +514,15 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) saddr = iph->daddr; if (!(rt->rt_flags & RTCF_LOCAL)) { - if (sysctl_icmp_errors_use_inbound_ifaddr) - saddr = inet_select_addr(skb_in->dev, 0, RT_SCOPE_LINK); - else + struct net_device *dev = NULL; + + if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) + dev = dev_get_by_index(rt->fl.iif); + + if (dev) { + saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + dev_put(dev); + } else saddr = 0; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f4dd47453108..a646409c2d06 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -128,14 +128,16 @@ * contradict to specs provided this delay is small enough. */ -#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \ - (in_dev)->cnf.force_igmp_version == 1 || \ - ((in_dev)->mr_v1_seen && \ - time_before(jiffies, (in_dev)->mr_v1_seen))) -#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \ - (in_dev)->cnf.force_igmp_version == 2 || \ - ((in_dev)->mr_v2_seen && \ - time_before(jiffies, (in_dev)->mr_v2_seen))) +#define IGMP_V1_SEEN(in_dev) \ + (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 1 || \ + IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ + ((in_dev)->mr_v1_seen && \ + time_before(jiffies, (in_dev)->mr_v1_seen))) +#define IGMP_V2_SEEN(in_dev) \ + (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 2 || \ + IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ + ((in_dev)->mr_v2_seen && \ + time_before(jiffies, (in_dev)->mr_v2_seen))) static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 43fb1600f1f0..fbe7714f21d0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -31,10 +31,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); /* * This array holds the first and last local port number. - * For high-usage systems, use sysctl to change this to - * 32768-61000 */ -int sysctl_local_port_range[2] = { 1024, 4999 }; +int sysctl_local_port_range[2] = { 32768, 61000 }; int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d6427d918512..34ea4547ebbe 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1352,7 +1352,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } { - struct flowi fl = { .nl_u = { .ip4_u = + struct flowi fl = { .oif = arg->bound_dev_if, + .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, .tos = RT_TOS(ip_hdr(skb)->tos) } }, @@ -1376,6 +1377,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar inet->tos = ip_hdr(skb)->tos; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; + sk->sk_bound_dev_if = arg->bound_dev_if; ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 0ebae413ae87..d96582acdf69 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -152,9 +152,11 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); - if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL) + if (in_dev == NULL) goto failure; - in_dev->cnf.rp_filter = 0; + + ipv4_devconf_setall(in_dev); + IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; if (dev_open(dev)) goto failure; @@ -218,10 +220,15 @@ static struct net_device *ipmr_reg_vif(void) } dev->iflink = 0; - if ((in_dev = inetdev_init(dev)) == NULL) + rcu_read_lock(); + if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { + rcu_read_unlock(); goto failure; + } - in_dev->cnf.rp_filter = 0; + ipv4_devconf_setall(in_dev); + IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; + rcu_read_unlock(); if (dev_open(dev)) goto failure; @@ -281,7 +288,7 @@ static int vif_delete(int vifi) dev_set_allmulti(dev, -1); if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { - in_dev->cnf.mc_forwarding--; + IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; ip_rt_multicast_event(in_dev); } @@ -426,7 +433,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) return -EADDRNOTAVAIL; - in_dev->cnf.mc_forwarding++; + IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; dev_set_allmulti(dev, +1); ip_rt_multicast_event(in_dev); @@ -841,7 +848,7 @@ static void mrtsock_destruct(struct sock *sk) { rtnl_lock(); if (sk == mroute_socket) { - ipv4_devconf.mc_forwarding--; + IPV4_DEVCONF_ALL(MC_FORWARDING)--; write_lock_bh(&mrt_lock); mroute_socket=NULL; @@ -890,7 +897,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt mroute_socket=sk; write_unlock_bh(&mrt_lock); - ipv4_devconf.mc_forwarding++; + IPV4_DEVCONF_ALL(MC_FORWARDING)++; } rtnl_unlock(); return ret; diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 891b9355cf96..09d0c3f35669 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -1,10 +1,7 @@ # # IP Virtual Server configuration # -menu "IP: Virtual Server Configuration" - depends on NETFILTER - -config IP_VS +menuconfig IP_VS tristate "IP virtual server support (EXPERIMENTAL)" depends on NETFILTER ---help--- @@ -25,9 +22,10 @@ config IP_VS If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +if IP_VS + config IP_VS_DEBUG bool "IP virtual server debugging" - depends on IP_VS ---help--- Say Y here if you want to get additional messages useful in debugging the IP virtual server code. You can change the debug @@ -35,7 +33,6 @@ config IP_VS_DEBUG config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" - depends on IP_VS default "12" ---help--- The IPVS connection hash table uses the chaining scheme to handle @@ -61,42 +58,35 @@ config IP_VS_TAB_BITS needed for your box. comment "IPVS transport protocol load balancing support" - depends on IP_VS config IP_VS_PROTO_TCP bool "TCP load balancing support" - depends on IP_VS ---help--- This option enables support for load balancing TCP transport protocol. Say Y if unsure. config IP_VS_PROTO_UDP bool "UDP load balancing support" - depends on IP_VS ---help--- This option enables support for load balancing UDP transport protocol. Say Y if unsure. config IP_VS_PROTO_ESP bool "ESP load balancing support" - depends on IP_VS ---help--- This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH bool "AH load balancing support" - depends on IP_VS ---help--- This option enables support for load balancing AH (Authentication Header) transport protocol. Say Y if unsure. comment "IPVS scheduler" - depends on IP_VS config IP_VS_RR tristate "round-robin scheduling" - depends on IP_VS ---help--- The robin-robin scheduling algorithm simply directs network connections to different real servers in a round-robin manner. @@ -106,7 +96,6 @@ config IP_VS_RR config IP_VS_WRR tristate "weighted round-robin scheduling" - depends on IP_VS ---help--- The weighted robin-robin scheduling algorithm directs network connections to different real servers based on server weights @@ -120,7 +109,6 @@ config IP_VS_WRR config IP_VS_LC tristate "least-connection scheduling" - depends on IP_VS ---help--- The least-connection scheduling algorithm directs network connections to the server with the least number of active @@ -131,7 +119,6 @@ config IP_VS_LC config IP_VS_WLC tristate "weighted least-connection scheduling" - depends on IP_VS ---help--- The weighted least-connection scheduling algorithm directs network connections to the server with the least active connections @@ -142,7 +129,6 @@ config IP_VS_WLC config IP_VS_LBLC tristate "locality-based least-connection scheduling" - depends on IP_VS ---help--- The locality-based least-connection scheduling algorithm is for destination IP load balancing. It is usually used in cache cluster. @@ -157,7 +143,6 @@ config IP_VS_LBLC config IP_VS_LBLCR tristate "locality-based least-connection with replication scheduling" - depends on IP_VS ---help--- The locality-based least-connection with replication scheduling algorithm is also for destination IP load balancing. It is @@ -176,7 +161,6 @@ config IP_VS_LBLCR config IP_VS_DH tristate "destination hashing scheduling" - depends on IP_VS ---help--- The destination hashing scheduling algorithm assigns network connections to the servers through looking up a statically assigned @@ -187,7 +171,6 @@ config IP_VS_DH config IP_VS_SH tristate "source hashing scheduling" - depends on IP_VS ---help--- The source hashing scheduling algorithm assigns network connections to the servers through looking up a statically assigned @@ -198,7 +181,6 @@ config IP_VS_SH config IP_VS_SED tristate "shortest expected delay scheduling" - depends on IP_VS ---help--- The shortest expected delay scheduling algorithm assigns network connections to the server with the shortest expected delay. The @@ -212,7 +194,6 @@ config IP_VS_SED config IP_VS_NQ tristate "never queue scheduling" - depends on IP_VS ---help--- The never queue scheduling algorithm adopts a two-speed model. When there is an idle server available, the job will be sent to @@ -225,11 +206,10 @@ config IP_VS_NQ module, choose M here. If unsure, say N. comment 'IPVS application helper' - depends on IP_VS config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS && IP_VS_PROTO_TCP + depends on IP_VS_PROTO_TCP ---help--- FTP is a protocol that transfers IP address and/or port number in the payload. In the virtual server via Network Address Translation, @@ -241,4 +221,4 @@ config IP_VS_FTP If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. -endmenu +endif # IP_VS diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e3f83bf160d9..9bacf1a03630 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -499,7 +499,8 @@ check_entry(struct ipt_entry *e, const char *name) } static inline int check_match(struct ipt_entry_match *m, const char *name, - const struct ipt_ip *ip, unsigned int hookmask) + const struct ipt_ip *ip, unsigned int hookmask, + unsigned int *i) { struct xt_match *match; int ret; @@ -515,6 +516,8 @@ static inline int check_match(struct ipt_entry_match *m, const char *name, m->u.kernel.match->name); ret = -EINVAL; } + if (!ret) + (*i)++; return ret; } @@ -537,11 +540,10 @@ find_check_match(struct ipt_entry_match *m, } m->u.kernel.match = match; - ret = check_match(m, name, ip, hookmask); + ret = check_match(m, name, ip, hookmask, i); if (ret) goto err; - (*i)++; return 0; err: module_put(m->u.kernel.match->me); @@ -1425,7 +1427,7 @@ out: } static inline int -compat_check_calc_match(struct ipt_entry_match *m, +compat_find_calc_match(struct ipt_entry_match *m, const char *name, const struct ipt_ip *ip, unsigned int hookmask, @@ -1449,6 +1451,31 @@ compat_check_calc_match(struct ipt_entry_match *m, } static inline int +compat_release_match(struct ipt_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + module_put(m->u.kernel.match->me); + return 0; +} + +static inline int +compat_release_entry(struct ipt_entry *e, unsigned int *i) +{ + struct ipt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IPT_MATCH_ITERATE(e, compat_release_match, NULL); + t = ipt_get_target(e); + module_put(t->u.kernel.target->me); + return 0; +} + +static inline int check_compat_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, unsigned int *size, @@ -1485,10 +1512,10 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, off = 0; entry_offset = (void *)e - (void *)base; j = 0; - ret = IPT_MATCH_ITERATE(e, compat_check_calc_match, name, &e->ip, + ret = IPT_MATCH_ITERATE(e, compat_find_calc_match, name, &e->ip, e->comefrom, &off, &j); if (ret != 0) - goto cleanup_matches; + goto release_matches; t = ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, @@ -1499,7 +1526,7 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", t->u.user.name); ret = target ? PTR_ERR(target) : -ENOENT; - goto cleanup_matches; + goto release_matches; } t->u.kernel.target = target; @@ -1526,8 +1553,8 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, out: module_put(t->u.kernel.target->me); -cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); +release_matches: + IPT_MATCH_ITERATE(e, compat_release_match, &j); return ret; } @@ -1574,15 +1601,26 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, return ret; } -static inline int compat_check_entry(struct ipt_entry *e, const char *name) +static inline int compat_check_entry(struct ipt_entry *e, const char *name, + unsigned int *i) { - int ret; + int j, ret; - ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom); + j = 0; + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); if (ret) - return ret; + goto cleanup_matches; + + ret = check_target(e, name); + if (ret) + goto cleanup_matches; - return check_target(e, name); + (*i)++; + return 0; + + cleanup_matches: + IPT_MATCH_ITERATE(e, cleanup_match, &j); + return ret; } static int @@ -1673,10 +1711,17 @@ translate_compat_table(const char *name, if (!mark_source_chains(newinfo, valid_hooks, entry1)) goto free_newinfo; + i = 0; ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name); - if (ret) - goto free_newinfo; + name, &i); + if (ret) { + j -= i; + IPT_ENTRY_ITERATE_CONTINUE(entry1, newinfo->size, i, + compat_release_entry, &j); + IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_free_table_info(newinfo); + return ret; + } /* And one copy for every other CPU */ for_each_possible_cpu(i) @@ -1691,7 +1736,7 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - IPT_ENTRY_ITERATE(entry0, total_size, cleanup_entry, &j); + IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); return ret; out_unlock: compat_flush_offsets(); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 0654eaae70c9..6dc72a815f77 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -133,6 +133,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_help *help; + struct nf_conntrack_helper *helper; /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); @@ -140,12 +141,14 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, return NF_ACCEPT; help = nfct_help(ct); - if (!help || !help->helper) + if (!help) return NF_ACCEPT; - - return help->helper->help(pskb, - skb_network_offset(*pskb) + ip_hdrlen(*pskb), - ct, ctinfo); + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + return helper->help(pskb, skb_network_offset(*pskb) + ip_hdrlen(*pskb), + ct, ctinfo); } static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, @@ -154,12 +157,10 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { -#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ if ((*pskb)->nfct) return NF_ACCEPT; -#endif /* Gather fragments. */ if (ip_hdr(*pskb)->frag_off & htons(IP_MF | IP_OFFSET)) { diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index 751b59801755..e6bc8e5a72f1 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -40,8 +40,7 @@ mangle_rfc959_packet(struct sk_buff **pskb, unsigned int matchoff, unsigned int matchlen, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) + enum ip_conntrack_info ctinfo) { char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; @@ -50,7 +49,6 @@ mangle_rfc959_packet(struct sk_buff **pskb, DEBUGP("calling nf_nat_mangle_tcp_packet\n"); - *seq += strlen(buffer) - matchlen; return nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, matchlen, buffer, strlen(buffer)); } @@ -63,8 +61,7 @@ mangle_eprt_packet(struct sk_buff **pskb, unsigned int matchoff, unsigned int matchlen, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) + enum ip_conntrack_info ctinfo) { char buffer[sizeof("|1|255.255.255.255|65535|")]; @@ -72,7 +69,6 @@ mangle_eprt_packet(struct sk_buff **pskb, DEBUGP("calling nf_nat_mangle_tcp_packet\n"); - *seq += strlen(buffer) - matchlen; return nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, matchlen, buffer, strlen(buffer)); } @@ -85,8 +81,7 @@ mangle_epsv_packet(struct sk_buff **pskb, unsigned int matchoff, unsigned int matchlen, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) + enum ip_conntrack_info ctinfo) { char buffer[sizeof("|||65535|")]; @@ -94,14 +89,13 @@ mangle_epsv_packet(struct sk_buff **pskb, DEBUGP("calling nf_nat_mangle_tcp_packet\n"); - *seq += strlen(buffer) - matchlen; return nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, matchlen, buffer, strlen(buffer)); } static int (*mangle[])(struct sk_buff **, __be32, u_int16_t, unsigned int, unsigned int, struct nf_conn *, - enum ip_conntrack_info, u32 *seq) + enum ip_conntrack_info) = { [NF_CT_FTP_PORT] = mangle_rfc959_packet, [NF_CT_FTP_PASV] = mangle_rfc959_packet, @@ -116,8 +110,7 @@ static unsigned int nf_nat_ftp(struct sk_buff **pskb, enum nf_ct_ftp_type type, unsigned int matchoff, unsigned int matchlen, - struct nf_conntrack_expect *exp, - u32 *seq) + struct nf_conntrack_expect *exp) { __be32 newip; u_int16_t port; @@ -145,8 +138,7 @@ static unsigned int nf_nat_ftp(struct sk_buff **pskb, if (port == 0) return NF_DROP; - if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, - seq)) { + if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo)) { nf_conntrack_unexpect_related(exp); return NF_DROP; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index fcebc968d37f..c5d2a2d690b8 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -455,9 +455,9 @@ static int nat_q931(struct sk_buff **pskb, struct nf_conn *ct, if (idx > 0 && get_h225_addr(ct, *data, &taddr[0], &addr, &port) && (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { - set_h225_addr_hook(pskb, data, 0, &taddr[0], - &ct->tuplehash[!dir].tuple.dst.u3, - info->sig_port[!dir]); + set_h225_addr(pskb, data, 0, &taddr[0], + &ct->tuplehash[!dir].tuple.dst.u3, + info->sig_port[!dir]); } } else { nf_conntrack_unexpect_related(exp); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 37ab5802ca08..3b690cf2a4ee 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -109,6 +109,17 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; +/* Following RFC4293 items are displayed in /proc/net/netstat */ +static const struct snmp_mib snmp4_ipextstats_list[] = { + SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), + SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), + SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS), + SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), + SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS), + SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS), + SNMP_MIB_SENTINEL +}; + static const struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), @@ -249,7 +260,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", - ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl); + IPV4_DEVCONF_ALL(FORWARDING) ? 1 : 2, sysctl_ip_default_ttl); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", @@ -338,6 +349,16 @@ static int netstat_seq_show(struct seq_file *seq, void *v) snmp_fold_field((void **)net_statistics, snmp4_net_list[i].entry)); + seq_puts(seq, "\nIpExt:"); + for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); + + seq_puts(seq, "\nIpExt:"); + for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void **)ip_statistics, + snmp4_ipextstats_list[i].entry)); + seq_putc(seq, '\n'); return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cb76e3c725a0..29ca63e81ced 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1636,7 +1636,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; - if (in_dev->cnf.no_policy) + if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; @@ -1778,9 +1778,9 @@ static inline int __mkroute_input(struct sk_buff *skb, if (res->fi->fib_nhs > 1) rth->u.dst.flags |= DST_BALANCED; #endif - if (in_dev->cnf.no_policy) + if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; - if (out_dev->cnf.no_xfrm) + if (IN_DEV_CONF_GET(out_dev, NOXFRM)) rth->u.dst.flags |= DST_NOXFRM; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; @@ -2021,7 +2021,7 @@ local_input: atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; - if (in_dev->cnf.no_policy) + if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; @@ -2218,9 +2218,9 @@ static inline int __mkroute_output(struct rtable **result, rth->u.dst.flags |= DST_BALANCED; } #endif - if (in_dev->cnf.no_xfrm) + if (IN_DEV_CONF_GET(in_dev, NOXFRM)) rth->u.dst.flags |= DST_NOXFRM; - if (in_dev->cnf.no_policy) + if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = oldflp->fl4_dst; @@ -2396,7 +2396,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(oldflp->fl4_src); - if ((dev_out == NULL) && !(sysctl_ip_nonlocal_bind)) + if (dev_out == NULL) goto out; /* I removed check for oif == dev_out->oif here. @@ -2407,7 +2407,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) of another iface. --ANK */ - if (dev_out && oldflp->oif == 0 + if (oldflp->oif == 0 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface @@ -2598,6 +2598,69 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) EXPORT_SYMBOL_GPL(__ip_route_output_key); +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +{ +} + +static struct dst_ops ipv4_dst_blackhole_ops = { + .family = AF_INET, + .protocol = __constant_htons(ETH_P_IP), + .destroy = ipv4_dst_destroy, + .check = ipv4_dst_check, + .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .entry_size = sizeof(struct rtable), +}; + + +static int ipv4_blackhole_output(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk) +{ + struct rtable *ort = *rp; + struct rtable *rt = (struct rtable *) + dst_alloc(&ipv4_dst_blackhole_ops); + + if (rt) { + struct dst_entry *new = &rt->u.dst; + + atomic_set(&new->__refcnt, 1); + new->__use = 1; + new->input = ipv4_blackhole_output; + new->output = ipv4_blackhole_output; + memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + + new->dev = ort->u.dst.dev; + if (new->dev) + dev_hold(new->dev); + + rt->fl = ort->fl; + + rt->idev = ort->idev; + if (rt->idev) + in_dev_hold(rt->idev); + rt->rt_flags = ort->rt_flags; + rt->rt_type = ort->rt_type; + rt->rt_dst = ort->rt_dst; + rt->rt_src = ort->rt_src; + rt->rt_iif = ort->rt_iif; + rt->rt_gateway = ort->rt_gateway; + rt->rt_spec_dst = ort->rt_spec_dst; + rt->peer = ort->peer; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + + dst_free(new); + } + + dst_release(&(*rp)->u.dst); + *rp = rt; + return (rt ? 0 : -ENOMEM); +} + int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) { int err; @@ -2610,7 +2673,11 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, flp->fl4_src = (*rp)->rt_src; if (!flp->fl4_dst) flp->fl4_dst = (*rp)->rt_dst; - return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); + err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); + if (err == -EREMOTE) + err = ipv4_dst_blackhole(rp, flp, sk); + + return err; } return 0; @@ -2692,7 +2759,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, __be32 dst = rt->rt_dst; if (MULTICAST(dst) && !LOCAL_MCAST(dst) && - ipv4_devconf.mc_forwarding) { + IPV4_DEVCONF_ALL(MC_FORWARDING)) { int err = ipmr_get_route(skb, r, nowait); if (err <= 0) { if (!nowait) { @@ -3139,6 +3206,8 @@ int __init ip_rt_init(void) kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; + rt_hash_table = (struct rt_hash_bucket *) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6817d6485df5..53ef0f4bbdaa 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -37,12 +37,12 @@ static int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int val = ipv4_devconf.forwarding; + int val = IPV4_DEVCONF_ALL(FORWARDING); int ret; ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (write && ipv4_devconf.forwarding != val) + if (write && IPV4_DEVCONF_ALL(FORWARDING) != val) inet_forward_change(); return ret; @@ -222,7 +222,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_FORWARD, .procname = "ip_forward", - .data = &ipv4_devconf.forwarding, + .data = &IPV4_DEVCONF_ALL(FORWARDING), .maxlen = sizeof(int), .mode = 0644, .proc_handler = &ipv4_sysctl_forward, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd4c295f5d79..cd3c7e95de9e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1674,9 +1674,8 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_stream_mem_reclaim(sk); - if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || - (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (tcp_too_many_orphans(sk, + atomic_read(sk->sk_prot->orphan_count))) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); @@ -2465,13 +2464,10 @@ void __init tcp_init(void) order++) ; if (order >= 4) { - sysctl_local_port_range[0] = 32768; - sysctl_local_port_range[1] = 61000; tcp_death_row.sysctl_max_tw_buckets = 180000; sysctl_tcp_max_orphans = 4096 << (order - 4); sysctl_max_syn_backlog = 1024; } else if (order < 3) { - sysctl_local_port_range[0] = 1024 * (3 - order); tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 281c9f913257..dd9ef65ad3ff 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -29,7 +29,7 @@ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ -static int initial_ssthresh = 100; +static int initial_ssthresh; static int smooth_part = 20; module_param(fast_convergence, int, 0644); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 86b26539e54b..1260e52ad772 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -276,30 +276,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) /* - * Slow start (exponential increase) with - * RFC3742 Limited Slow Start (fast linear increase) support. + * Slow start is used when congestion window is less than slow start + * threshold. This version implements the basic RFC2581 version + * and optionally supports: + * RFC3742 Limited Slow Start - growth limited to max_ssthresh + * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged */ void tcp_slow_start(struct tcp_sock *tp) { - int cnt = 0; - - if (sysctl_tcp_abc) { - /* RFC3465: Slow Start - * TCP sender SHOULD increase cwnd by the number of - * previously unacknowledged bytes ACKed by each incoming - * acknowledgment, provided the increase is not more than L - */ - if (tp->bytes_acked < tp->mss_cache) - return; - } + int cnt; /* increase in packets */ + + /* RFC3465: ABC Slow start + * Increase only after a full MSS of bytes is acked + * + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache) + return; - if (sysctl_tcp_max_ssthresh > 0 && - tp->snd_cwnd > sysctl_tcp_max_ssthresh) - cnt += sysctl_tcp_max_ssthresh>>1; + if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) + cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ else - cnt += tp->snd_cwnd; + cnt = tp->snd_cwnd; /* exponential increase */ - /* RFC3465: We MAY increase by 2 if discovered delayed ack */ + /* RFC3465: ABC + * We MAY increase by 2 if discovered delayed ack + */ if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) cnt <<= 1; tp->bytes_acked = 0; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 14224487b16b..ebfaac2f9f46 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -29,7 +29,7 @@ static int fast_convergence __read_mostly = 1; static int max_increment __read_mostly = 16; static int beta __read_mostly = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ -static int initial_ssthresh __read_mostly = 100; +static int initial_ssthresh __read_mostly; static int bic_scale __read_mostly = 41; static int tcp_friendliness __read_mostly = 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7641b2761a14..d6d0f9b6cdc6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1501,6 +1501,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + /* Abort FRTO algorithm if one is in progress */ + tp->frto_counter = 0; clear_all_retrans_hints(tp); } @@ -2035,7 +2037,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tp->left_out = tp->sacked_out; + tcp_sync_left_out(tp); if (tp->retrans_out == 0) tp->retrans_stamp = 0; @@ -2405,8 +2407,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; + int prior_packets = tp->packets_out; __s32 seq_rtt = -1; - u32 pkts_acked = 0; ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) && @@ -2435,7 +2437,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -2479,6 +2480,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { + u32 pkts_acked = prior_packets - tp->packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -2608,6 +2610,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; + TCP_ECN_queue_cwr(tp); tcp_moderate_cwnd(tp); } @@ -2929,6 +2932,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; } + break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: /* diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5a3e7f839fc5..354721d67f69 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -192,8 +192,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, inet->sport, usin->sin_port, sk, 1); - if (tmp < 0) + if (tmp < 0) { + if (tmp == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return tmp; + } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); @@ -702,6 +705,8 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; + if (twsk) + arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if; ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); @@ -873,6 +878,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, kfree(newkey); return -ENOMEM; } + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } if (tcp_alloc_md5sig_pool() == NULL) { kfree(newkey); @@ -1002,7 +1008,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, return -EINVAL; tp->md5sig_info = p; - + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 3938d5dbdf20..d9323dfff826 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -63,6 +63,9 @@ struct { * FIXME: causes an extra copy */ static void printl(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); + +static void printl(const char *fmt, ...) { va_list args; int len; @@ -95,7 +98,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Only update if port matches */ if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) && (full || tp->snd_cwnd != tcpw.lastcwnd)) { - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u %u\n", NIPQUAD(inet->saddr), ntohs(inet->sport), NIPQUAD(inet->daddr), ntohs(inet->dport), skb->len, tp->snd_nxt, tp->snd_una, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 2ca97b20929d..e9b151b3a598 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -78,9 +78,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (sk->sk_err_soft) orphans <<= 1; - if (orphans >= sysctl_tcp_max_orphans || - (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (tcp_too_many_orphans(sk, orphans)) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); @@ -294,9 +292,9 @@ static void tcp_retransmit_timer(struct sock *sk) * we cannot allow such beasts to hang infinitely. */ #ifdef TCP_DEBUG - if (net_ratelimit()) { + if (1) { struct inet_sock *inet = inet_sk(sk); - printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n", + LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n", NIPQUAD(inet->daddr), ntohs(inet->dport), inet->num, tp->snd_una, tp->snd_nxt); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4c7e95fa090d..facb7e29304e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,36 +114,14 @@ DEFINE_RWLOCK(udp_hash_lock); static int udp_port_rover; -/* - * Note about this hash function : - * Typical use is probably daddr = 0, only dport is going to vary hash - */ -static inline unsigned int udp_hash_port(__u16 port) -{ - return port; -} - -static inline int __udp_lib_port_inuse(unsigned int hash, int port, - const struct sock *this_sk, - struct hlist_head udptable[], - const struct udp_get_port_ops *ops) +static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; - struct inet_sock *inet; - sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { - if (sk->sk_hash != hash) - continue; - inet = inet_sk(sk); - if (inet->num != port) - continue; - if (this_sk) { - if (ops->saddr_cmp(sk, this_sk)) - return 1; - } else if (ops->saddr_any(sk)) + sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) + if (sk->sk_hash == num) return 1; - } return 0; } @@ -154,16 +132,16 @@ static inline int __udp_lib_port_inuse(unsigned int hash, int port, * @snum: port number to look up * @udptable: hash list table, must be of UDP_HTABLE_SIZE * @port_rover: pointer to record of last unallocated port - * @ops: AF-dependent address operations + * @saddr_comp: AF-dependent comparison of bound local IP addresses */ int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_head udptable[], int *port_rover, - const struct udp_get_port_ops *ops) + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2 ) ) { struct hlist_node *node; struct hlist_head *head; struct sock *sk2; - unsigned int hash; int error = 1; write_lock_bh(&udp_hash_lock); @@ -178,8 +156,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { int size; - hash = ops->hash_port_and_rcv_saddr(result, sk); - head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(head)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -204,16 +181,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - hash = udp_hash_port(result); - if (__udp_lib_port_inuse(hash, result, - NULL, udptable, ops)) - continue; - if (ops->saddr_any(sk)) - break; - - hash = ops->hash_port_and_rcv_saddr(result, sk); - if (! __udp_lib_port_inuse(hash, result, - sk, udptable, ops)) + if (! __udp_lib_lport_inuse(result, udptable)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -221,40 +189,21 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: *port_rover = snum = result; } else { - hash = udp_hash_port(snum); - head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; sk_for_each(sk2, node, head) - if (sk2->sk_hash == hash && - sk2 != sk && - inet_sk(sk2)->num == snum && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - ops->saddr_cmp(sk, sk2)) + if (sk2->sk_hash == snum && + sk2 != sk && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if + || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2) ) goto fail; - - if (!ops->saddr_any(sk)) { - hash = ops->hash_port_and_rcv_saddr(snum, sk); - head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; - - sk_for_each(sk2, node, head) - if (sk2->sk_hash == hash && - sk2 != sk && - inet_sk(sk2)->num == snum && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || - !sk->sk_bound_dev_if || - sk2->sk_bound_dev_if == - sk->sk_bound_dev_if) && - ops->saddr_cmp(sk, sk2)) - goto fail; - } } inet_sk(sk)->num = snum; - sk->sk_hash = hash; + sk->sk_hash = snum; if (sk_unhashed(sk)) { - head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); sock_prot_inc_use(sk->sk_prot); } @@ -265,12 +214,12 @@ fail: } int udp_get_port(struct sock *sk, unsigned short snum, - const struct udp_get_port_ops *ops) + int (*scmp)(const struct sock *, const struct sock *)) { - return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, ops); + return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp); } -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -279,33 +228,9 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) inet1->rcv_saddr == inet2->rcv_saddr )); } -static int ipv4_rcv_saddr_any(const struct sock *sk) -{ - return !inet_sk(sk)->rcv_saddr; -} - -static inline unsigned int ipv4_hash_port_and_addr(__u16 port, __be32 addr) -{ - addr ^= addr >> 16; - addr ^= addr >> 8; - return port ^ addr; -} - -static unsigned int ipv4_hash_port_and_rcv_saddr(__u16 port, - const struct sock *sk) -{ - return ipv4_hash_port_and_addr(port, inet_sk(sk)->rcv_saddr); -} - -const struct udp_get_port_ops udp_ipv4_ops = { - .saddr_cmp = ipv4_rcv_saddr_equal, - .saddr_any = ipv4_rcv_saddr_any, - .hash_port_and_rcv_saddr = ipv4_hash_port_and_rcv_saddr, -}; - static inline int udp_v4_get_port(struct sock *sk, unsigned short snum) { - return udp_get_port(sk, snum, &udp_ipv4_ops); + return udp_get_port(sk, snum, ipv4_rcv_saddr_equal); } /* UDP is nearly always wildcards out the wazoo, it makes no sense to try @@ -317,77 +242,63 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, { struct sock *sk, *result = NULL; struct hlist_node *node; - unsigned int hash, hashwild; - int score, best = -1, hport = ntohs(dport); - - hash = ipv4_hash_port_and_addr(hport, daddr); - hashwild = udp_hash_port(hport); + unsigned short hnum = ntohs(dport); + int badness = -1; read_lock(&udp_hash_lock); - -lookup: - - sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { + sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash != hash || ipv6_only_sock(sk) || - inet->num != hport) - continue; - - score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) - continue; - score+=2; - } - if (inet->daddr) { - if (inet->daddr != saddr) - continue; - score+=2; - } - if (inet->dport) { - if (inet->dport != sport) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 9) { - result = sk; - goto found; - } else if (score > best) { - result = sk; - best = score; + if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { + int score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; + } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 9) { + result = sk; + break; + } else if (score > badness) { + result = sk; + badness = score; + } } } - - if (hash != hashwild) { - hash = hashwild; - goto lookup; - } -found: if (result) sock_hold(result); read_unlock(&udp_hash_lock); return result; } -static inline struct sock *udp_v4_mcast_next(struct sock *sk, unsigned int hnum, - int hport, __be32 loc_addr, +static inline struct sock *udp_v4_mcast_next(struct sock *sk, + __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif) { struct hlist_node *node; struct sock *s = sk; + unsigned short hnum = ntohs(loc_port); sk_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (s->sk_hash != hnum || - inet->num != hport || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || @@ -722,8 +633,11 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, .dport = dport } } }; security_sk_classify_flow(sk, &fl); err = ip_route_output_flow(&rt, &fl, sk, 1); - if (err) + if (err) { + if (err == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); goto out; + } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && @@ -1218,45 +1132,29 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) { - struct sock *sk, *skw, *sknext; + struct sock *sk; int dif; - int hport = ntohs(uh->dest); - unsigned int hash = ipv4_hash_port_and_addr(hport, daddr); - unsigned int hashwild = udp_hash_port(hport); - - dif = skb->dev->ifindex; read_lock(&udp_hash_lock); - - sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]); - skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]); - - sk = udp_v4_mcast_next(sk, hash, hport, daddr, uh->source, saddr, dif); - if (!sk) { - hash = hashwild; - sk = udp_v4_mcast_next(skw, hash, hport, daddr, uh->source, - saddr, dif); - } + sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { + struct sock *sknext = NULL; + do { struct sk_buff *skb1 = skb; - sknext = udp_v4_mcast_next(sk_next(sk), hash, hport, - daddr, uh->source, saddr, dif); - if (!sknext && hash != hashwild) { - hash = hashwild; - sknext = udp_v4_mcast_next(skw, hash, hport, - daddr, uh->source, saddr, dif); - } + + sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, + uh->source, saddr, dif); if (sknext) skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) - /* - * we should probably re-process - * instead of dropping packets here. - */ + /* we should probably re-process instead + * of dropping packets here. */ kfree_skb(skb1); } sk = sknext; @@ -1343,7 +1241,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest, - skb->dev->ifindex, udptable); + skb->dev->ifindex, udptable ); if (sk != NULL) { int ret = udp_queue_rcv_skb(sk, skb); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 06d94195e644..820a477cfaa6 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -5,14 +5,14 @@ #include <net/protocol.h> #include <net/inet_common.h> -extern const struct udp_get_port_ops udp_ipv4_ops; - extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int ); extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []); extern int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_head udptable[], int *port_rover, - const struct udp_get_port_ops *ops); + int (*)(const struct sock*,const struct sock*)); +extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *); + extern int udp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 3653b32dce2d..f34fd686a8f1 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -19,15 +19,14 @@ struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; static int udplite_port_rover; int udplite_get_port(struct sock *sk, unsigned short p, - const struct udp_get_port_ops *ops) + int (*c)(const struct sock *, const struct sock *)) { - return __udp_lib_get_port(sk, p, udplite_hash, - &udplite_port_rover, ops); + return __udp_lib_get_port(sk, p, udplite_hash, &udplite_port_rover, c); } static int udplite_v4_get_port(struct sock *sk, unsigned short snum) { - return udplite_get_port(sk, snum, &udp_ipv4_ops); + return udplite_get_port(sk, snum, ipv4_rcv_saddr_equal); } static int udplite_rcv(struct sk_buff *skb) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 5ceca951d73f..fa1902dc81b8 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -139,10 +139,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) nf_reset(skb); if (decaps) { - if (!(skb->dev->flags&IFF_LOOPBACK)) { - dst_release(skb->dst); - skb->dst = NULL; - } + dst_release(skb->dst); + skb->dst = NULL; netif_rx(skb); return 0; } else { diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index a2f2e6a5ec5d..9963700e74c1 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -85,6 +85,8 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; + skb->protocol = htons(ETH_P_IP); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); return 0; } |