From c8923c6b852d3a97c1faad0566e38fca330375a7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 13 Oct 2005 14:41:23 -0700 Subject: [PATCH 1/4] [NETFILTER]: Fix OOPSes on machines with discontiguous cpu numbering. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Original patch by Harald Welte, with feedback from Herbert Xu and testing by Sébastien Bernard. EBTABLES, ARP tables, and IP/IP6 tables all assume that cpus are numbered linearly. That is not necessarily true. This patch fixes that up by calculating the largest possible cpu number, and allocating enough per-cpu structure space given that. Signed-off-by: David S. Miller --- arch/cris/arch-v32/kernel/smp.c | 2 ++ arch/sh/kernel/smp.c | 3 +++ include/linux/cpumask.h | 12 ++++++++++++ net/bridge/netfilter/ebtables.c | 27 +++++++++++++++++---------- net/ipv4/netfilter/arp_tables.c | 14 +++++++++----- net/ipv4/netfilter/ip_tables.c | 17 +++++++++++------ net/ipv6/netfilter/ip6_tables.c | 16 +++++++++++----- 7 files changed, 65 insertions(+), 26 deletions(-) diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 2c5cae04a95c..957f551ba5ce 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -15,6 +15,7 @@ #include #include #include +#include #define IPI_SCHEDULE 1 #define IPI_CALL 2 @@ -28,6 +29,7 @@ spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED}; /* CPU masks */ cpumask_t cpu_online_map = CPU_MASK_NONE; cpumask_t phys_cpu_present_map = CPU_MASK_NONE; +EXPORT_SYMBOL(phys_cpu_present_map); /* Variables used during SMP boot */ volatile int cpu_now_booting = 0; diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 56a39d69e080..5ecefc02896a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,8 @@ struct sh_cpuinfo cpu_data[NR_CPUS]; extern void per_cpu_trap_init(void); cpumask_t cpu_possible_map; +EXPORT_SYMBOL(cpu_possible_map); + cpumask_t cpu_online_map; static atomic_t cpus_booted = ATOMIC_INIT(0); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b15826f6e3a2..fe9778301d07 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -392,4 +392,16 @@ extern cpumask_t cpu_present_map; #define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map) #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map) +/* Find the highest possible smp_processor_id() */ +static inline unsigned int highest_possible_processor_id(void) +{ + unsigned int cpu, highest = 0; + + for_each_cpu_mask(cpu, cpu_possible_map) + highest = cpu; + + return highest; +} + + #endif /* __LINUX_CPUMASK_H */ diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index c4540144f0f4..f8ffbf6e2333 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -26,6 +26,7 @@ #include #include #include +#include #include /* needed for logical [in,out]-dev filtering */ #include "../br_private.h" @@ -823,10 +824,11 @@ static int translate_table(struct ebt_replace *repl, /* this will get free'd in do_replace()/ebt_register_table() if an error occurs */ newinfo->chainstack = (struct ebt_chainstack **) - vmalloc(num_possible_cpus() * sizeof(struct ebt_chainstack)); + vmalloc((highest_possible_processor_id()+1) + * sizeof(struct ebt_chainstack)); if (!newinfo->chainstack) return -ENOMEM; - for (i = 0; i < num_possible_cpus(); i++) { + for_each_cpu(i) { newinfo->chainstack[i] = vmalloc(udc_cnt * sizeof(struct ebt_chainstack)); if (!newinfo->chainstack[i]) { @@ -895,9 +897,12 @@ static void get_counters(struct ebt_counter *oldcounters, /* counters of cpu 0 */ memcpy(counters, oldcounters, - sizeof(struct ebt_counter) * nentries); + sizeof(struct ebt_counter) * nentries); + /* add other counters to those of cpu 0 */ - for (cpu = 1; cpu < num_possible_cpus(); cpu++) { + for_each_cpu(cpu) { + if (cpu == 0) + continue; counter_base = COUNTER_BASE(oldcounters, nentries, cpu); for (i = 0; i < nentries; i++) { counters[i].pcnt += counter_base[i].pcnt; @@ -929,7 +934,8 @@ static int do_replace(void __user *user, unsigned int len) BUGPRINT("Entries_size never zero\n"); return -EINVAL; } - countersize = COUNTER_OFFSET(tmp.nentries) * num_possible_cpus(); + countersize = COUNTER_OFFSET(tmp.nentries) * + (highest_possible_processor_id()+1); newinfo = (struct ebt_table_info *) vmalloc(sizeof(struct ebt_table_info) + countersize); if (!newinfo) @@ -1022,7 +1028,7 @@ static int do_replace(void __user *user, unsigned int len) vfree(table->entries); if (table->chainstack) { - for (i = 0; i < num_possible_cpus(); i++) + for_each_cpu(i) vfree(table->chainstack[i]); vfree(table->chainstack); } @@ -1040,7 +1046,7 @@ free_counterstmp: vfree(counterstmp); /* can be initialized in translate_table() */ if (newinfo->chainstack) { - for (i = 0; i < num_possible_cpus(); i++) + for_each_cpu(i) vfree(newinfo->chainstack[i]); vfree(newinfo->chainstack); } @@ -1132,7 +1138,8 @@ int ebt_register_table(struct ebt_table *table) return -EINVAL; } - countersize = COUNTER_OFFSET(table->table->nentries) * num_possible_cpus(); + countersize = COUNTER_OFFSET(table->table->nentries) * + (highest_possible_processor_id()+1); newinfo = (struct ebt_table_info *) vmalloc(sizeof(struct ebt_table_info) + countersize); ret = -ENOMEM; @@ -1186,7 +1193,7 @@ free_unlock: up(&ebt_mutex); free_chainstack: if (newinfo->chainstack) { - for (i = 0; i < num_possible_cpus(); i++) + for_each_cpu(i) vfree(newinfo->chainstack[i]); vfree(newinfo->chainstack); } @@ -1209,7 +1216,7 @@ void ebt_unregister_table(struct ebt_table *table) up(&ebt_mutex); vfree(table->private->entries); if (table->private->chainstack) { - for (i = 0; i < num_possible_cpus(); i++) + for_each_cpu(i) vfree(table->private->chainstack[i]); vfree(table->private->chainstack); } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index fa1634256680..a7969286e6e7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -716,8 +716,10 @@ static int translate_table(const char *name, } /* And one copy for every other CPU */ - for (i = 1; i < num_possible_cpus(); i++) { - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + for_each_cpu(i) { + if (i == 0) + continue; + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, newinfo->entries, SMP_ALIGN(newinfo->size)); } @@ -767,7 +769,7 @@ static void get_counters(const struct arpt_table_info *t, unsigned int cpu; unsigned int i; - for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + for_each_cpu(cpu) { i = 0; ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), t->size, @@ -885,7 +887,8 @@ static int do_replace(void __user *user, unsigned int len) return -ENOMEM; newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(tmp.size) * num_possible_cpus()); + + SMP_ALIGN(tmp.size) * + (highest_possible_processor_id()+1)); if (!newinfo) return -ENOMEM; @@ -1158,7 +1161,8 @@ int arpt_register_table(struct arpt_table *table, = { 0, 0, 0, { 0 }, { 0 }, { } }; newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(repl->size) * num_possible_cpus()); + + SMP_ALIGN(repl->size) * + (highest_possible_processor_id()+1)); if (!newinfo) { ret = -ENOMEM; return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index eef99a1b5de6..75c27e92f6ab 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -921,8 +922,10 @@ translate_table(const char *name, } /* And one copy for every other CPU */ - for (i = 1; i < num_possible_cpus(); i++) { - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + for_each_cpu(i) { + if (i == 0) + continue; + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, newinfo->entries, SMP_ALIGN(newinfo->size)); } @@ -943,7 +946,7 @@ replace_table(struct ipt_table *table, struct ipt_entry *table_base; unsigned int i; - for (i = 0; i < num_possible_cpus(); i++) { + for_each_cpu(i) { table_base = (void *)newinfo->entries + TABLE_OFFSET(newinfo, i); @@ -990,7 +993,7 @@ get_counters(const struct ipt_table_info *t, unsigned int cpu; unsigned int i; - for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + for_each_cpu(cpu) { i = 0; IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), t->size, @@ -1128,7 +1131,8 @@ do_replace(void __user *user, unsigned int len) return -ENOMEM; newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(tmp.size) * num_possible_cpus()); + + SMP_ALIGN(tmp.size) * + (highest_possible_processor_id()+1)); if (!newinfo) return -ENOMEM; @@ -1458,7 +1462,8 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) = { 0, 0, 0, { 0 }, { 0 }, { } }; newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(repl->size) * num_possible_cpus()); + + SMP_ALIGN(repl->size) * + (highest_possible_processor_id()+1)); if (!newinfo) return -ENOMEM; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2da514b16d95..b03e90649eb5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -950,8 +951,10 @@ translate_table(const char *name, } /* And one copy for every other CPU */ - for (i = 1; i < num_possible_cpus(); i++) { - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + for_each_cpu(i) { + if (i == 0) + continue; + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, newinfo->entries, SMP_ALIGN(newinfo->size)); } @@ -973,6 +976,7 @@ replace_table(struct ip6t_table *table, unsigned int i; for (i = 0; i < num_possible_cpus(); i++) { + for_each_cpu(i) { table_base = (void *)newinfo->entries + TABLE_OFFSET(newinfo, i); @@ -1019,7 +1023,7 @@ get_counters(const struct ip6t_table_info *t, unsigned int cpu; unsigned int i; - for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + for_each_cpu(cpu) { i = 0; IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), t->size, @@ -1153,7 +1157,8 @@ do_replace(void __user *user, unsigned int len) return -ENOMEM; newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(tmp.size) * num_possible_cpus()); + + SMP_ALIGN(tmp.size) * + (highest_possible_processor_id()+1)); if (!newinfo) return -ENOMEM; @@ -1467,7 +1472,8 @@ int ip6t_register_table(struct ip6t_table *table, = { 0, 0, 0, { 0 }, { 0 }, { } }; newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(repl->size) * num_possible_cpus()); + + SMP_ALIGN(repl->size) * + (highest_possible_processor_id()+1)); if (!newinfo) return -ENOMEM; From 34cb711ba922f53cca45443b8c3c1078873cf599 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 13 Oct 2005 14:41:44 -0700 Subject: [PATCH 2/4] [NET]: Disable NET_SCH_CLK_CPU for SMP x86 hosts Opterons with frequency scaling have fully unsynchronized TSCs running at different frequencies, so using TSCs there is not a good idea. Also some other x86 boxes have this problem. gettimeofday should be good enough, so just disable it. Signed-off-by: Andi Kleen Signed-off-by: David S. Miller --- net/sched/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 45d3bc0812c8..81510da31792 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -72,9 +72,11 @@ config NET_SCH_CLK_GETTIMEOFDAY Choose this if you need a high resolution clock source but can't use the CPU's cycle counter. +# don't allow on SMP x86 because they can have unsynchronized TSCs. +# gettimeofday is a good alternative config NET_SCH_CLK_CPU bool "CPU cycle counter" - depends on X86_TSC || X86_64 || ALPHA || SPARC64 || PPC64 || IA64 + depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64 help Say Y here if you want to use the CPU's cycle counter as clock source. This is a cheap and high resolution clock source, but on some From eb0d6041143fae63410c5622fef96862e6b20933 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Thu, 13 Oct 2005 14:42:04 -0700 Subject: [PATCH 3/4] [CONNECTOR]: Update documentation to match reality. Updated documentation to reflect 2.6.14 netlink changes about socket options, multicasting and group number. Please concider for 2.6.14. Signed-off-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- Documentation/connector/connector.txt | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt index 54a0a14bfbe3..57a314b14cf8 100644 --- a/Documentation/connector/connector.txt +++ b/Documentation/connector/connector.txt @@ -131,3 +131,47 @@ Netlink itself is not reliable protocol, that means that messages can be lost due to memory pressure or process' receiving queue overflowed, so caller is warned must be prepared. That is why struct cn_msg [main connector's message header] contains u32 seq and u32 ack fields. + +/*****************************************/ +Userspace usage. +/*****************************************/ +2.6.14 has a new netlink socket implementation, which by default does not +allow to send data to netlink groups other than 1. +So, if to use netlink socket (for example using connector) +with different group number userspace application must subscribe to +that group. It can be achieved by following pseudocode: + +s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); + +l_local.nl_family = AF_NETLINK; +l_local.nl_groups = 12345; +l_local.nl_pid = 0; + +if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { + perror("bind"); + close(s); + return -1; +} + +{ + int on = l_local.nl_groups; + setsockopt(s, 270, 1, &on, sizeof(on)); +} + +Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket +option. To drop multicast subscription one should call above socket option +with NETLINK_DROP_MEMBERSHIP parameter which is defined as 0. + +2.6.14 netlink code only allows to select a group which is less or equal to +the maximum group number, which is used at netlink_kernel_create() time. +In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use +group number 12345, you must increment CN_NETLINK_USERS to that number. +Additional 0xf numbers are allocated to be used by non-in-kernel users. + +Due to this limitation, group 0xffffffff does not work now, so one can +not use add/remove connector's group notifications, but as far as I know, +only cn_test.c test module used it. + +Some work in netlink area is still being done, so things can be changed in +2.6.15 timeframe, if it will happen, documentation will be updated for that +kernel. From 046d20b73960b7a2474b6d5e920d54c3fd7c23fe Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 13 Oct 2005 14:42:24 -0700 Subject: [PATCH 4/4] [TCP]: Ratelimit debugging warning. Better safe than sorry. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f37a50e55b68..7114031fdc70 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -436,11 +436,13 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss u16 flags; if (unlikely(len >= skb->len)) { - printk(KERN_DEBUG "TCP: seg_size=%u, mss=%u, seq=%u, " - "end_seq=%u, skb->len=%u.\n", len, mss_now, - TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - skb->len); - WARN_ON(1); + if (net_ratelimit()) { + printk(KERN_DEBUG "TCP: seg_size=%u, mss=%u, seq=%u, " + "end_seq=%u, skb->len=%u.\n", len, mss_now, + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + skb->len); + WARN_ON(1); + } return 0; }