diff options
Diffstat (limited to 'arch/ia64/sn/kernel')
-rw-r--r-- | arch/ia64/sn/kernel/bte.c | 83 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/huberror.c | 2 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/io_init.c | 60 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/irq.c | 77 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/setup.c | 199 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/sn2/ptc_deadlock.S | 13 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/sn2/sn2_smp.c | 283 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/sn2/sn_hwperf.c | 317 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/sn2/sn_proc_fs.c | 6 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/sn2/timer_interrupt.c | 22 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/tiocx.c | 69 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/xpc.h | 368 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/xpc_channel.c | 329 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/xpc_main.c | 334 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/xpc_partition.c | 475 | ||||
-rw-r--r-- | arch/ia64/sn/kernel/xpnet.c | 6 |
16 files changed, 1922 insertions, 721 deletions
diff --git a/arch/ia64/sn/kernel/bte.c b/arch/ia64/sn/kernel/bte.c index 647deae9bfcd..d71f4de44f79 100644 --- a/arch/ia64/sn/kernel/bte.c +++ b/arch/ia64/sn/kernel/bte.c @@ -29,16 +29,30 @@ /* two interfaces on two btes */ #define MAX_INTERFACES_TO_TRY 4 +#define MAX_NODES_TO_TRY 2 static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface) { nodepda_t *tmp_nodepda; + if (nasid_to_cnodeid(nasid) == -1) + return (struct bteinfo_s *)NULL;; + tmp_nodepda = NODEPDA(nasid_to_cnodeid(nasid)); return &tmp_nodepda->bte_if[interface]; } +static inline void bte_start_transfer(struct bteinfo_s *bte, u64 len, u64 mode) +{ + if (is_shub2()) { + BTE_CTRL_STORE(bte, (IBLS_BUSY | ((len) | (mode) << 24))); + } else { + BTE_LNSTAT_STORE(bte, len); + BTE_CTRL_STORE(bte, mode); + } +} + /************************************************************************ * Block Transfer Engine copy related functions. * @@ -67,13 +81,15 @@ bte_result_t bte_copy(u64 src, u64 dest, u64 len, u64 mode, void *notification) { u64 transfer_size; u64 transfer_stat; + u64 notif_phys_addr; struct bteinfo_s *bte; bte_result_t bte_status; unsigned long irq_flags; unsigned long itc_end = 0; - struct bteinfo_s *btes_to_try[MAX_INTERFACES_TO_TRY]; - int bte_if_index; - int bte_pri, bte_sec; + int nasid_to_try[MAX_NODES_TO_TRY]; + int my_nasid = cpuid_to_nasid(raw_smp_processor_id()); + int bte_if_index, nasid_index; + int bte_first, btes_per_node = BTES_PER_NODE; BTE_PRINTK(("bte_copy(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%p)\n", src, dest, len, mode, notification)); @@ -86,36 +102,26 @@ bte_result_t bte_copy(u64 src, u64 dest, u64 len, u64 mode, void *notification) (src & L1_CACHE_MASK) || (dest & L1_CACHE_MASK)); BUG_ON(!(len < ((BTE_LEN_MASK + 1) << L1_CACHE_SHIFT))); - /* CPU 0 (per node) tries bte0 first, CPU 1 try bte1 first */ - if (cpuid_to_subnode(smp_processor_id()) == 0) { - bte_pri = 0; - bte_sec = 1; - } else { - bte_pri = 1; - bte_sec = 0; - } + /* + * Start with interface corresponding to cpu 
number + */ + bte_first = raw_smp_processor_id() % btes_per_node; if (mode & BTE_USE_DEST) { /* try remote then local */ - btes_to_try[0] = bte_if_on_node(NASID_GET(dest), bte_pri); - btes_to_try[1] = bte_if_on_node(NASID_GET(dest), bte_sec); + nasid_to_try[0] = NASID_GET(dest); if (mode & BTE_USE_ANY) { - btes_to_try[2] = bte_if_on_node(get_nasid(), bte_pri); - btes_to_try[3] = bte_if_on_node(get_nasid(), bte_sec); + nasid_to_try[1] = my_nasid; } else { - btes_to_try[2] = NULL; - btes_to_try[3] = NULL; + nasid_to_try[1] = (int)NULL; } } else { /* try local then remote */ - btes_to_try[0] = bte_if_on_node(get_nasid(), bte_pri); - btes_to_try[1] = bte_if_on_node(get_nasid(), bte_sec); + nasid_to_try[0] = my_nasid; if (mode & BTE_USE_ANY) { - btes_to_try[2] = bte_if_on_node(NASID_GET(dest), bte_pri); - btes_to_try[3] = bte_if_on_node(NASID_GET(dest), bte_sec); + nasid_to_try[1] = NASID_GET(dest); } else { - btes_to_try[2] = NULL; - btes_to_try[3] = NULL; + nasid_to_try[1] = (int)NULL; } } @@ -123,11 +129,12 @@ retry_bteop: do { local_irq_save(irq_flags); - bte_if_index = 0; + bte_if_index = bte_first; + nasid_index = 0; /* Attempt to lock one of the BTE interfaces. */ - while (bte_if_index < MAX_INTERFACES_TO_TRY) { - bte = btes_to_try[bte_if_index++]; + while (nasid_index < MAX_NODES_TO_TRY) { + bte = bte_if_on_node(nasid_to_try[nasid_index],bte_if_index); if (bte == NULL) { continue; @@ -143,6 +150,15 @@ retry_bteop: break; } } + + bte_if_index = (bte_if_index + 1) % btes_per_node; /* Next interface */ + if (bte_if_index == bte_first) { + /* + * We've tried all interfaces on this node + */ + nasid_index++; + } + bte = NULL; } @@ -169,7 +185,13 @@ retry_bteop: /* Initialize the notification to a known value. 
*/ *bte->most_rcnt_na = BTE_WORD_BUSY; + notif_phys_addr = TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na)); + if (is_shub2()) { + src = SH2_TIO_PHYS_TO_DMA(src); + dest = SH2_TIO_PHYS_TO_DMA(dest); + notif_phys_addr = SH2_TIO_PHYS_TO_DMA(notif_phys_addr); + } /* Set the source and destination registers */ BTE_PRINTKV(("IBSA = 0x%lx)\n", (TO_PHYS(src)))); BTE_SRC_STORE(bte, TO_PHYS(src)); @@ -177,14 +199,12 @@ retry_bteop: BTE_DEST_STORE(bte, TO_PHYS(dest)); /* Set the notification register */ - BTE_PRINTKV(("IBNA = 0x%lx)\n", - TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na)))); - BTE_NOTIF_STORE(bte, - TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na))); + BTE_PRINTKV(("IBNA = 0x%lx)\n", notif_phys_addr)); + BTE_NOTIF_STORE(bte, notif_phys_addr); /* Initiate the transfer */ BTE_PRINTK(("IBCT = 0x%lx)\n", BTE_VALID_MODE(mode))); - BTE_START_TRANSFER(bte, transfer_size, BTE_VALID_MODE(mode)); + bte_start_transfer(bte, transfer_size, BTE_VALID_MODE(mode)); itc_end = ia64_get_itc() + (40000000 * local_cpu_data->cyc_per_usec); @@ -195,6 +215,7 @@ retry_bteop: } while ((transfer_stat = *bte->most_rcnt_na) == BTE_WORD_BUSY) { + cpu_relax(); if (ia64_get_itc() > itc_end) { BTE_PRINTK(("BTE timeout nasid 0x%x bte%d IBLS = 0x%lx na 0x%lx\n", NASID_GET(bte->bte_base_addr), bte->bte_num, diff --git a/arch/ia64/sn/kernel/huberror.c b/arch/ia64/sn/kernel/huberror.c index 5c39b43ba3c0..5c5eb01c50f0 100644 --- a/arch/ia64/sn/kernel/huberror.c +++ b/arch/ia64/sn/kernel/huberror.c @@ -76,7 +76,7 @@ void hubiio_crb_free(struct hubdev_info *hubdev_info, int crbnum) */ REMOTE_HUB_S(hubdev_info->hdi_nasid, IIO_ICDR, (IIO_ICDR_PND | crbnum)); while (REMOTE_HUB_L(hubdev_info->hdi_nasid, IIO_ICDR) & IIO_ICDR_PND) - udelay(1); + cpu_relax(); } diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c index a6649baf629a..b4f5053f5e1b 100644 --- a/arch/ia64/sn/kernel/io_init.c +++ b/arch/ia64/sn/kernel/io_init.c @@ -18,11 +18,10 @@ #include <asm/sn/simulator.h> 
#include <asm/sn/sn_sal.h> #include <asm/sn/tioca_provider.h> +#include <asm/sn/tioce_provider.h> #include "xtalk/hubdev.h" #include "xtalk/xwidgetdev.h" -nasid_t master_nasid = INVALID_NASID; /* Partition Master */ - static struct list_head sn_sysdata_list; /* sysdata list struct */ @@ -44,6 +43,9 @@ int sn_ioif_inited = 0; /* SN I/O infrastructure initialized? */ struct sn_pcibus_provider *sn_pci_provider[PCIIO_ASIC_MAX_TYPES]; /* indexed by asic type */ +static int max_segment_number = 0; /* Default highest segment number */ +static int max_pcibus_number = 255; /* Default highest pci bus number */ + /* * Hooks and struct for unsupported pci providers */ @@ -157,13 +159,28 @@ static void sn_fixup_ionodes(void) uint64_t nasid; int i, widget; - for (i = 0; i < numionodes; i++) { + /* + * Get SGI Specific HUB chipset information. + * Inform Prom that this kernel can support domain bus numbering. + */ + for (i = 0; i < num_cnodes; i++) { hubdev = (struct hubdev_info *)(NODEPDA(i)->pdinfo); nasid = cnodeid_to_nasid(i); + hubdev->max_segment_number = 0xffffffff; + hubdev->max_pcibus_number = 0xff; status = sal_get_hubdev_info(nasid, (uint64_t) __pa(hubdev)); if (status) continue; + /* Save the largest Domain and pcibus numbers found. */ + if (hubdev->max_segment_number) { + /* + * Dealing with a Prom that supports segments. 
+ */ + max_segment_number = hubdev->max_segment_number; + max_pcibus_number = hubdev->max_pcibus_number; + } + /* Attach the error interrupt handlers */ if (nasid & 1) ice_error_init(hubdev); @@ -203,6 +220,7 @@ static void sn_fixup_ionodes(void) continue; } + spin_lock_init(&sn_flush_device_list->sfdl_flush_lock); hubdev->hdi_flush_nasid_list.widget_p[widget] = sn_flush_device_list; } @@ -229,7 +247,7 @@ void sn_pci_unfixup_slot(struct pci_dev *dev) void sn_pci_fixup_slot(struct pci_dev *dev) { int idx; - int segment = 0; + int segment = pci_domain_nr(dev->bus); int status = 0; struct pcibus_bussoft *bs; struct pci_bus *host_pci_bus; @@ -282,9 +300,9 @@ void sn_pci_fixup_slot(struct pci_dev *dev) * PCI host_pci_dev struct and set up host bus linkages */ - bus_no = SN_PCIDEV_INFO(dev)->pdi_slot_host_handle >> 32; + bus_no = (SN_PCIDEV_INFO(dev)->pdi_slot_host_handle >> 32) & 0xff; devfn = SN_PCIDEV_INFO(dev)->pdi_slot_host_handle & 0xffffffff; - host_pci_bus = pci_find_bus(pci_domain_nr(dev->bus), bus_no); + host_pci_bus = pci_find_bus(segment, bus_no); host_pci_dev = pci_get_slot(host_pci_bus, devfn); SN_PCIDEV_INFO(dev)->host_pci_dev = host_pci_dev; @@ -322,7 +340,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus) struct pci_controller *controller; struct pcibus_bussoft *prom_bussoft_ptr; struct hubdev_info *hubdev_info; - void *provider_soft; + void *provider_soft = NULL; struct sn_pcibus_provider *provider; status = sal_get_pcibus_info((u64) segment, (u64) busnum, @@ -332,13 +350,14 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus) prom_bussoft_ptr = __va(prom_bussoft_ptr); controller = kcalloc(1,sizeof(struct pci_controller), GFP_KERNEL); + controller->segment = segment; if (!controller) BUG(); if (bus == NULL) { bus = pci_scan_bus(busnum, &pci_root_ops, controller); if (bus == NULL) - return; /* error, or bus already scanned */ + goto error_return; /* error, or bus already scanned */ bus->sysdata = 
NULL; } @@ -351,28 +370,30 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus) */ if (prom_bussoft_ptr->bs_asic_type >= PCIIO_ASIC_MAX_TYPES) - return; /* unsupported asic type */ + goto error_return; /* unsupported asic type */ if (prom_bussoft_ptr->bs_asic_type == PCIIO_ASIC_TYPE_PPB) goto error_return; /* no further fixup necessary */ provider = sn_pci_provider[prom_bussoft_ptr->bs_asic_type]; if (provider == NULL) - return; /* no provider registerd for this asic */ + goto error_return; /* no provider registerd for this asic */ - provider_soft = NULL; + bus->sysdata = controller; if (provider->bus_fixup) provider_soft = (*provider->bus_fixup) (prom_bussoft_ptr, controller); - if (provider_soft == NULL) - return; /* fixup failed or not applicable */ + if (provider_soft == NULL) { + /* fixup failed or not applicable */ + bus->sysdata = NULL; + goto error_return; + } /* * Generic bus fixup goes here. Don't reference prom_bussoft_ptr * after this point. */ - bus->sysdata = controller; PCI_CONTROLLER(bus)->platform_data = provider_soft; nasid = NASID_GET(SN_PCIBUS_BUSSOFT(bus)->bs_base); cnode = nasid_to_cnodeid(nasid); @@ -387,7 +408,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus) if (controller->node >= num_online_nodes()) { struct pcibus_bussoft *b = SN_PCIBUS_BUSSOFT(bus); - printk(KERN_WARNING "Device ASIC=%u XID=%u PBUSNUM=%lu" + printk(KERN_WARNING "Device ASIC=%u XID=%u PBUSNUM=%u" "L_IO=%lx L_MEM=%lx BASE=%lx\n", b->bs_asic_type, b->bs_xid, b->bs_persist_busnum, b->bs_legacy_io, b->bs_legacy_mem, b->bs_base); @@ -408,7 +429,7 @@ void sn_bus_store_sysdata(struct pci_dev *dev) { struct sysdata_el *element; - element = kcalloc(1, sizeof(struct sysdata_el), GFP_KERNEL); + element = kzalloc(sizeof(struct sysdata_el), GFP_KERNEL); if (!element) { dev_dbg(dev, "%s: out of memory!\n", __FUNCTION__); return; @@ -442,6 +463,7 @@ sn_sysdata_free_start: static int __init sn_pci_init(void) { int i = 0; + int j = 
0; struct pci_dev *pci_dev = NULL; extern void sn_init_cpei_timer(void); #ifdef CONFIG_PROC_FS @@ -461,6 +483,7 @@ static int __init sn_pci_init(void) pcibr_init_provider(); tioca_init_provider(); + tioce_init_provider(); /* * This is needed to avoid bounce limit checks in the blk layer @@ -476,8 +499,9 @@ static int __init sn_pci_init(void) #endif /* busses are not known yet ... */ - for (i = 0; i < PCI_BUSES_TO_SCAN; i++) - sn_pci_controller_fixup(0, i, NULL); + for (i = 0; i <= max_segment_number; i++) + for (j = 0; j <= max_pcibus_number; j++) + sn_pci_controller_fixup(i, j, NULL); /* * Generic Linux PCI Layer has created the pci_bus and pci_dev diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index 84d276a14ecb..01d18b7b5bb3 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -5,7 +5,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. 
*/ #include <linux/irq.h> @@ -23,7 +23,7 @@ static void force_interrupt(int irq); static void register_intr_pda(struct sn_irq_info *sn_irq_info); static void unregister_intr_pda(struct sn_irq_info *sn_irq_info); -extern int sn_force_interrupt_flag; +int sn_force_interrupt_flag = 1; extern int sn_ioif_inited; static struct list_head **sn_irq_lh; static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */ @@ -76,16 +76,14 @@ static void sn_enable_irq(unsigned int irq) static void sn_ack_irq(unsigned int irq) { - uint64_t event_occurred, mask = 0; - int nasid; + u64 event_occurred, mask = 0; irq = irq & 0xff; - nasid = get_nasid(); event_occurred = - HUB_L((uint64_t *) GLOBAL_MMR_ADDR(nasid, SH_EVENT_OCCURRED)); + HUB_L((u64*)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED)); mask = event_occurred & SH_ALL_INT_MASK; - HUB_S((uint64_t *) GLOBAL_MMR_ADDR(nasid, SH_EVENT_OCCURRED_ALIAS), - mask); + HUB_S((u64*)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS), + mask); __set_bit(irq, (volatile void *)pda->sn_in_service_ivecs); move_irq(irq); @@ -93,15 +91,12 @@ static void sn_ack_irq(unsigned int irq) static void sn_end_irq(unsigned int irq) { - int nasid; int ivec; - uint64_t event_occurred; + u64 event_occurred; ivec = irq & 0xff; if (ivec == SGI_UART_VECTOR) { - nasid = get_nasid(); - event_occurred = HUB_L((uint64_t *) GLOBAL_MMR_ADDR - (nasid, SH_EVENT_OCCURRED)); + event_occurred = HUB_L((u64*)LOCAL_MMR_ADDR (SH_EVENT_OCCURRED)); /* If the UART bit is set here, we may have received an * interrupt from the UART that the driver missed. To * make sure, we IPI ourselves to force us to look again. 
@@ -132,6 +127,7 @@ static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask) int local_widget, status; nasid_t local_nasid; struct sn_irq_info *new_irq_info; + struct sn_pcibus_provider *pci_provider; new_irq_info = kmalloc(sizeof(struct sn_irq_info), GFP_ATOMIC); if (new_irq_info == NULL) @@ -171,8 +167,9 @@ static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask) new_irq_info->irq_cpuid = cpuid; register_intr_pda(new_irq_info); - if (IS_PCI_BRIDGE_ASIC(new_irq_info->irq_bridge_type)) - pcibr_change_devices_irq(new_irq_info); + pci_provider = sn_pci_provider[new_irq_info->irq_bridge_type]; + if (pci_provider && pci_provider->target_interrupt) + (pci_provider->target_interrupt)(new_irq_info); spin_lock(&sn_irq_info_lock); list_replace_rcu(&sn_irq_info->list, &new_irq_info->list); @@ -317,6 +314,16 @@ void sn_irq_unfixup(struct pci_dev *pci_dev) pci_dev_put(pci_dev); } +static inline void +sn_call_force_intr_provider(struct sn_irq_info *sn_irq_info) +{ + struct sn_pcibus_provider *pci_provider; + + pci_provider = sn_pci_provider[sn_irq_info->irq_bridge_type]; + if (pci_provider && pci_provider->force_interrupt) + (*pci_provider->force_interrupt)(sn_irq_info); +} + static void force_interrupt(int irq) { struct sn_irq_info *sn_irq_info; @@ -325,11 +332,9 @@ static void force_interrupt(int irq) return; rcu_read_lock(); - list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[irq], list) { - if (IS_PCI_BRIDGE_ASIC(sn_irq_info->irq_bridge_type) && - (sn_irq_info->irq_bridge != NULL)) - pcibr_force_interrupt(sn_irq_info); - } + list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[irq], list) + sn_call_force_intr_provider(sn_irq_info); + rcu_read_unlock(); } @@ -351,6 +356,14 @@ static void sn_check_intr(int irq, struct sn_irq_info *sn_irq_info) struct pcidev_info *pcidev_info; struct pcibus_info *pcibus_info; + /* + * Bridge types attached to TIO (anything but PIC) do not need this WAR + * since they do not target Shub II interrupt registers. 
If that + * ever changes, this check needs to accomodate. + */ + if (sn_irq_info->irq_bridge_type != PCIIO_ASIC_TYPE_PIC) + return; + pcidev_info = (struct pcidev_info *)sn_irq_info->irq_pciioinfo; if (!pcidev_info) return; @@ -377,16 +390,12 @@ static void sn_check_intr(int irq, struct sn_irq_info *sn_irq_info) break; } if (!test_bit(irr_bit, &irr_reg)) { - if (!test_bit(irq, pda->sn_soft_irr)) { - if (!test_bit(irq, pda->sn_in_service_ivecs)) { - regval &= 0xff; - if (sn_irq_info->irq_int_bit & regval & - sn_irq_info->irq_last_intr) { - regval &= - ~(sn_irq_info-> - irq_int_bit & regval); - pcibr_force_interrupt(sn_irq_info); - } + if (!test_bit(irq, pda->sn_in_service_ivecs)) { + regval &= 0xff; + if (sn_irq_info->irq_int_bit & regval & + sn_irq_info->irq_last_intr) { + regval &= ~(sn_irq_info->irq_int_bit & regval); + sn_call_force_intr_provider(sn_irq_info); } } } @@ -404,13 +413,7 @@ void sn_lb_int_war_check(void) rcu_read_lock(); for (i = pda->sn_first_irq; i <= pda->sn_last_irq; i++) { list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[i], list) { - /* - * Only call for PCI bridges that are fully - * initialized. 
- */ - if (IS_PCI_BRIDGE_ASIC(sn_irq_info->irq_bridge_type) && - (sn_irq_info->irq_bridge != NULL)) - sn_check_intr(i, sn_irq_info); + sn_check_intr(i, sn_irq_info); } } rcu_read_unlock(); diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 7c7fe441d623..0fb579ef18c2 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -49,6 +49,7 @@ #include <asm/sn/clksupport.h> #include <asm/sn/sn_sal.h> #include <asm/sn/geo.h> +#include <asm/sn/sn_feature_sets.h> #include "xtalk/xwidgetdev.h" #include "xtalk/hubdev.h" #include <asm/sn/klconfig.h> @@ -56,9 +57,7 @@ DEFINE_PER_CPU(struct pda_s, pda_percpu); -#define MAX_PHYS_MEMORY (1UL << 49) /* 1 TB */ - -lboard_t *root_lboard[MAX_COMPACT_NODES]; +#define MAX_PHYS_MEMORY (1UL << IA64_MAX_PHYS_BITS) /* Max physical address supported */ extern void bte_init_node(nodepda_t *, cnodeid_t); @@ -80,8 +79,6 @@ EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); EXPORT_PER_CPU_SYMBOL(__sn_nodepda); -partid_t sn_partid = -1; -EXPORT_SYMBOL(sn_partid); char sn_system_serial_number_string[128]; EXPORT_SYMBOL(sn_system_serial_number_string); u64 sn_partition_serial_number; @@ -98,14 +95,15 @@ u8 sn_region_size; EXPORT_SYMBOL(sn_region_size); int sn_prom_type; /* 0=hardware, 1=medusa/realprom, 2=medusa/fakeprom */ -short physical_node_map[MAX_PHYSNODE_ID]; +short physical_node_map[MAX_NUMALINK_NODES]; +static unsigned long sn_prom_features[MAX_PROM_FEATURE_SETS]; EXPORT_SYMBOL(physical_node_map); -int numionodes; +int num_cnodes; static void sn_init_pdas(char **); -static void scan_for_ionodes(void); +static void build_cnode_tables(void); static nodepda_t *nodepdaindr[MAX_COMPACT_NODES]; @@ -140,19 +138,6 @@ char drive_info[4 * 16]; #endif /* - * Get nasid of current cpu early in boot before nodepda is initialized - */ -static int -boot_get_nasid(void) -{ - int nasid; - - if (ia64_sn_get_sapic_info(get_sapicid(), &nasid, NULL, NULL)) - BUG(); - 
return nasid; -} - -/* * This routine can only be used during init, since * smp_boot_data is an init data structure. * We have to use smp_boot_data.cpu_phys_id to find @@ -223,7 +208,6 @@ void __init early_sn_setup(void) } extern int platform_intr_list[]; -extern nasid_t master_nasid; static int __initdata shub_1_1_found = 0; /* @@ -269,11 +253,13 @@ static void __init sn_check_for_wars(void) void __init sn_setup(char **cmdline_p) { long status, ticks_per_sec, drift; - int pxm; u32 version = sn_sal_rev(); extern void sn_cpu_init(void); - ia64_sn_plat_set_error_handling_features(); + ia64_sn_plat_set_error_handling_features(); // obsolete + ia64_sn_set_os_feature(OSF_MCA_SLV_TO_OS_INIT_SLV); + ia64_sn_set_os_feature(OSF_FEAT_LOG_SBES); + #if defined(CONFIG_VT) && defined(CONFIG_VGA_CONSOLE) /* @@ -297,11 +283,10 @@ void __init sn_setup(char **cmdline_p) MAX_DMA_ADDRESS = PAGE_OFFSET + MAX_PHYS_MEMORY; - memset(physical_node_map, -1, sizeof(physical_node_map)); - for (pxm = 0; pxm < MAX_PXM_DOMAINS; pxm++) - if (pxm_to_nid_map[pxm] != -1) - physical_node_map[pxm_to_nasid(pxm)] = - pxm_to_nid_map[pxm]; + /* + * Build the tables for managing cnodes. + */ + build_cnode_tables(); /* * Old PROMs do not provide an ACPI FADT. Disable legacy keyboard @@ -316,18 +301,6 @@ void __init sn_setup(char **cmdline_p) printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF); - /* - * Confirm the SAL we're running on is recent enough... 
- */ - if (version < SN_SAL_MIN_VERSION) { - printk(KERN_ERR "This kernel needs SGI SAL version >= " - "%x.%02x\n", SN_SAL_MIN_VERSION >> 8, - SN_SAL_MIN_VERSION & 0x00FF); - panic("PROM version too old\n"); - } - - master_nasid = boot_get_nasid(); - status = ia64_sal_freq_base(SAL_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, &drift); @@ -385,15 +358,6 @@ static void __init sn_init_pdas(char **cmdline_p) { cnodeid_t cnode; - memset(sn_cnodeid_to_nasid, -1, - sizeof(__ia64_per_cpu_var(__sn_cnodeid_to_nasid))); - for_each_online_node(cnode) - sn_cnodeid_to_nasid[cnode] = - pxm_to_nasid(nid_to_pxm_map[cnode]); - - numionodes = num_online_nodes(); - scan_for_ionodes(); - /* * Allocate & initalize the nodepda for each node. */ @@ -403,12 +367,13 @@ static void __init sn_init_pdas(char **cmdline_p) memset(nodepdaindr[cnode], 0, sizeof(nodepda_t)); memset(nodepdaindr[cnode]->phys_cpuid, -1, sizeof(nodepdaindr[cnode]->phys_cpuid)); + spin_lock_init(&nodepdaindr[cnode]->ptc_lock); } /* * Allocate & initialize nodepda for TIOs. For now, put them on node 0. */ - for (cnode = num_online_nodes(); cnode < numionodes; cnode++) { + for (cnode = num_online_nodes(); cnode < num_cnodes; cnode++) { nodepdaindr[cnode] = alloc_bootmem_node(NODE_DATA(0), sizeof(nodepda_t)); memset(nodepdaindr[cnode], 0, sizeof(nodepda_t)); @@ -417,7 +382,7 @@ static void __init sn_init_pdas(char **cmdline_p) /* * Now copy the array of nodepda pointers to each nodepda. */ - for (cnode = 0; cnode < numionodes; cnode++) + for (cnode = 0; cnode < num_cnodes; cnode++) memcpy(nodepdaindr[cnode]->pernode_pdaindr, nodepdaindr, sizeof(nodepdaindr)); @@ -434,7 +399,7 @@ static void __init sn_init_pdas(char **cmdline_p) * Initialize the per node hubdev. This includes IO Nodes and * headless/memless nodes. 
*/ - for (cnode = 0; cnode < numionodes; cnode++) { + for (cnode = 0; cnode < num_cnodes; cnode++) { hubdev_init_node(nodepdaindr[cnode], cnode); } } @@ -481,6 +446,10 @@ void __init sn_cpu_init(void) if (nodepdaindr[0] == NULL) return; + for (i = 0; i < MAX_PROM_FEATURE_SETS; i++) + if (ia64_sn_get_prom_feature_set(i, &sn_prom_features[i]) != 0) + break; + cpuid = smp_processor_id(); cpuphyid = get_sapicid(); @@ -532,8 +501,8 @@ void __init sn_cpu_init(void) */ { u64 pio1[] = {SH1_PIO_WRITE_STATUS_0, 0, SH1_PIO_WRITE_STATUS_1, 0}; - u64 pio2[] = {SH2_PIO_WRITE_STATUS_0, SH2_PIO_WRITE_STATUS_1, - SH2_PIO_WRITE_STATUS_2, SH2_PIO_WRITE_STATUS_3}; + u64 pio2[] = {SH2_PIO_WRITE_STATUS_0, SH2_PIO_WRITE_STATUS_2, + SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3}; u64 *pio; pio = is_shub1() ? pio1 : pio2; pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]); @@ -555,87 +524,58 @@ void __init sn_cpu_init(void) } /* - * Scan klconfig for ionodes. Add the nasids to the - * physical_node_map and the pda and increment numionodes. + * Build tables for converting between NASIDs and cnodes. */ +static inline int __init board_needs_cnode(int type) +{ + return (type == KLTYPE_SNIA || type == KLTYPE_TIO); +} -static void __init scan_for_ionodes(void) +void __init build_cnode_tables(void) { - int nasid = 0; + int nasid; + int node; lboard_t *brd; - /* fakeprom does not support klgraph */ - if (IS_RUNNING_ON_FAKE_PROM()) - return; - - /* Setup ionodes with memory */ - for (nasid = 0; nasid < MAX_PHYSNODE_ID; nasid += 2) { - char *klgraph_header; - cnodeid_t cnodeid; - - if (physical_node_map[nasid] == -1) - continue; + memset(physical_node_map, -1, sizeof(physical_node_map)); + memset(sn_cnodeid_to_nasid, -1, + sizeof(__ia64_per_cpu_var(__sn_cnodeid_to_nasid))); - cnodeid = -1; - klgraph_header = __va(ia64_sn_get_klconfig_addr(nasid)); - if (!klgraph_header) { - BUG(); /* All nodes must have klconfig tables! 
*/ - } - cnodeid = nasid_to_cnodeid(nasid); - root_lboard[cnodeid] = (lboard_t *) - NODE_OFFSET_TO_LBOARD((nasid), - ((kl_config_hdr_t - *) (klgraph_header))-> - ch_board_info); + /* + * First populate the tables with C/M bricks. This ensures that + * cnode == node for all C & M bricks. + */ + for_each_online_node(node) { + nasid = pxm_to_nasid(nid_to_pxm_map[node]); + sn_cnodeid_to_nasid[node] = nasid; + physical_node_map[nasid] = node; } - /* Scan headless/memless IO Nodes. */ - for (nasid = 0; nasid < MAX_PHYSNODE_ID; nasid += 2) { - /* if there's no nasid, don't try to read the klconfig on the node */ - if (physical_node_map[nasid] == -1) - continue; - brd = find_lboard_any((lboard_t *) - root_lboard[nasid_to_cnodeid(nasid)], - KLTYPE_SNIA); - if (brd) { - brd = KLCF_NEXT_ANY(brd); /* Skip this node's lboard */ - if (!brd) - continue; - } - - brd = find_lboard_any(brd, KLTYPE_SNIA); + /* + * num_cnodes is total number of C/M/TIO bricks. Because of the 256 node + * limit on the number of nodes, we can't use the generic node numbers + * for this. Note that num_cnodes is incremented below as TIOs or + * headless/memoryless nodes are discovered. + */ + num_cnodes = num_online_nodes(); - while (brd) { - sn_cnodeid_to_nasid[numionodes] = brd->brd_nasid; - physical_node_map[brd->brd_nasid] = numionodes; - root_lboard[numionodes] = brd; - numionodes++; - brd = KLCF_NEXT_ANY(brd); - if (!brd) - break; - - brd = find_lboard_any(brd, KLTYPE_SNIA); - } - } + /* fakeprom does not support klgraph */ + if (IS_RUNNING_ON_FAKE_PROM()) + return; - /* Scan for TIO nodes. 
*/ - for (nasid = 0; nasid < MAX_PHYSNODE_ID; nasid += 2) { - /* if there's no nasid, don't try to read the klconfig on the node */ - if (physical_node_map[nasid] == -1) - continue; - brd = find_lboard_any((lboard_t *) - root_lboard[nasid_to_cnodeid(nasid)], - KLTYPE_TIO); + /* Find TIOs & headless/memoryless nodes and add them to the tables */ + for_each_online_node(node) { + kl_config_hdr_t *klgraph_header; + nasid = cnodeid_to_nasid(node); + if ((klgraph_header = ia64_sn_get_klconfig_addr(nasid)) == NULL) + BUG(); + brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info); while (brd) { - sn_cnodeid_to_nasid[numionodes] = brd->brd_nasid; - physical_node_map[brd->brd_nasid] = numionodes; - root_lboard[numionodes] = brd; - numionodes++; - brd = KLCF_NEXT_ANY(brd); - if (!brd) - break; - - brd = find_lboard_any(brd, KLTYPE_TIO); + if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) { + sn_cnodeid_to_nasid[num_cnodes] = brd->brd_nasid; + physical_node_map[brd->brd_nasid] = num_cnodes++; + } + brd = find_lboard_next(brd); } } } @@ -652,3 +592,12 @@ nasid_slice_to_cpuid(int nasid, int slice) return -1; } + +int sn_prom_feature_available(int id) +{ + if (id >= BITS_PER_LONG * MAX_PROM_FEATURE_SETS) + return 0; + return test_bit(id, sn_prom_features); +} +EXPORT_SYMBOL(sn_prom_feature_available); + diff --git a/arch/ia64/sn/kernel/sn2/ptc_deadlock.S b/arch/ia64/sn/kernel/sn2/ptc_deadlock.S index 96cb71d15682..3fa95065a446 100644 --- a/arch/ia64/sn/kernel/sn2/ptc_deadlock.S +++ b/arch/ia64/sn/kernel/sn2/ptc_deadlock.S @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved. 
*/ #include <asm/types.h> @@ -11,7 +11,7 @@ #define DEADLOCKBIT SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_SHFT #define WRITECOUNTMASK SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK -#define ALIAS_OFFSET (SH1_PIO_WRITE_STATUS_0_ALIAS-SH1_PIO_WRITE_STATUS_0) +#define ALIAS_OFFSET 8 .global sn2_ptc_deadlock_recovery_core @@ -36,13 +36,15 @@ sn2_ptc_deadlock_recovery_core: extr.u piowcphy=piowc,0,61;; // Convert piowc to uncached physical address dep piowcphy=-1,piowcphy,63,1 movl mask=WRITECOUNTMASK + mov r8=r0 1: add scr2=ALIAS_OFFSET,piowc // Address of WRITE_STATUS alias register - mov scr1=7;; // Clear DEADLOCK, WRITE_ERROR, MULTI_WRITE_ERROR - st8.rel [scr2]=scr1;; + ;; + ld8.acq scr1=[scr2];; 5: ld8.acq scr1=[piowc];; // Wait for PIOs to complete. + hint @pause and scr2=scr1,mask;; // mask of writecount bits cmp.ne p6,p0=zeroval,scr2 (p6) br.cond.sptk 5b @@ -57,6 +59,7 @@ sn2_ptc_deadlock_recovery_core: st8.rel [ptc0]=data0 // Write PTC0 & wait for completion. 5: ld8.acq scr1=[piowcphy];; // Wait for PIOs to complete. + hint @pause and scr2=scr1,mask;; // mask of writecount bits cmp.ne p6,p0=zeroval,scr2 (p6) br.cond.sptk 5b;; @@ -67,6 +70,7 @@ sn2_ptc_deadlock_recovery_core: (p7) st8.rel [ptc1]=data1;; // Now write PTC1. 5: ld8.acq scr1=[piowcphy];; // Wait for PIOs to complete. + hint @pause and scr2=scr1,mask;; // mask of writecount bits cmp.ne p6,p0=zeroval,scr2 (p6) br.cond.sptk 5b @@ -77,6 +81,7 @@ sn2_ptc_deadlock_recovery_core: srlz.i;; ////////////// END PHYSICAL MODE //////////////////// +(p8) add r8=1,r8 (p8) br.cond.spnt 1b;; // Repeat if DEADLOCK occurred. br.ret.sptk rp diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index 7af05a7ac743..49b530c39a42 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -5,7 +5,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved. 
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved. */ #include <linux/init.h> @@ -20,6 +20,8 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/nodemask.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <asm/processor.h> #include <asm/irq.h> @@ -39,12 +41,120 @@ #include <asm/sn/nodepda.h> #include <asm/sn/rw_mmr.h> -void sn2_ptc_deadlock_recovery(volatile unsigned long *, unsigned long data0, - volatile unsigned long *, unsigned long data1); +DEFINE_PER_CPU(struct ptc_stats, ptcstats); +DECLARE_PER_CPU(struct ptc_stats, ptcstats); static __cacheline_aligned DEFINE_SPINLOCK(sn2_global_ptc_lock); -static unsigned long sn2_ptc_deadlock_count; +void sn2_ptc_deadlock_recovery(short *, short, int, volatile unsigned long *, unsigned long data0, + volatile unsigned long *, unsigned long data1); + +#ifdef DEBUG_PTC +/* + * ptctest: + * + * xyz - 3 digit hex number: + * x - Force PTC purges to use shub: + * 0 - no force + * 1 - force + * y - interupt enable + * 0 - disable interrupts + * 1 - leave interuupts enabled + * z - type of lock: + * 0 - global lock + * 1 - node local lock + * 2 - no lock + * + * Note: on shub1, only ptctest == 0 is supported. Don't try other values! 
+ */ + +static unsigned int sn2_ptctest = 0; + +static int __init ptc_test(char *str) +{ + get_option(&str, &sn2_ptctest); + return 1; +} +__setup("ptctest=", ptc_test); + +static inline int ptc_lock(unsigned long *flagp) +{ + unsigned long opt = sn2_ptctest & 255; + + switch (opt) { + case 0x00: + spin_lock_irqsave(&sn2_global_ptc_lock, *flagp); + break; + case 0x01: + spin_lock_irqsave(&sn_nodepda->ptc_lock, *flagp); + break; + case 0x02: + local_irq_save(*flagp); + break; + case 0x10: + spin_lock(&sn2_global_ptc_lock); + break; + case 0x11: + spin_lock(&sn_nodepda->ptc_lock); + break; + case 0x12: + break; + default: + BUG(); + } + return opt; +} + +static inline void ptc_unlock(unsigned long flags, int opt) +{ + switch (opt) { + case 0x00: + spin_unlock_irqrestore(&sn2_global_ptc_lock, flags); + break; + case 0x01: + spin_unlock_irqrestore(&sn_nodepda->ptc_lock, flags); + break; + case 0x02: + local_irq_restore(flags); + break; + case 0x10: + spin_unlock(&sn2_global_ptc_lock); + break; + case 0x11: + spin_unlock(&sn_nodepda->ptc_lock); + break; + case 0x12: + break; + default: + BUG(); + } +} +#else + +#define sn2_ptctest 0 + +static inline int ptc_lock(unsigned long *flagp) +{ + spin_lock_irqsave(&sn2_global_ptc_lock, *flagp); + return 0; +} + +static inline void ptc_unlock(unsigned long flags, int opt) +{ + spin_unlock_irqrestore(&sn2_global_ptc_lock, flags); +} +#endif + +struct ptc_stats { + unsigned long ptc_l; + unsigned long change_rid; + unsigned long shub_ptc_flushes; + unsigned long nodes_flushed; + unsigned long deadlocks; + unsigned long lock_itc_clocks; + unsigned long shub_itc_clocks; + unsigned long shub_itc_clocks_max; +}; static inline unsigned long wait_piowc(void) { @@ -67,6 +177,7 @@ void sn_tlb_migrate_finish(struct mm_struct *mm) /** * sn2_global_tlb_purge - globally purge translation cache of virtual address range + * @mm: mm_struct containing virtual address range * @start: start of virtual address range * @end: end of virtual address 
range * @nbits: specifies number of bytes to purge per instruction (num = 1<<(nbits & 0xfc)) @@ -78,21 +189,22 @@ void sn_tlb_migrate_finish(struct mm_struct *mm) * - cpu_vm_mask is a bit mask that indicates which cpus have loaded the context. * - cpu_vm_mask is converted into a nodemask of the nodes containing the * cpus in cpu_vm_mask. - * - if only one bit is set in cpu_vm_mask & it is the current cpu, - * then only the local TLB needs to be flushed. This flushing can be done - * using ptc.l. This is the common case & avoids the global spinlock. + * - if only one bit is set in cpu_vm_mask & it is the current cpu & the + * process is purging its own virtual address range, then only the + * local TLB needs to be flushed. This flushing can be done using + * ptc.l. This is the common case & avoids the global spinlock. * - if multiple cpus have loaded the context, then flushing has to be * done with ptc.g/MMRs under protection of the global ptc_lock. */ void -sn2_global_tlb_purge(unsigned long start, unsigned long end, - unsigned long nbits) +sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long nbits) { - int i, shub1, cnode, mynasid, cpu, lcpu = 0, nasid, flushed = 0; + int i, opt, shub1, cnode, mynasid, cpu, lcpu = 0, nasid, flushed = 0; + int mymm = (mm == current->active_mm); volatile unsigned long *ptc0, *ptc1; - unsigned long flags = 0, data0 = 0, data1 = 0; - struct mm_struct *mm = current->active_mm; + unsigned long itc, itc2, flags, data0 = 0, data1 = 0, rr_value; short nasids[MAX_NUMNODES], nix; nodemask_t nodes_flushed; @@ -106,33 +218,41 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end, i++; } + if (i == 0) + return; + preempt_disable(); - if (likely(i == 1 && lcpu == smp_processor_id())) { + if (likely(i == 1 && lcpu == smp_processor_id() && mymm)) { do { ia64_ptcl(start, nbits << 2); start += (1UL << nbits); } while (start < end); ia64_srlz_i(); + __get_cpu_var(ptcstats).ptc_l++; 
preempt_enable(); return; } - if (atomic_read(&mm->mm_users) == 1) { + if (atomic_read(&mm->mm_users) == 1 && mymm) { flush_tlb_mm(mm); + __get_cpu_var(ptcstats).change_rid++; preempt_enable(); return; } + itc = ia64_get_itc(); nix = 0; for_each_node_mask(cnode, nodes_flushed) nasids[nix++] = cnodeid_to_nasid(cnode); + rr_value = (mm->context << 3) | REGION_NUMBER(start); + shub1 = is_shub1(); if (shub1) { data0 = (1UL << SH1_PTC_0_A_SHFT) | (nbits << SH1_PTC_0_PS_SHFT) | - ((ia64_get_rr(start) >> 8) << SH1_PTC_0_RID_SHFT) | + (rr_value << SH1_PTC_0_RID_SHFT) | (1UL << SH1_PTC_0_START_SHFT); ptc0 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH1_PTC_0); ptc1 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH1_PTC_1); @@ -141,14 +261,19 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end, (nbits << SH2_PTC_PS_SHFT) | (1UL << SH2_PTC_START_SHFT); ptc0 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH2_PTC + - ((ia64_get_rr(start) >> 8) << SH2_PTC_RID_SHFT) ); + (rr_value << SH2_PTC_RID_SHFT)); ptc1 = NULL; } mynasid = get_nasid(); - spin_lock_irqsave(&sn2_global_ptc_lock, flags); + itc = ia64_get_itc(); + opt = ptc_lock(&flags); + itc2 = ia64_get_itc(); + __get_cpu_var(ptcstats).lock_itc_clocks += itc2 - itc; + __get_cpu_var(ptcstats).shub_ptc_flushes++; + __get_cpu_var(ptcstats).nodes_flushed += nix; do { if (shub1) @@ -157,7 +282,7 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end, data0 = (data0 & ~SH2_PTC_ADDR_MASK) | (start & SH2_PTC_ADDR_MASK); for (i = 0; i < nix; i++) { nasid = nasids[i]; - if (unlikely(nasid == mynasid)) { + if ((!(sn2_ptctest & 3)) && unlikely(nasid == mynasid && mymm)) { ia64_ptcga(start, nbits << 2); ia64_srlz_i(); } else { @@ -169,18 +294,22 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end, flushed = 1; } } - if (flushed && (wait_piowc() & - SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK)) { - sn2_ptc_deadlock_recovery(ptc0, data0, ptc1, data1); + (SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK))) { + sn2_ptc_deadlock_recovery(nasids, nix, mynasid, 
ptc0, data0, ptc1, data1); } start += (1UL << nbits); } while (start < end); - spin_unlock_irqrestore(&sn2_global_ptc_lock, flags); + itc2 = ia64_get_itc() - itc2; + __get_cpu_var(ptcstats).shub_itc_clocks += itc2; + if (itc2 > __get_cpu_var(ptcstats).shub_itc_clocks_max) + __get_cpu_var(ptcstats).shub_itc_clocks_max = itc2; + + ptc_unlock(flags, opt); preempt_enable(); } @@ -192,31 +321,29 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end, * TLB flush transaction. The recovery sequence is somewhat tricky & is * coded in assembly language. */ -void sn2_ptc_deadlock_recovery(volatile unsigned long *ptc0, unsigned long data0, +void sn2_ptc_deadlock_recovery(short *nasids, short nix, int mynasid, volatile unsigned long *ptc0, unsigned long data0, volatile unsigned long *ptc1, unsigned long data1) { extern void sn2_ptc_deadlock_recovery_core(volatile unsigned long *, unsigned long, volatile unsigned long *, unsigned long, volatile unsigned long *, unsigned long); - int cnode, mycnode, nasid; - volatile unsigned long *piows; - volatile unsigned long zeroval; + short nasid, i; + unsigned long *piows, zeroval; - sn2_ptc_deadlock_count++; + __get_cpu_var(ptcstats).deadlocks++; - piows = pda->pio_write_status_addr; + piows = (unsigned long *) pda->pio_write_status_addr; zeroval = pda->pio_write_status_val; - mycnode = numa_node_id(); - - for_each_online_node(cnode) { - if (is_headless_node(cnode) || cnode == mycnode) + for (i=0; i < nix; i++) { + nasid = nasids[i]; + if (!(sn2_ptctest & 3) && nasid == mynasid) continue; - nasid = cnodeid_to_nasid(cnode); ptc0 = CHANGE_NASID(nasid, ptc0); if (ptc1) ptc1 = CHANGE_NASID(nasid, ptc1); sn2_ptc_deadlock_recovery_core(ptc0, data0, ptc1, data1, piows, zeroval); } + } /** @@ -293,3 +420,93 @@ void sn2_send_IPI(int cpuid, int vector, int delivery_mode, int redirect) sn_send_IPI_phys(nasid, physid, vector, delivery_mode); } + +#ifdef CONFIG_PROC_FS + +#define PTC_BASENAME "sgi_sn/ptc_statistics" + +static void 
*sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) +{ + if (*offset < NR_CPUS) + return offset; + return NULL; +} + +static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset) +{ + (*offset)++; + if (*offset < NR_CPUS) + return offset; + return NULL; +} + +static void sn2_ptc_seq_stop(struct seq_file *file, void *data) +{ +} + +static int sn2_ptc_seq_show(struct seq_file *file, void *data) +{ + struct ptc_stats *stat; + int cpu; + + cpu = *(loff_t *) data; + + if (!cpu) { + seq_printf(file, "# ptc_l change_rid shub_ptc_flushes shub_nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max\n"); + seq_printf(file, "# ptctest %d\n", sn2_ptctest); + } + + if (cpu < NR_CPUS && cpu_online(cpu)) { + stat = &per_cpu(ptcstats, cpu); + seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l, + stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed, + stat->deadlocks, + 1000 * stat->lock_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec, + 1000 * stat->shub_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec, + 1000 * stat->shub_itc_clocks_max / per_cpu(cpu_info, cpu).cyc_per_usec); + } + + return 0; +} + +static struct seq_operations sn2_ptc_seq_ops = { + .start = sn2_ptc_seq_start, + .next = sn2_ptc_seq_next, + .stop = sn2_ptc_seq_stop, + .show = sn2_ptc_seq_show +}; + +int sn2_ptc_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &sn2_ptc_seq_ops); +} + +static struct file_operations proc_sn2_ptc_operations = { + .open = sn2_ptc_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *proc_sn2_ptc; + +static int __init sn2_ptc_init(void) +{ + if (!(proc_sn2_ptc = create_proc_entry(PTC_BASENAME, 0444, NULL))) { + printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME); + return -EINVAL; + } + proc_sn2_ptc->proc_fops = &proc_sn2_ptc_operations; + spin_lock_init(&sn2_global_ptc_lock); + return 0; +} + +static void __exit 
sn2_ptc_exit(void) +{ + remove_proc_entry(PTC_BASENAME, NULL); +} + +module_init(sn2_ptc_init); +module_exit(sn2_ptc_exit); +#endif /* CONFIG_PROC_FS */ + diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 833e700fdac9..6c6fbca3229c 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -36,7 +36,6 @@ #include <asm/topology.h> #include <asm/smp.h> #include <asm/semaphore.h> -#include <asm/segment.h> #include <asm/uaccess.h> #include <asm/sal.h> #include <asm/sn/io.h> @@ -59,7 +58,7 @@ static int sn_hwperf_enum_objects(int *nobj, struct sn_hwperf_object_info **ret) struct sn_hwperf_object_info *objbuf = NULL; if ((e = sn_hwperf_init()) < 0) { - printk("sn_hwperf_init failed: err %d\n", e); + printk(KERN_ERR "sn_hwperf_init failed: err %d\n", e); goto out; } @@ -111,7 +110,7 @@ static int sn_hwperf_geoid_to_cnode(char *location) if (sn_hwperf_location_to_bpos(location, &rack, &bay, &slot, &slab)) return -1; - for (cnode = 0; cnode < numionodes; cnode++) { + for_each_node(cnode) { geoid = cnodeid_get_geoid(cnode); module_id = geo_module(geoid); this_rack = MODULE_GET_RACK(module_id); @@ -124,11 +123,13 @@ static int sn_hwperf_geoid_to_cnode(char *location) } } - return cnode < numionodes ? cnode : -1; + return node_possible(cnode) ? 
cnode : -1; } static int sn_hwperf_obj_to_cnode(struct sn_hwperf_object_info * obj) { + if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj)) + BUG(); if (!obj->sn_hwp_this_part) return -1; return sn_hwperf_geoid_to_cnode(obj->location); @@ -174,31 +175,199 @@ static const char *sn_hwperf_get_slabname(struct sn_hwperf_object_info *obj, return slabname; } -static void print_pci_topology(struct seq_file *s, - struct sn_hwperf_object_info *obj, int *ordinal, - u64 rack, u64 bay, u64 slot, u64 slab) +static void print_pci_topology(struct seq_file *s) +{ + char *p; + size_t sz; + int e; + + for (sz = PAGE_SIZE; sz < 16 * PAGE_SIZE; sz += PAGE_SIZE) { + if (!(p = (char *)kmalloc(sz, GFP_KERNEL))) + break; + e = ia64_sn_ioif_get_pci_topology(__pa(p), sz); + if (e == SALRET_OK) + seq_puts(s, p); + kfree(p); + if (e == SALRET_OK || e == SALRET_NOT_IMPLEMENTED) + break; + } +} + +static inline int sn_hwperf_has_cpus(cnodeid_t node) +{ + return node_online(node) && nr_cpus_node(node); +} + +static inline int sn_hwperf_has_mem(cnodeid_t node) +{ + return node_online(node) && NODE_DATA(node)->node_present_pages; +} + +static struct sn_hwperf_object_info * +sn_hwperf_findobj_id(struct sn_hwperf_object_info *objbuf, + int nobj, int id) { - char *p1; - char *p2; - char *pg; - - if (!(pg = (char *)get_zeroed_page(GFP_KERNEL))) - return; /* ignore */ - if (ia64_sn_ioif_get_pci_topology(rack, bay, slot, slab, - __pa(pg), PAGE_SIZE) == SN_HWPERF_OP_OK) { - for (p1=pg; *p1 && p1 < pg + PAGE_SIZE;) { - if (!(p2 = strchr(p1, '\n'))) + int i; + struct sn_hwperf_object_info *p = objbuf; + + for (i=0; i < nobj; i++, p++) { + if (p->id == id) + return p; + } + + return NULL; + +} + +static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objbuf, + int nobj, cnodeid_t node, cnodeid_t *near_mem_node, cnodeid_t *near_cpu_node) +{ + int e; + struct sn_hwperf_object_info *nodeobj = NULL; + struct sn_hwperf_object_info *op; + struct sn_hwperf_object_info *dest; + struct 
sn_hwperf_object_info *router; + struct sn_hwperf_port_info ptdata[16]; + int sz, i, j; + cnodeid_t c; + int found_mem = 0; + int found_cpu = 0; + + if (!node_possible(node)) + return -EINVAL; + + if (sn_hwperf_has_cpus(node)) { + if (near_cpu_node) + *near_cpu_node = node; + found_cpu++; + } + + if (sn_hwperf_has_mem(node)) { + if (near_mem_node) + *near_mem_node = node; + found_mem++; + } + + if (found_cpu && found_mem) + return 0; /* trivially successful */ + + /* find the argument node object */ + for (i=0, op=objbuf; i < nobj; i++, op++) { + if (!SN_HWPERF_IS_NODE(op) && !SN_HWPERF_IS_IONODE(op)) + continue; + if (node == sn_hwperf_obj_to_cnode(op)) { + nodeobj = op; + break; + } + } + if (!nodeobj) { + e = -ENOENT; + goto err; + } + + /* get its interconnect topology */ + sz = op->ports * sizeof(struct sn_hwperf_port_info); + if (sz > sizeof(ptdata)) + BUG(); + e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, + SN_HWPERF_ENUM_PORTS, nodeobj->id, sz, + (u64)&ptdata, 0, 0, NULL); + if (e != SN_HWPERF_OP_OK) { + e = -EINVAL; + goto err; + } + + /* find nearest node with cpus and nearest memory */ + for (router=NULL, j=0; j < op->ports; j++) { + dest = sn_hwperf_findobj_id(objbuf, nobj, ptdata[j].conn_id); + /* note an attached router before non-node objects are skipped below */ + if (dest && SN_HWPERF_IS_ROUTER(dest)) + router = dest; + if (!dest || SN_HWPERF_FOREIGN(dest) || + !SN_HWPERF_IS_NODE(dest) || SN_HWPERF_IS_IONODE(dest)) { + continue; + } + c = sn_hwperf_obj_to_cnode(dest); + if (!found_cpu && sn_hwperf_has_cpus(c)) { + if (near_cpu_node) + *near_cpu_node = c; + found_cpu++; + } + if (!found_mem && sn_hwperf_has_mem(c)) { + if (near_mem_node) + *near_mem_node = c; + found_mem++; + } + } + + if (router && (!found_cpu || !found_mem)) { + /* search for a node connected to the same router */ + sz = router->ports * sizeof(struct sn_hwperf_port_info); + if (sz > sizeof(ptdata)) + BUG(); + e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, + SN_HWPERF_ENUM_PORTS, router->id, sz, + (u64)&ptdata, 0, 0, NULL); + if (e != SN_HWPERF_OP_OK) { + e =
-EINVAL; + goto err; + } + for (j=0; j < router->ports; j++) { + dest = sn_hwperf_findobj_id(objbuf, nobj, + ptdata[j].conn_id); + if (!dest || dest->id == node || + SN_HWPERF_FOREIGN(dest) || + !SN_HWPERF_IS_NODE(dest) || + SN_HWPERF_IS_IONODE(dest)) { + continue; + } + c = sn_hwperf_obj_to_cnode(dest); + if (!found_cpu && sn_hwperf_has_cpus(c)) { + if (near_cpu_node) + *near_cpu_node = c; + found_cpu++; + } + if (!found_mem && sn_hwperf_has_mem(c)) { + if (near_mem_node) + *near_mem_node = c; + found_mem++; + } + if (found_cpu && found_mem) + break; + } + } + + if (!found_cpu || !found_mem) { + /* resort to _any_ node with CPUs and memory */ + for (i=0, op=objbuf; i < nobj; i++, op++) { + if (SN_HWPERF_FOREIGN(op) || + SN_HWPERF_IS_IONODE(op) || + !SN_HWPERF_IS_NODE(op)) { + continue; + } + c = sn_hwperf_obj_to_cnode(op); + if (!found_cpu && sn_hwperf_has_cpus(c)) { + if (near_cpu_node) + *near_cpu_node = c; + found_cpu++; + } + if (!found_mem && sn_hwperf_has_mem(c)) { + if (near_mem_node) + *near_mem_node = c; + found_mem++; + } + if (found_cpu && found_mem) break; - *p2 = '\0'; - seq_printf(s, "pcibus %d %s-%s\n", - *ordinal, obj->location, p1); - (*ordinal)++; - p1 = p2 + 1; } } - free_page((unsigned long)pg); + + if (!found_cpu || !found_mem) + e = -ENODATA; + +err: + return e; } + static int sn_topology_show(struct seq_file *s, void *d) { int sz; @@ -215,7 +384,6 @@ static int sn_topology_show(struct seq_file *s, void *d) struct sn_hwperf_object_info *p; struct sn_hwperf_object_info *obj = d; /* this object */ struct sn_hwperf_object_info *objs = s->private; /* all objects */ - int rack, bay, slot, slab; u8 shubtype; u8 system_size; u8 sharing_size; @@ -225,7 +393,6 @@ static int sn_topology_show(struct seq_file *s, void *d) u8 region_size; u16 nasid_mask; int nasid_msb; - int pci_bus_ordinal = 0; if (obj == objs) { seq_printf(s, "# sn_topology version 2\n"); @@ -253,6 +420,8 @@ static int sn_topology_show(struct seq_file *s, void *d) shubtype ? 
"shub2" : "shub1", (u64)nasid_mask << nasid_shift, nasid_msb, nasid_shift, system_size, sharing_size, coher, region_size); + + print_pci_topology(s); } if (SN_HWPERF_FOREIGN(obj)) { @@ -272,11 +441,24 @@ static int sn_topology_show(struct seq_file *s, void *d) if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj)) seq_putc(s, '\n'); else { + cnodeid_t near_mem = -1; + cnodeid_t near_cpu = -1; + seq_printf(s, ", nasid 0x%x", cnodeid_to_nasid(ordinal)); - for (i=0; i < numionodes; i++) { - seq_printf(s, i ? ":%d" : ", dist %d", - node_distance(ordinal, i)); + + if (sn_hwperf_get_nearest_node_objdata(objs, sn_hwperf_obj_cnt, + ordinal, &near_mem, &near_cpu) == 0) { + seq_printf(s, ", near_mem_nodeid %d, near_cpu_nodeid %d", + near_mem, near_cpu); + } + + if (!SN_HWPERF_IS_IONODE(obj)) { + for_each_online_node(i) { + seq_printf(s, i ? ":%d" : ", dist %d", + node_distance(ordinal, i)); + } } + seq_putc(s, '\n'); /* @@ -294,23 +476,12 @@ static int sn_topology_show(struct seq_file *s, void *d) for_each_online_cpu(j) { seq_printf(s, j ? 
":%d" : ", dist %d", node_distance( - cpuid_to_cnodeid(i), - cpuid_to_cnodeid(j))); + cpu_to_node(i), + cpu_to_node(j))); } seq_putc(s, '\n'); } } - - /* - * PCI busses attached to this node, if any - */ - if (sn_hwperf_location_to_bpos(obj->location, - &rack, &bay, &slot, &slab)) { - /* export pci bus info */ - print_pci_topology(s, obj, &pci_bus_ordinal, - rack, bay, slot, slab); - - } } if (obj->ports) { @@ -572,6 +743,8 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg) if ((r = sn_hwperf_enum_objects(&nobj, &objs)) == 0) { memset(p, 0, a.sz); for (i = 0; i < nobj; i++) { + if (!SN_HWPERF_IS_NODE(objs + i)) + continue; node = sn_hwperf_obj_to_cnode(objs + i); for_each_online_cpu(j) { if (node != cpu_to_node(j)) @@ -598,7 +771,7 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg) case SN_HWPERF_GET_NODE_NASID: if (a.sz != sizeof(u64) || - (node = a.arg) < 0 || node >= numionodes) { + (node = a.arg) < 0 || !node_possible(node)) { r = -EINVAL; goto error; } @@ -627,6 +800,14 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg) vfree(objs); goto error; } + + if (!SN_HWPERF_IS_NODE(objs + i) && + !SN_HWPERF_IS_IONODE(objs + i)) { + r = -ENOENT; + vfree(objs); + goto error; + } + *(u64 *)p = (u64)sn_hwperf_obj_to_cnode(objs + i); vfree(objs); } @@ -692,6 +873,7 @@ static int sn_hwperf_init(void) /* single threaded, once-only initialization */ down(&sn_hwperf_init_mutex); + if (sn_hwperf_salheap) { up(&sn_hwperf_init_mutex); return e; @@ -742,19 +924,6 @@ out: sn_hwperf_salheap = NULL; sn_hwperf_obj_cnt = 0; } - - if (!e) { - /* - * Register a dynamic misc device for ioctl. Platforms - * supporting hotplug will create /dev/sn_hwperf, else - * user can to look up the minor number in /proc/misc. 
- */ - if ((e = misc_register(&sn_hwperf_dev)) != 0) { - printk(KERN_ERR "sn_hwperf_init: misc register " - "for \"sn_hwperf\" failed, err %d\n", e); - } - } - up(&sn_hwperf_init_mutex); return e; } @@ -782,3 +951,41 @@ int sn_topology_release(struct inode *inode, struct file *file) vfree(seq->private); return seq_release(inode, file); } + +int sn_hwperf_get_nearest_node(cnodeid_t node, + cnodeid_t *near_mem_node, cnodeid_t *near_cpu_node) +{ + int e; + int nobj; + struct sn_hwperf_object_info *objbuf; + + if ((e = sn_hwperf_enum_objects(&nobj, &objbuf)) == 0) { + e = sn_hwperf_get_nearest_node_objdata(objbuf, nobj, + node, near_mem_node, near_cpu_node); + vfree(objbuf); + } + + return e; +} + +static int __devinit sn_hwperf_misc_register_init(void) +{ + int e; + + sn_hwperf_init(); + + /* + * Register a dynamic misc device for hwperf ioctls. Platforms + * supporting hotplug will create /dev/sn_hwperf, else user + * can look up the minor number in /proc/misc. + */ + if ((e = misc_register(&sn_hwperf_dev)) != 0) { + printk(KERN_ERR "sn_hwperf_misc_register_init: failed to " + "register misc device for \"%s\"\n", sn_hwperf_dev.name); + } + + return e; +} + +device_initcall(sn_hwperf_misc_register_init); /* after misc_init() */ +EXPORT_SYMBOL(sn_hwperf_get_nearest_node); diff --git a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c index 6a80fca807b9..a06719d752a0 100644 --- a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c +++ b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
*/ #include <linux/config.h> #include <asm/uaccess.h> @@ -15,7 +15,7 @@ static int partition_id_show(struct seq_file *s, void *p) { - seq_printf(s, "%d\n", sn_local_partid()); + seq_printf(s, "%d\n", sn_partition_id); return 0; } @@ -52,7 +52,7 @@ static int licenseID_open(struct inode *inode, struct file *file) * the bridge chip. The hardware will then send an interrupt message if the * interrupt line is active. This mimics a level sensitive interrupt. */ -int sn_force_interrupt_flag = 1; +extern int sn_force_interrupt_flag; static int sn_force_interrupt_show(struct seq_file *s, void *p) { diff --git a/arch/ia64/sn/kernel/sn2/timer_interrupt.c b/arch/ia64/sn/kernel/sn2/timer_interrupt.c index cde7375390b0..adf5db2e2afe 100644 --- a/arch/ia64/sn/kernel/sn2/timer_interrupt.c +++ b/arch/ia64/sn/kernel/sn2/timer_interrupt.c @@ -1,7 +1,7 @@ /* * * - * Copyright (c) 2003 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License @@ -50,14 +50,16 @@ void sn_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) LED_CPU_HEARTBEAT, LED_CPU_HEARTBEAT); } - if (enable_shub_wars_1_1()) { - /* Bugfix code for SHUB 1.1 */ - if (pda->pio_shub_war_cam_addr) - *pda->pio_shub_war_cam_addr = 0x8000000000000010UL; + if (is_shub1()) { + if (enable_shub_wars_1_1()) { + /* Bugfix code for SHUB 1.1 */ + if (pda->pio_shub_war_cam_addr) + *pda->pio_shub_war_cam_addr = 0x8000000000000010UL; + } + if (pda->sn_lb_int_war_ticks == 0) + sn_lb_int_war_check(); + pda->sn_lb_int_war_ticks++; + if (pda->sn_lb_int_war_ticks >= SN_LB_INT_WAR_INTERVAL) + pda->sn_lb_int_war_ticks = 0; } - if (pda->sn_lb_int_war_ticks == 0) - sn_lb_int_war_check(); - pda->sn_lb_int_war_ticks++; - if (pda->sn_lb_int_war_ticks >= SN_LB_INT_WAR_INTERVAL) - pda->sn_lb_int_war_ticks = 0; } diff --git a/arch/ia64/sn/kernel/tiocx.c 
b/arch/ia64/sn/kernel/tiocx.c index 254fe15c064b..0d8592a745a7 100644 --- a/arch/ia64/sn/kernel/tiocx.c +++ b/arch/ia64/sn/kernel/tiocx.c @@ -183,15 +183,16 @@ int cx_driver_unregister(struct cx_drv *cx_driver) * @part_num: device's part number * @mfg_num: device's manufacturer number * @hubdev: hub info associated with this device + * @bt: board type of the device * */ int cx_device_register(nasid_t nasid, int part_num, int mfg_num, - struct hubdev_info *hubdev) + struct hubdev_info *hubdev, int bt) { struct cx_dev *cx_dev; - cx_dev = kcalloc(1, sizeof(struct cx_dev), GFP_KERNEL); + cx_dev = kzalloc(sizeof(struct cx_dev), GFP_KERNEL); DBG("cx_dev= 0x%p\n", cx_dev); if (cx_dev == NULL) return -ENOMEM; @@ -200,6 +201,7 @@ cx_device_register(nasid_t nasid, int part_num, int mfg_num, cx_dev->cx_id.mfg_num = mfg_num; cx_dev->cx_id.nasid = nasid; cx_dev->hubdev = hubdev; + cx_dev->bt = bt; cx_dev->dev.parent = NULL; cx_dev->dev.bus = &tiocx_bus_type; @@ -238,7 +240,8 @@ static int cx_device_reload(struct cx_dev *cx_dev) { cx_device_unregister(cx_dev); return cx_device_register(cx_dev->cx_id.nasid, cx_dev->cx_id.part_num, - cx_dev->cx_id.mfg_num, cx_dev->hubdev); + cx_dev->cx_id.mfg_num, cx_dev->hubdev, + cx_dev->bt); } static inline uint64_t tiocx_intr_alloc(nasid_t nasid, int widget, @@ -365,26 +368,20 @@ static void tio_corelet_reset(nasid_t nasid, int corelet) udelay(2000); } -static int tiocx_btchar_get(int nasid) +static int is_fpga_tio(int nasid, int *bt) { - moduleid_t module_id; - geoid_t geoid; - int cnodeid; - - cnodeid = nasid_to_cnodeid(nasid); - geoid = cnodeid_get_geoid(cnodeid); - module_id = geo_module(geoid); - return MODULE_GET_BTCHAR(module_id); -} + int ioboard_type; -static int is_fpga_brick(int nasid) -{ - switch (tiocx_btchar_get(nasid)) { + ioboard_type = ia64_sn_sysctl_ioboard_get(nasid); + + switch (ioboard_type) { case L1_BRICKTYPE_SA: case L1_BRICKTYPE_ATHENA: - case L1_BRICKTYPE_DAYTONA: + case L1_BOARDTYPE_DAYTONA: + *bt = ioboard_type; 
return 1; } + return 0; } @@ -407,16 +404,22 @@ static int tiocx_reload(struct cx_dev *cx_dev) if (bitstream_loaded(nasid)) { uint64_t cx_id; - - cx_id = - *(volatile uint64_t *)(TIO_SWIN_BASE(nasid, TIOCX_CORELET) + + int rv; + + rv = ia64_sn_sysctl_tio_clock_reset(nasid); + if (rv) { + printk(KERN_ALERT "CX port JTAG reset failed.\n"); + } else { + cx_id = *(volatile uint64_t *) + (TIO_SWIN_BASE(nasid, TIOCX_CORELET) + WIDGET_ID); - part_num = XWIDGET_PART_NUM(cx_id); - mfg_num = XWIDGET_MFG_NUM(cx_id); - DBG("part= 0x%x, mfg= 0x%x\n", part_num, mfg_num); - /* just ignore it if it's a CE */ - if (part_num == TIO_CE_ASIC_PARTNUM) - return 0; + part_num = XWIDGET_PART_NUM(cx_id); + mfg_num = XWIDGET_MFG_NUM(cx_id); + DBG("part= 0x%x, mfg= 0x%x\n", part_num, mfg_num); + /* just ignore it if it's a CE */ + if (part_num == TIO_CE_ASIC_PARTNUM) + return 0; + } } cx_dev->cx_id.part_num = part_num; @@ -436,10 +439,10 @@ static ssize_t show_cxdev_control(struct device *dev, struct device_attribute *a { struct cx_dev *cx_dev = to_cx_dev(dev); - return sprintf(buf, "0x%x 0x%x 0x%x %d\n", + return sprintf(buf, "0x%x 0x%x 0x%x 0x%x\n", cx_dev->cx_id.nasid, cx_dev->cx_id.part_num, cx_dev->cx_id.mfg_num, - tiocx_btchar_get(cx_dev->cx_id.nasid)); + cx_dev->bt); } static ssize_t store_cxdev_control(struct device *dev, struct device_attribute *attr, const char *buf, @@ -486,13 +489,13 @@ static int __init tiocx_init(void) bus_register(&tiocx_bus_type); - for (cnodeid = 0; cnodeid < MAX_COMPACT_NODES; cnodeid++) { + for (cnodeid = 0; cnodeid < num_cnodes; cnodeid++) { nasid_t nasid; + int bt; - if ((nasid = cnodeid_to_nasid(cnodeid)) < 0) - break; /* No more nasids .. 
bail out of loop */ + nasid = cnodeid_to_nasid(cnodeid); - if ((nasid & 0x1) && is_fpga_brick(nasid)) { + if ((nasid & 0x1) && is_fpga_tio(nasid, &bt)) { struct hubdev_info *hubdev; struct xwidget_info *widgetp; @@ -512,7 +515,7 @@ static int __init tiocx_init(void) if (cx_device_register (nasid, widgetp->xwi_hwid.part_num, - widgetp->xwi_hwid.mfg_num, hubdev) < 0) + widgetp->xwi_hwid.mfg_num, hubdev, bt) < 0) return -ENXIO; else found_tiocx_device++; diff --git a/arch/ia64/sn/kernel/xpc.h b/arch/ia64/sn/kernel/xpc.h index d0ee635daf2e..fbcedc7c27fa 100644 --- a/arch/ia64/sn/kernel/xpc.h +++ b/arch/ia64/sn/kernel/xpc.h @@ -57,7 +57,7 @@ #define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2) #define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */ -#define XPC_HB_CHECK_DEFAULT_TIMEOUT 20 /* check HB every x secs */ +#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */ /* define the process name of HB checker and the CPU it is pinned to */ #define XPC_HB_CHECK_THREAD_NAME "xpc_hb" @@ -67,34 +67,82 @@ #define XPC_DISCOVERY_THREAD_NAME "xpc_discovery" -#define XPC_HB_ALLOWED(_p, _v) ((_v)->heartbeating_to_mask & (1UL << (_p))) -#define XPC_ALLOW_HB(_p, _v) (_v)->heartbeating_to_mask |= (1UL << (_p)) -#define XPC_DISALLOW_HB(_p, _v) (_v)->heartbeating_to_mask &= (~(1UL << (_p))) - - /* - * Reserved Page provided by SAL. + * the reserved page + * + * SAL reserves one page of memory per partition for XPC. Though a full page + * in length (16384 bytes), its starting address is not page aligned, but it + * is cacheline aligned. The reserved page consists of the following: + * + * reserved page header + * + * The first cacheline of the reserved page contains the header + * (struct xpc_rsvd_page). Before SAL initialization has completed, + * SAL has set up the following fields of the reserved page header: + * SAL_signature, SAL_version, partid, and nasids_size. The other + * fields are set up by XPC. 
(xpc_rsvd_page points to the local + * partition's reserved page.) * - * SAL provides one page per partition of reserved memory. When SAL - * initialization is complete, SAL_signature, SAL_version, partid, - * part_nasids, and mach_nasids are set. + * part_nasids mask + * mach_nasids mask + * + * SAL also sets up two bitmaps (or masks), one that reflects the actual + * nasids in this partition (part_nasids), and the other that reflects + * the actual nasids in the entire machine (mach_nasids). We're only + * interested in the even numbered nasids (which contain the processors + * and/or memory), so we only need half as many bits to represent the + * nasids. The part_nasids mask is located starting at the first cacheline + * following the reserved page header. The mach_nasids mask follows right + * after the part_nasids mask. The size in bytes of each mask is reflected + * by the reserved page header field 'nasids_size'. (Local partition's + * mask pointers are xpc_part_nasids and xpc_mach_nasids.) + * + * vars + * vars part + * + * Immediately following the mach_nasids mask are the XPC variables + * required by other partitions. First are those that are generic to all + * partitions (vars), followed on the next available cacheline by those + * which are partition specific (vars part). These are setup by XPC. + * (Local partition's vars pointers are xpc_vars and xpc_vars_part.) * * Note: Until vars_pa is set, the partition XPC code has not been initialized. 
*/ struct xpc_rsvd_page { - u64 SAL_signature; /* SAL unique signature */ - u64 SAL_version; /* SAL specified version */ - u8 partid; /* partition ID from SAL */ + u64 SAL_signature; /* SAL: unique signature */ + u64 SAL_version; /* SAL: version */ + u8 partid; /* SAL: partition ID */ u8 version; - u8 pad[6]; /* pad to u64 align */ + u8 pad1[6]; /* align to next u64 in cacheline */ volatile u64 vars_pa; - u64 part_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned; - u64 mach_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned; + struct timespec stamp; /* time when reserved page was setup by XPC */ + u64 pad2[9]; /* align to last u64 in cacheline */ + u64 nasids_size; /* SAL: size of each nasid mask in bytes */ }; -#define XPC_RP_VERSION _XPC_VERSION(1,0) /* version 1.0 of the reserved page */ -#define XPC_RSVD_PAGE_ALIGNED_SIZE \ - (L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))) +#define XPC_RP_VERSION _XPC_VERSION(1,1) /* version 1.1 of the reserved page */ + +#define XPC_SUPPORTS_RP_STAMP(_version) \ + (_version >= _XPC_VERSION(1,1)) + +/* + * compare stamps - the return value is: + * + * < 0, if stamp1 < stamp2 + * = 0, if stamp1 == stamp2 + * > 0, if stamp1 > stamp2 + */ +static inline int +xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2) +{ + int ret; + + + if ((ret = stamp1->tv_sec - stamp2->tv_sec) == 0) { + ret = stamp1->tv_nsec - stamp2->tv_nsec; + } + return ret; +} /* @@ -121,11 +169,58 @@ struct xpc_vars { u64 vars_part_pa; u64 amos_page_pa; /* paddr of page of AMOs from MSPEC driver */ AMO_t *amos_page; /* vaddr of page of AMOs from MSPEC driver */ - AMO_t *act_amos; /* pointer to the first activation AMO */ }; -#define XPC_V_VERSION _XPC_VERSION(3,0) /* version 3.0 of the cross vars */ -#define XPC_VARS_ALIGNED_SIZE (L1_CACHE_ALIGN(sizeof(struct xpc_vars))) +#define XPC_V_VERSION _XPC_VERSION(3,1) /* version 3.1 of the cross vars */ + +#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \ + (_version >= _XPC_VERSION(3,1)) + + +static 
inline int +xpc_hb_allowed(partid_t partid, struct xpc_vars *vars) +{ + return ((vars->heartbeating_to_mask & (1UL << partid)) != 0); +} + +static inline void +xpc_allow_hb(partid_t partid, struct xpc_vars *vars) +{ + u64 old_mask, new_mask; + + do { + old_mask = vars->heartbeating_to_mask; + new_mask = (old_mask | (1UL << partid)); + } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) != + old_mask); +} + +static inline void +xpc_disallow_hb(partid_t partid, struct xpc_vars *vars) +{ + u64 old_mask, new_mask; + + do { + old_mask = vars->heartbeating_to_mask; + new_mask = (old_mask & ~(1UL << partid)); + } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) != + old_mask); +} + + +/* + * The AMOs page consists of a number of AMO variables which are divided into + * four groups. The first two groups are used to identify an IRQ's sender. + * These two groups consist of 64 and 128 AMO variables respectively. The last + * two groups, consisting of just one AMO variable each, are used to identify + * the remote partitions that are currently engaged (from the viewpoint of + * the XPC running on the remote partition). + */ +#define XPC_NOTIFY_IRQ_AMOS 0 +#define XPC_ACTIVATE_IRQ_AMOS (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS) +#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS) +#define XPC_DISENGAGE_REQUEST_AMO (XPC_ENGAGED_PARTITIONS_AMO + 1) + /* * The following structure describes the per partition specific variables.
@@ -165,6 +260,16 @@ struct xpc_vars_part { #define XPC_VP_MAGIC2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */ +/* the reserved page sizes and offsets */ + +#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page)) +#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars)) + +#define XPC_RP_PART_NASIDS(_rp) (u64 *) ((u8 *) _rp + XPC_RP_HEADER_SIZE) +#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + xp_nasid_mask_words) +#define XPC_RP_VARS(_rp) ((struct xpc_vars *) XPC_RP_MACH_NASIDS(_rp) + xp_nasid_mask_words) +#define XPC_RP_VARS_PART(_rp) (struct xpc_vars_part *) ((u8 *) XPC_RP_VARS(rp) + XPC_RP_VARS_SIZE) + /* * Functions registered by add_timer() or called by kernel_thread() only @@ -349,6 +454,9 @@ struct xpc_channel { atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */ wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */ + u8 delayed_IPI_flags; /* IPI flags received, but delayed */ + /* action until channel disconnected */ + /* queue of msg senders who want to be notified when msg received */ atomic_t n_to_notify; /* #of msg senders to notify */ @@ -358,7 +466,7 @@ struct xpc_channel { void *key; /* pointer to user's key */ struct semaphore msg_to_pull_sema; /* next msg to pull serialization */ - struct semaphore teardown_sema; /* wait for teardown completion */ + struct semaphore wdisconnect_sema; /* wait for channel disconnect */ struct xpc_openclose_args *local_openclose_args; /* args passed on */ /* opening or closing of channel */ @@ -410,6 +518,8 @@ struct xpc_channel { #define XPC_C_DISCONNECTED 0x00002000 /* channel is disconnected */ #define XPC_C_DISCONNECTING 0x00004000 /* channel is being disconnected */ +#define XPC_C_DISCONNECTCALLOUT 0x00008000 /* chan disconnected callout made */ +#define XPC_C_WDISCONNECT 0x00010000 /* waiting for channel disconnect */ @@ -422,6 +532,8 @@ struct xpc_partition { /* XPC HB infrastructure */ + u8 remote_rp_version; /* version# of partition's 
rsvd pg */ + struct timespec remote_rp_stamp;/* time when rsvd pg was initialized */ u64 remote_rp_pa; /* phys addr of partition's rsvd pg */ u64 remote_vars_pa; /* phys addr of partition's vars */ u64 remote_vars_part_pa; /* phys addr of partition's vars part */ @@ -432,14 +544,18 @@ struct xpc_partition { u32 act_IRQ_rcvd; /* IRQs since activation */ spinlock_t act_lock; /* protect updating of act_state */ u8 act_state; /* from XPC HB viewpoint */ + u8 remote_vars_version; /* version# of partition's vars */ enum xpc_retval reason; /* reason partition is deactivating */ int reason_line; /* line# deactivation initiated from */ int reactivate_nasid; /* nasid in partition to reactivate */ + unsigned long disengage_request_timeout; /* timeout in jiffies */ + struct timer_list disengage_request_timer; + /* XPC infrastructure referencing and teardown control */ - volatile u8 setup_state; /* infrastructure setup state */ + volatile u8 setup_state; /* infrastructure setup state */ wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */ atomic_t references; /* #of references to infrastructure */ @@ -454,6 +570,7 @@ struct xpc_partition { u8 nchannels; /* #of defined channels supported */ atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */ + atomic_t nchannels_engaged;/* #of channels engaged with remote part */ struct xpc_channel *channels;/* array of channel structures */ void *local_GPs_base; /* base address of kmalloc'd space */ @@ -518,6 +635,7 @@ struct xpc_partition { #define XPC_P_TORNDOWN 0x03 /* infrastructure is torndown */ + /* * struct xpc_partition IPI_timer #of seconds to wait before checking for * dropped IPIs. 
These occur whenever an IPI amo write doesn't complete until @@ -526,6 +644,13 @@ struct xpc_partition { #define XPC_P_DROPPED_IPI_WAIT (0.25 * HZ) +/* number of seconds to wait for other partitions to disengage */ +#define XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT 90 + +/* interval in seconds to print 'waiting disengagement' messages */ +#define XPC_DISENGAGE_PRINTMSG_INTERVAL 10 + + #define XPC_PARTID(_p) ((partid_t) ((_p) - &xpc_partitions[0])) @@ -534,24 +659,20 @@ struct xpc_partition { extern struct xpc_registration xpc_registrations[]; -/* >>> found in xpc_main.c only */ +/* found in xpc_main.c */ extern struct device *xpc_part; extern struct device *xpc_chan; +extern int xpc_disengage_request_timelimit; extern irqreturn_t xpc_notify_IRQ_handler(int, void *, struct pt_regs *); extern void xpc_dropped_IPI_check(struct xpc_partition *); +extern void xpc_activate_partition(struct xpc_partition *); extern void xpc_activate_kthreads(struct xpc_channel *, int); extern void xpc_create_kthreads(struct xpc_channel *, int); extern void xpc_disconnect_wait(int); -/* found in xpc_main.c and efi-xpc.c */ -extern void xpc_activate_partition(struct xpc_partition *); - - /* found in xpc_partition.c */ extern int xpc_exiting; -extern int xpc_hb_interval; -extern int xpc_hb_check_interval; extern struct xpc_vars *xpc_vars; extern struct xpc_rsvd_page *xpc_rsvd_page; extern struct xpc_vars_part *xpc_vars_part; @@ -561,6 +682,7 @@ extern struct xpc_rsvd_page *xpc_rsvd_page_init(void); extern void xpc_allow_IPI_ops(void); extern void xpc_restrict_IPI_ops(void); extern int xpc_identify_act_IRQ_sender(void); +extern int xpc_partition_disengaged(struct xpc_partition *); extern enum xpc_retval xpc_mark_partition_active(struct xpc_partition *); extern void xpc_mark_partition_inactive(struct xpc_partition *); extern void xpc_discovery(void); @@ -585,8 +707,8 @@ extern void xpc_connected_callout(struct xpc_channel *); extern void xpc_deliver_msg(struct xpc_channel *); extern void 
xpc_disconnect_channel(const int, struct xpc_channel *, enum xpc_retval, unsigned long *); -extern void xpc_disconnected_callout(struct xpc_channel *); -extern void xpc_partition_down(struct xpc_partition *, enum xpc_retval); +extern void xpc_disconnecting_callout(struct xpc_channel *); +extern void xpc_partition_going_down(struct xpc_partition *, enum xpc_retval); extern void xpc_teardown_infrastructure(struct xpc_partition *); @@ -674,6 +796,157 @@ xpc_part_ref(struct xpc_partition *part) /* + * This next set of inlines are used to keep track of when a partition is + * potentially engaged in accessing memory belonging to another partition. + */ + +static inline void +xpc_mark_partition_engaged(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* set bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR, + (1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. 
+ */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline void +xpc_mark_partition_disengaged(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* clear bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~(1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. + */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline void +xpc_request_partition_disengage(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* set bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR, + (1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. 
+ */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline void +xpc_cancel_partition_disengage_request(struct xpc_partition *part) +{ + unsigned long irq_flags; + AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa + + (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t))); + + + local_irq_save(irq_flags); + + /* clear bit corresponding to our partid in remote partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~(1UL << sn_partition_id)); + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IPIs and AMOs to it until the heartbeat times out. + */ + (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static inline u64 +xpc_partition_engaged(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO; + + + /* return our partition's AMO variable ANDed with partid_mask */ + return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) & + partid_mask); +} + +static inline u64 +xpc_partition_disengage_requested(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO; + + + /* return our partition's AMO variable ANDed with partid_mask */ + return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) & + partid_mask); +} + +static inline void +xpc_clear_partition_engaged(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO; + + + /* clear bit(s) based on partid_mask in our partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~partid_mask); +} + +static inline void +xpc_clear_partition_disengage_request(u64 partid_mask) +{ + AMO_t *amo = xpc_vars->amos_page + 
XPC_DISENGAGE_REQUEST_AMO; + + + /* clear bit(s) based on partid_mask in our partition's AMO */ + FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND, + ~partid_mask); +} + + + +/* * The following set of macros and inlines are used for the sending and * receiving of IPIs (also known as IRQs). There are two flavors of IPIs, * one that is associated with partition activity (SGI_XPC_ACTIVATE) and @@ -722,13 +995,13 @@ xpc_IPI_send(AMO_t *amo, u64 flag, int nasid, int phys_cpuid, int vector) * Flag the appropriate AMO variable and send an IPI to the specified node. */ static inline void -xpc_activate_IRQ_send(u64 amos_page, int from_nasid, int to_nasid, +xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid, int to_phys_cpuid) { int w_index = XPC_NASID_W_INDEX(from_nasid); int b_index = XPC_NASID_B_INDEX(from_nasid); - AMO_t *amos = (AMO_t *) __va(amos_page + - (XP_MAX_PARTITIONS * sizeof(AMO_t))); + AMO_t *amos = (AMO_t *) __va(amos_page_pa + + (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t))); (void) xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid, @@ -756,6 +1029,13 @@ xpc_IPI_send_reactivate(struct xpc_partition *part) xpc_vars->act_nasid, xpc_vars->act_phys_cpuid); } +static inline void +xpc_IPI_send_disengage(struct xpc_partition *part) +{ + xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0), + part->remote_act_nasid, part->remote_act_phys_cpuid); +} + /* * IPIs associated with SGI_XPC_NOTIFY IRQ. 
@@ -836,6 +1116,7 @@ xpc_notify_IRQ_send_local(struct xpc_channel *ch, u8 ipi_flag, /* given an AMO variable and a channel#, get its associated IPI flags */ #define XPC_GET_IPI_FLAGS(_amo, _c) ((u8) (((_amo) >> ((_c) * 8)) & 0xff)) +#define XPC_SET_IPI_FLAGS(_amo, _c, _f) (_amo) |= ((u64) (_f) << ((_c) * 8)) #define XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(_amo) ((_amo) & 0x0f0f0f0f0f0f0f0f) #define XPC_ANY_MSG_IPI_FLAGS_SET(_amo) ((_amo) & 0x1010101010101010) @@ -903,17 +1184,18 @@ xpc_IPI_send_local_msgrequest(struct xpc_channel *ch) * cacheable mapping for the entire region. This will prevent speculative * reading of cached copies of our lines from being issued which will cause * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64 - * (XP_MAX_PARTITIONS) AMO variables for message notification (xpc_main.c) - * and an additional 16 AMO variables for partition activation (xpc_hb.c). + * AMO variables (based on XP_MAX_PARTITIONS) for message notification and an + * additional 128 AMO variables (based on XP_NASID_MASK_WORDS) for partition + * activation and 2 AMO variables for partition deactivation. 
*/ static inline AMO_t * -xpc_IPI_init(partid_t partid) +xpc_IPI_init(int index) { - AMO_t *part_amo = xpc_vars->amos_page + partid; + AMO_t *amo = xpc_vars->amos_page + index; - xpc_IPI_receive(part_amo); - return part_amo; + (void) xpc_IPI_receive(amo); /* clear AMO variable */ + return amo; } @@ -939,7 +1221,7 @@ xpc_map_bte_errors(bte_result_t error) static inline void * -xpc_kmalloc_cacheline_aligned(size_t size, int flags, void **base) +xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) { /* see if kmalloc will give us cachline aligned memory by default */ *base = kmalloc(size, flags); diff --git a/arch/ia64/sn/kernel/xpc_channel.c b/arch/ia64/sn/kernel/xpc_channel.c index 94698bea7be0..abf4fc2a87bb 100644 --- a/arch/ia64/sn/kernel/xpc_channel.c +++ b/arch/ia64/sn/kernel/xpc_channel.c @@ -57,6 +57,7 @@ xpc_initialize_channels(struct xpc_partition *part, partid_t partid) spin_lock_init(&ch->lock); sema_init(&ch->msg_to_pull_sema, 1); /* mutex */ + sema_init(&ch->wdisconnect_sema, 0); /* event wait */ atomic_set(&ch->n_on_msg_allocate_wq, 0); init_waitqueue_head(&ch->msg_allocate_wq); @@ -166,6 +167,7 @@ xpc_setup_infrastructure(struct xpc_partition *part) xpc_initialize_channels(part, partid); atomic_set(&part->nchannels_active, 0); + atomic_set(&part->nchannels_engaged, 0); /* local_IPI_amo were set to 0 by an earlier memset() */ @@ -555,8 +557,6 @@ xpc_allocate_msgqueues(struct xpc_channel *ch) sema_init(&ch->notify_queue[i].sema, 0); } - sema_init(&ch->teardown_sema, 0); /* event wait */ - spin_lock_irqsave(&ch->lock, irq_flags); ch->flags |= XPC_C_SETUP; spin_unlock_irqrestore(&ch->lock, irq_flags); @@ -626,6 +626,55 @@ xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags) /* + * Notify those who wanted to be notified upon delivery of their message. 
+ */ +static void +xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put) +{ + struct xpc_notify *notify; + u8 notify_type; + s64 get = ch->w_remote_GP.get - 1; + + + while (++get < put && atomic_read(&ch->n_to_notify) > 0) { + + notify = &ch->notify_queue[get % ch->local_nentries]; + + /* + * See if the notify entry indicates it was associated with + * a message who's sender wants to be notified. It is possible + * that it is, but someone else is doing or has done the + * notification. + */ + notify_type = notify->type; + if (notify_type == 0 || + cmpxchg(&notify->type, notify_type, 0) != + notify_type) { + continue; + } + + DBUG_ON(notify_type != XPC_N_CALL); + + atomic_dec(&ch->n_to_notify); + + if (notify->func != NULL) { + dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, " + "msg_number=%ld, partid=%d, channel=%d\n", + (void *) notify, get, ch->partid, ch->number); + + notify->func(reason, ch->partid, ch->number, + notify->key); + + dev_dbg(xpc_chan, "notify->func() returned, " + "notify=0x%p, msg_number=%ld, partid=%d, " + "channel=%d\n", (void *) notify, get, + ch->partid, ch->number); + } + } +} + + +/* * Free up message queues and other stuff that were allocated for the specified * channel.
* @@ -669,9 +718,6 @@ xpc_free_msgqueues(struct xpc_channel *ch) ch->remote_msgqueue = NULL; kfree(ch->notify_queue); ch->notify_queue = NULL; - - /* in case someone is waiting for the teardown to complete */ - up(&ch->teardown_sema); } } @@ -683,7 +729,7 @@ static void xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags) { struct xpc_partition *part = &xpc_partitions[ch->partid]; - u32 ch_flags = ch->flags; + u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED); DBUG_ON(!spin_is_locked(&ch->lock)); @@ -701,12 +747,13 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags) } DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0); - /* it's now safe to free the channel's message queues */ - - xpc_free_msgqueues(ch); - DBUG_ON(ch->flags & XPC_C_SETUP); + if (part->act_state == XPC_P_DEACTIVATING) { + /* can't proceed until the other side disengages from us */ + if (xpc_partition_engaged(1UL << ch->partid)) { + return; + } - if (part->act_state != XPC_P_DEACTIVATING) { + } else { /* as long as the other side is up do the full protocol */ @@ -724,16 +771,42 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags) } } + /* wake those waiting for notify completion */ + if (atomic_read(&ch->n_to_notify) > 0) { + /* >>> we do callout while holding ch->lock */ + xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put); + } + /* both sides are disconnected now */ - ch->flags = XPC_C_DISCONNECTED; /* clear all flags, but this one */ + /* it's now safe to free the channel's message queues */ + xpc_free_msgqueues(ch); + + /* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */ + ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT)); atomic_dec(&part->nchannels_active); - if (ch_flags & XPC_C_WASCONNECTED) { + if (channel_was_connected) { dev_info(xpc_chan, "channel %d to partition %d disconnected, " "reason=%d\n", ch->number, ch->partid, ch->reason); } + + if (ch->flags & XPC_C_WDISCONNECT) 
{ + spin_unlock_irqrestore(&ch->lock, *irq_flags); + up(&ch->wdisconnect_sema); + spin_lock_irqsave(&ch->lock, *irq_flags); + + } else if (ch->delayed_IPI_flags) { + if (part->act_state != XPC_P_DEACTIVATING) { + /* time to take action on any delayed IPI flags */ + spin_lock(&part->IPI_lock); + XPC_SET_IPI_FLAGS(part->local_IPI_amo, ch->number, + ch->delayed_IPI_flags); + spin_unlock(&part->IPI_lock); + } + ch->delayed_IPI_flags = 0; + } } @@ -754,6 +827,19 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, spin_lock_irqsave(&ch->lock, irq_flags); +again: + + if ((ch->flags & XPC_C_DISCONNECTED) && + (ch->flags & XPC_C_WDISCONNECT)) { + /* + * Delay processing IPI flags until thread waiting disconnect + * has had a chance to see that the channel is disconnected. + */ + ch->delayed_IPI_flags |= IPI_flags; + spin_unlock_irqrestore(&ch->lock, irq_flags); + return; + } + if (IPI_flags & XPC_IPI_CLOSEREQUEST) { @@ -764,7 +850,7 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, /* * If RCLOSEREQUEST is set, we're probably waiting for * RCLOSEREPLY. We should find it and a ROPENREQUEST packed - * with this RCLOSEQREUQEST in the IPI_flags. + * with this RCLOSEREQUEST in the IPI_flags. 
*/ if (ch->flags & XPC_C_RCLOSEREQUEST) { @@ -779,14 +865,22 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, /* both sides have finished disconnecting */ xpc_process_disconnect(ch, &irq_flags); + DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED)); + goto again; } if (ch->flags & XPC_C_DISCONNECTED) { - // >>> explain this section - if (!(IPI_flags & XPC_IPI_OPENREQUEST)) { - DBUG_ON(part->act_state != - XPC_P_DEACTIVATING); + if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, + ch_number) & XPC_IPI_OPENREQUEST)) { + + DBUG_ON(ch->delayed_IPI_flags != 0); + spin_lock(&part->IPI_lock); + XPC_SET_IPI_FLAGS(part->local_IPI_amo, + ch_number, + XPC_IPI_CLOSEREQUEST); + spin_unlock(&part->IPI_lock); + } spin_unlock_irqrestore(&ch->lock, irq_flags); return; } @@ -816,9 +910,13 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, } XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags); - } else { - xpc_process_disconnect(ch, &irq_flags); + + DBUG_ON(IPI_flags & XPC_IPI_CLOSEREPLY); + spin_unlock_irqrestore(&ch->lock, irq_flags); + return; } + + xpc_process_disconnect(ch, &irq_flags); } @@ -834,7 +932,20 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, } DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST)); - DBUG_ON(!(ch->flags & XPC_C_RCLOSEREQUEST)); + + if (!(ch->flags & XPC_C_RCLOSEREQUEST)) { + if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, ch_number) + & XPC_IPI_CLOSEREQUEST)) { + + DBUG_ON(ch->delayed_IPI_flags != 0); + spin_lock(&part->IPI_lock); + XPC_SET_IPI_FLAGS(part->local_IPI_amo, + ch_number, XPC_IPI_CLOSEREPLY); + spin_unlock(&part->IPI_lock); + } + spin_unlock_irqrestore(&ch->lock, irq_flags); + return; + } ch->flags |= XPC_C_RCLOSEREPLY; @@ -852,8 +963,14 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, "channel=%d\n", args->msg_size, args->local_nentries, ch->partid, ch->number); - if ((ch->flags & XPC_C_DISCONNECTING) || - part->act_state == XPC_P_DEACTIVATING) { + if (part->act_state == 
XPC_P_DEACTIVATING || + (ch->flags & XPC_C_ROPENREQUEST)) { + spin_unlock_irqrestore(&ch->lock, irq_flags); + return; + } + + if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) { + ch->delayed_IPI_flags |= XPC_IPI_OPENREQUEST; spin_unlock_irqrestore(&ch->lock, irq_flags); return; } @@ -867,8 +984,11 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, * msg_size = size of channel's messages in bytes * local_nentries = remote partition's local_nentries */ - DBUG_ON(args->msg_size == 0); - DBUG_ON(args->local_nentries == 0); + if (args->msg_size == 0 || args->local_nentries == 0) { + /* assume OPENREQUEST was delayed by mistake */ + spin_unlock_irqrestore(&ch->lock, irq_flags); + return; + } ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING); ch->remote_nentries = args->local_nentries; @@ -906,7 +1026,13 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, spin_unlock_irqrestore(&ch->lock, irq_flags); return; } - DBUG_ON(!(ch->flags & XPC_C_OPENREQUEST)); + if (!(ch->flags & XPC_C_OPENREQUEST)) { + XPC_DISCONNECT_CHANNEL(ch, xpcOpenCloseError, + &irq_flags); + spin_unlock_irqrestore(&ch->lock, irq_flags); + return; + } + DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST)); DBUG_ON(ch->flags & XPC_C_CONNECTED); @@ -960,8 +1086,8 @@ xpc_connect_channel(struct xpc_channel *ch) struct xpc_registration *registration = &xpc_registrations[ch->number]; - if (down_interruptible(&registration->sema) != 0) { - return xpcInterrupted; + if (down_trylock(&registration->sema) != 0) { + return xpcRetry; } if (!XPC_CHANNEL_REGISTERED(ch->number)) { @@ -1040,55 +1166,6 @@ xpc_connect_channel(struct xpc_channel *ch) /* - * Notify those who wanted to be notified upon delivery of their message.
- */ -static void -xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put) -{ - struct xpc_notify *notify; - u8 notify_type; - s64 get = ch->w_remote_GP.get - 1; - - - while (++get < put && atomic_read(&ch->n_to_notify) > 0) { - - notify = &ch->notify_queue[get % ch->local_nentries]; - - /* - * See if the notify entry indicates it was associated with - * a message who's sender wants to be notified. It is possible - * that it is, but someone else is doing or has done the - * notification. - */ - notify_type = notify->type; - if (notify_type == 0 || - cmpxchg(&notify->type, notify_type, 0) != - notify_type) { - continue; - } - - DBUG_ON(notify_type != XPC_N_CALL); - - atomic_dec(&ch->n_to_notify); - - if (notify->func != NULL) { - dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, " - "msg_number=%ld, partid=%d, channel=%d\n", - (void *) notify, get, ch->partid, ch->number); - - notify->func(reason, ch->partid, ch->number, - notify->key); - - dev_dbg(xpc_chan, "notify->func() returned, " - "notify=0x%p, msg_number=%ld, partid=%d, " - "channel=%d\n", (void *) notify, get, - ch->partid, ch->number); - } - } -} - - -/* * Clear some of the msg flags in the local message queue.
*/ static inline void @@ -1240,6 +1317,7 @@ xpc_process_channel_activity(struct xpc_partition *part) u64 IPI_amo, IPI_flags; struct xpc_channel *ch; int ch_number; + u32 ch_flags; IPI_amo = xpc_get_IPI_flags(part); @@ -1266,8 +1344,9 @@ xpc_process_channel_activity(struct xpc_partition *part) xpc_process_openclose_IPI(part, ch_number, IPI_flags); } + ch_flags = ch->flags; /* need an atomic snapshot of flags */ - if (ch->flags & XPC_C_DISCONNECTING) { + if (ch_flags & XPC_C_DISCONNECTING) { spin_lock_irqsave(&ch->lock, irq_flags); xpc_process_disconnect(ch, &irq_flags); spin_unlock_irqrestore(&ch->lock, irq_flags); @@ -1278,9 +1357,9 @@ xpc_process_channel_activity(struct xpc_partition *part) continue; } - if (!(ch->flags & XPC_C_CONNECTED)) { - if (!(ch->flags & XPC_C_OPENREQUEST)) { - DBUG_ON(ch->flags & XPC_C_SETUP); + if (!(ch_flags & XPC_C_CONNECTED)) { + if (!(ch_flags & XPC_C_OPENREQUEST)) { + DBUG_ON(ch_flags & XPC_C_SETUP); (void) xpc_connect_channel(ch); } else { spin_lock_irqsave(&ch->lock, irq_flags); @@ -1305,8 +1384,8 @@ xpc_process_channel_activity(struct xpc_partition *part) /* - * XPC's heartbeat code calls this function to inform XPC that a partition has - * gone down. XPC responds by tearing down the XPartition Communication + * XPC's heartbeat code calls this function to inform XPC that a partition is + * going down. XPC responds by tearing down the XPartition Communication * infrastructure used for the just downed partition. * * XPC's heartbeat code will never call this function and xpc_partition_up() @@ -1314,7 +1393,7 @@ xpc_process_channel_activity(struct xpc_partition *part) * at the same time. 
*/ void -xpc_partition_down(struct xpc_partition *part, enum xpc_retval reason) +xpc_partition_going_down(struct xpc_partition *part, enum xpc_retval reason) { unsigned long irq_flags; int ch_number; @@ -1330,12 +1409,11 @@ xpc_partition_down(struct xpc_partition *part, enum xpc_retval reason) } - /* disconnect all channels associated with the downed partition */ + /* disconnect channels associated with the partition going down */ for (ch_number = 0; ch_number < part->nchannels; ch_number++) { ch = &part->channels[ch_number]; - xpc_msgqueue_ref(ch); spin_lock_irqsave(&ch->lock, irq_flags); @@ -1370,6 +1448,7 @@ xpc_teardown_infrastructure(struct xpc_partition *part) * this partition. */ + DBUG_ON(atomic_read(&part->nchannels_engaged) != 0); DBUG_ON(atomic_read(&part->nchannels_active) != 0); DBUG_ON(part->setup_state != XPC_P_SETUP); part->setup_state = XPC_P_WTEARDOWN; @@ -1428,19 +1507,11 @@ xpc_initiate_connect(int ch_number) if (xpc_part_ref(part)) { ch = &part->channels[ch_number]; - if (!(ch->flags & XPC_C_DISCONNECTING)) { - DBUG_ON(ch->flags & XPC_C_OPENREQUEST); - DBUG_ON(ch->flags & XPC_C_CONNECTED); - DBUG_ON(ch->flags & XPC_C_SETUP); - - /* - * Initiate the establishment of a connection - * on the newly registered channel to the - * remote partition. - */ - xpc_wakeup_channel_mgr(part); - } - + /* + * Initiate the establishment of a connection on the + * newly registered channel to the remote partition. 
+ */ + xpc_wakeup_channel_mgr(part); xpc_part_deref(part); } } @@ -1450,9 +1521,6 @@ xpc_initiate_connect(int ch_number) void xpc_connected_callout(struct xpc_channel *ch) { - unsigned long irq_flags; - - /* let the registerer know that a connection has been established */ if (ch->func != NULL) { @@ -1465,10 +1533,6 @@ xpc_connected_callout(struct xpc_channel *ch) dev_dbg(xpc_chan, "ch->func() returned, reason=xpcConnected, " "partid=%d, channel=%d\n", ch->partid, ch->number); } - - spin_lock_irqsave(&ch->lock, irq_flags); - ch->flags |= XPC_C_CONNECTCALLOUT; - spin_unlock_irqrestore(&ch->lock, irq_flags); } @@ -1506,8 +1570,12 @@ xpc_initiate_disconnect(int ch_number) spin_lock_irqsave(&ch->lock, irq_flags); - XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering, + if (!(ch->flags & XPC_C_DISCONNECTED)) { + ch->flags |= XPC_C_WDISCONNECT; + + XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering, &irq_flags); + } spin_unlock_irqrestore(&ch->lock, irq_flags); @@ -1523,8 +1591,9 @@ xpc_initiate_disconnect(int ch_number) /* * To disconnect a channel, and reflect it back to all who may be waiting. * - * >>> An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by - * >>> xpc_free_msgqueues(). + * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by + * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by + * xpc_disconnect_wait(). * * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN. 
*/ @@ -1532,7 +1601,7 @@ void xpc_disconnect_channel(const int line, struct xpc_channel *ch, enum xpc_retval reason, unsigned long *irq_flags) { - u32 flags; + u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED); DBUG_ON(!spin_is_locked(&ch->lock)); @@ -1547,61 +1616,53 @@ xpc_disconnect_channel(const int line, struct xpc_channel *ch, XPC_SET_REASON(ch, reason, line); - flags = ch->flags; + ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING); /* some of these may not have been set */ ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY | XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY | XPC_C_CONNECTING | XPC_C_CONNECTED); - ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING); xpc_IPI_send_closerequest(ch, irq_flags); - if (flags & XPC_C_CONNECTED) { + if (channel_was_connected) { ch->flags |= XPC_C_WASCONNECTED; } + spin_unlock_irqrestore(&ch->lock, *irq_flags); + + /* wake all idle kthreads so they can exit */ if (atomic_read(&ch->kthreads_idle) > 0) { - /* wake all idle kthreads so they can exit */ wake_up_all(&ch->idle_wq); } - spin_unlock_irqrestore(&ch->lock, *irq_flags); - - /* wake those waiting to allocate an entry from the local msg queue */ - if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) { wake_up(&ch->msg_allocate_wq); } - /* wake those waiting for notify completion */ - - if (atomic_read(&ch->n_to_notify) > 0) { - xpc_notify_senders(ch, reason, ch->w_local_GP.put); - } - spin_lock_irqsave(&ch->lock, *irq_flags); } void -xpc_disconnected_callout(struct xpc_channel *ch) +xpc_disconnecting_callout(struct xpc_channel *ch) { /* - * Let the channel's registerer know that the channel is now + * Let the channel's registerer know that the channel is being * disconnected. We don't want to do this if the registerer was never - * informed of a connection being made, unless the disconnect was for - * abnormal reasons. + * informed of a connection being made. 
*/ if (ch->func != NULL) { - dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, " - "channel=%d\n", ch->reason, ch->partid, ch->number); + dev_dbg(xpc_chan, "ch->func() called, reason=xpcDisconnecting," + " partid=%d, channel=%d\n", ch->partid, ch->number); - ch->func(ch->reason, ch->partid, ch->number, NULL, ch->key); + ch->func(xpcDisconnecting, ch->partid, ch->number, NULL, + ch->key); - dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, " - "channel=%d\n", ch->reason, ch->partid, ch->number); + dev_dbg(xpc_chan, "ch->func() returned, reason=" + "xpcDisconnecting, partid=%d, channel=%d\n", + ch->partid, ch->number); } } @@ -1848,7 +1909,7 @@ xpc_send_msg(struct xpc_channel *ch, struct xpc_msg *msg, u8 notify_type, xpc_notify_func func, void *key) { enum xpc_retval ret = xpcSuccess; - struct xpc_notify *notify = NULL; // >>> to keep the compiler happy!! + struct xpc_notify *notify = notify; s64 put, msg_number = msg->number; diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c index bb1d5cf30440..cece3c7c69be 100644 --- a/arch/ia64/sn/kernel/xpc_main.c +++ b/arch/ia64/sn/kernel/xpc_main.c @@ -54,6 +54,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/delay.h> +#include <linux/reboot.h> #include <asm/sn/intr.h> #include <asm/sn/sn_sal.h> #include <asm/uaccess.h> @@ -82,11 +83,17 @@ struct device *xpc_chan = &xpc_chan_dbg_subname; /* systune related variables for /proc/sys directories */ -static int xpc_hb_min = 1; -static int xpc_hb_max = 10; +static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL; +static int xpc_hb_min_interval = 1; +static int xpc_hb_max_interval = 10; -static int xpc_hb_check_min = 10; -static int xpc_hb_check_max = 120; +static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL; +static int xpc_hb_check_min_interval = 10; +static int xpc_hb_check_max_interval = 120; + +int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT; +static int 
xpc_disengage_request_min_timelimit = 0; +static int xpc_disengage_request_max_timelimit = 120; static ctl_table xpc_sys_xpc_hb_dir[] = { { @@ -99,7 +106,8 @@ static ctl_table xpc_sys_xpc_hb_dir[] = { &proc_dointvec_minmax, &sysctl_intvec, NULL, - &xpc_hb_min, &xpc_hb_max + &xpc_hb_min_interval, + &xpc_hb_max_interval }, { 2, @@ -111,7 +119,8 @@ static ctl_table xpc_sys_xpc_hb_dir[] = { &proc_dointvec_minmax, &sysctl_intvec, NULL, - &xpc_hb_check_min, &xpc_hb_check_max + &xpc_hb_check_min_interval, + &xpc_hb_check_max_interval }, {0} }; @@ -124,6 +133,19 @@ static ctl_table xpc_sys_xpc_dir[] = { 0555, xpc_sys_xpc_hb_dir }, + { + 2, + "disengage_request_timelimit", + &xpc_disengage_request_timelimit, + sizeof(int), + 0644, + NULL, + &proc_dointvec_minmax, + &sysctl_intvec, + NULL, + &xpc_disengage_request_min_timelimit, + &xpc_disengage_request_max_timelimit + }, {0} }; static ctl_table xpc_sys_dir[] = { @@ -148,10 +170,10 @@ static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq); static unsigned long xpc_hb_check_timeout; -/* xpc_hb_checker thread exited notification */ +/* notification that the xpc_hb_checker thread has exited */ static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited); -/* xpc_discovery thread exited notification */ +/* notification that the xpc_discovery thread has exited */ static DECLARE_MUTEX_LOCKED(xpc_discovery_exited); @@ -161,6 +183,30 @@ static struct timer_list xpc_hb_timer; static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *); +static int xpc_system_reboot(struct notifier_block *, unsigned long, void *); +static struct notifier_block xpc_reboot_notifier = { + .notifier_call = xpc_system_reboot, +}; + + +/* + * Timer function to enforce the timelimit on the partition disengage request. 
+ */ +static void +xpc_timeout_partition_disengage_request(unsigned long data) +{ + struct xpc_partition *part = (struct xpc_partition *) data; + + + DBUG_ON(jiffies < part->disengage_request_timeout); + + (void) xpc_partition_disengaged(part); + + DBUG_ON(part->disengage_request_timeout != 0); + DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0); +} + + /* * Notify the heartbeat check thread that an IRQ has been received. */ @@ -214,12 +260,6 @@ xpc_hb_checker(void *ignore) while (!(volatile int) xpc_exiting) { - /* wait for IRQ or timeout */ - (void) wait_event_interruptible(xpc_act_IRQ_wq, - (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) || - jiffies >= xpc_hb_check_timeout || - (volatile int) xpc_exiting)); - dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have " "been received\n", (int) (xpc_hb_check_timeout - jiffies), @@ -240,6 +280,7 @@ xpc_hb_checker(void *ignore) } + /* check for outstanding IRQs */ new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd); if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) { force_IRQ = 0; @@ -257,12 +298,18 @@ xpc_hb_checker(void *ignore) xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); } + + /* wait for IRQ or timeout */ + (void) wait_event_interruptible(xpc_act_IRQ_wq, + (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) || + jiffies >= xpc_hb_check_timeout || + (volatile int) xpc_exiting)); } dev_dbg(xpc_part, "heartbeat checker is exiting\n"); - /* mark this thread as inactive */ + /* mark this thread as having exited */ up(&xpc_hb_checker_exited); return 0; } @@ -282,7 +329,7 @@ xpc_initiate_discovery(void *ignore) dev_dbg(xpc_part, "discovery thread is exiting\n"); - /* mark this thread as inactive */ + /* mark this thread as having exited */ up(&xpc_discovery_exited); return 0; } @@ -309,7 +356,7 @@ xpc_make_first_contact(struct xpc_partition *part) "partition %d\n", XPC_PARTID(part)); /* wait a 1/4 of a second or so */ - msleep_interruptible(250); + (void) msleep_interruptible(250); if 
(part->act_state == XPC_P_DEACTIVATING) { return part->reason; @@ -336,7 +383,8 @@ static void xpc_channel_mgr(struct xpc_partition *part) { while (part->act_state != XPC_P_DEACTIVATING || - atomic_read(&part->nchannels_active) > 0) { + atomic_read(&part->nchannels_active) > 0 || + !xpc_partition_disengaged(part)) { xpc_process_channel_activity(part); @@ -360,7 +408,8 @@ xpc_channel_mgr(struct xpc_partition *part) (volatile u64) part->local_IPI_amo != 0 || ((volatile u8) part->act_state == XPC_P_DEACTIVATING && - atomic_read(&part->nchannels_active) == 0))); + atomic_read(&part->nchannels_active) == 0 && + xpc_partition_disengaged(part)))); atomic_set(&part->channel_mgr_requests, 1); // >>> Does it need to wakeup periodically as well? In case we @@ -482,7 +531,7 @@ xpc_activating(void *__partid) return 0; } - XPC_ALLOW_HB(partid, xpc_vars); + xpc_allow_hb(partid, xpc_vars); xpc_IPI_send_activated(part); @@ -492,6 +541,7 @@ xpc_activating(void *__partid) */ (void) xpc_partition_up(part); + xpc_disallow_hb(partid, xpc_vars); xpc_mark_partition_inactive(part); if (part->reason == xpcReactivating) { @@ -670,6 +720,7 @@ xpc_daemonize_kthread(void *args) struct xpc_partition *part = &xpc_partitions[partid]; struct xpc_channel *ch; int n_needed; + unsigned long irq_flags; daemonize("xpc%02dc%d", partid, ch_number); @@ -680,11 +731,14 @@ xpc_daemonize_kthread(void *args) ch = &part->channels[ch_number]; if (!(ch->flags & XPC_C_DISCONNECTING)) { - DBUG_ON(!(ch->flags & XPC_C_CONNECTED)); /* let registerer know that connection has been established */ - if (atomic_read(&ch->kthreads_assigned) == 1) { + spin_lock_irqsave(&ch->lock, irq_flags); + if (!(ch->flags & XPC_C_CONNECTCALLOUT)) { + ch->flags |= XPC_C_CONNECTCALLOUT; + spin_unlock_irqrestore(&ch->lock, irq_flags); + xpc_connected_callout(ch); /* @@ -699,16 +753,28 @@ xpc_daemonize_kthread(void *args) !(ch->flags & XPC_C_DISCONNECTING)) { xpc_activate_kthreads(ch, n_needed); } + } else { + 
spin_unlock_irqrestore(&ch->lock, irq_flags); } xpc_kthread_waitmsgs(part, ch); } - if (atomic_dec_return(&ch->kthreads_assigned) == 0 && - ((ch->flags & XPC_C_CONNECTCALLOUT) || - (ch->reason != xpcUnregistering && - ch->reason != xpcOtherUnregistering))) { - xpc_disconnected_callout(ch); + if (atomic_dec_return(&ch->kthreads_assigned) == 0) { + spin_lock_irqsave(&ch->lock, irq_flags); + if ((ch->flags & XPC_C_CONNECTCALLOUT) && + !(ch->flags & XPC_C_DISCONNECTCALLOUT)) { + ch->flags |= XPC_C_DISCONNECTCALLOUT; + spin_unlock_irqrestore(&ch->lock, irq_flags); + + xpc_disconnecting_callout(ch); + } else { + spin_unlock_irqrestore(&ch->lock, irq_flags); + } + if (atomic_dec_return(&part->nchannels_engaged) == 0) { + xpc_mark_partition_disengaged(part); + xpc_IPI_send_disengage(part); + } } @@ -740,12 +806,33 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed) unsigned long irq_flags; pid_t pid; u64 args = XPC_PACK_ARGS(ch->partid, ch->number); + struct xpc_partition *part = &xpc_partitions[ch->partid]; while (needed-- > 0) { + + /* + * The following is done on behalf of the newly created + * kthread. That kthread is responsible for doing the + * counterpart to the following before it exits. + */ + (void) xpc_part_ref(part); + xpc_msgqueue_ref(ch); + if (atomic_inc_return(&ch->kthreads_assigned) == 1 && + atomic_inc_return(&part->nchannels_engaged) == 1) { + xpc_mark_partition_engaged(part); + } + pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0); if (pid < 0) { /* the fork failed */ + if (atomic_dec_return(&ch->kthreads_assigned) == 0 && + atomic_dec_return(&part->nchannels_engaged) == 0) { + xpc_mark_partition_disengaged(part); + xpc_IPI_send_disengage(part); + } + xpc_msgqueue_deref(ch); + xpc_part_deref(part); if (atomic_read(&ch->kthreads_assigned) < ch->kthreads_idle_limit) { @@ -765,14 +852,6 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed) break; } - /* - * The following is done on behalf of the newly created - * kthread. 
That kthread is responsible for doing the - * counterpart to the following before it exits. - */ - (void) xpc_part_ref(&xpc_partitions[ch->partid]); - xpc_msgqueue_ref(ch); - atomic_inc(&ch->kthreads_assigned); ch->kthreads_created++; // >>> temporary debug only!!! } } @@ -781,88 +860,143 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed) void xpc_disconnect_wait(int ch_number) { + unsigned long irq_flags; partid_t partid; struct xpc_partition *part; struct xpc_channel *ch; + int wakeup_channel_mgr; /* now wait for all callouts to the caller's function to cease */ for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { part = &xpc_partitions[partid]; - if (xpc_part_ref(part)) { - ch = &part->channels[ch_number]; + if (!xpc_part_ref(part)) { + continue; + } -// >>> how do we keep from falling into the window between our check and going -// >>> down and coming back up where sema is re-inited? - if (ch->flags & XPC_C_SETUP) { - (void) down(&ch->teardown_sema); - } + ch = &part->channels[ch_number]; + if (!(ch->flags & XPC_C_WDISCONNECT)) { xpc_part_deref(part); + continue; } + + (void) down(&ch->wdisconnect_sema); + + spin_lock_irqsave(&ch->lock, irq_flags); + DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED)); + wakeup_channel_mgr = 0; + + if (ch->delayed_IPI_flags) { + if (part->act_state != XPC_P_DEACTIVATING) { + spin_lock(&part->IPI_lock); + XPC_SET_IPI_FLAGS(part->local_IPI_amo, + ch->number, ch->delayed_IPI_flags); + spin_unlock(&part->IPI_lock); + wakeup_channel_mgr = 1; + } + ch->delayed_IPI_flags = 0; + } + + ch->flags &= ~XPC_C_WDISCONNECT; + spin_unlock_irqrestore(&ch->lock, irq_flags); + + if (wakeup_channel_mgr) { + xpc_wakeup_channel_mgr(part); + } + + xpc_part_deref(part); } } static void -xpc_do_exit(void) +xpc_do_exit(enum xpc_retval reason) { partid_t partid; int active_part_count; struct xpc_partition *part; + unsigned long printmsg_time; - /* now it's time to eliminate our heartbeat */ - del_timer_sync(&xpc_hb_timer); - 
xpc_vars->heartbeating_to_mask = 0; - - /* indicate to others that our reserved page is uninitialized */ - xpc_rsvd_page->vars_pa = 0; - - /* - * Ignore all incoming interrupts. Without interupts the heartbeat - * checker won't activate any new partitions that may come up. - */ - free_irq(SGI_XPC_ACTIVATE, NULL); + /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */ + DBUG_ON(xpc_exiting == 1); /* - * Cause the heartbeat checker and the discovery threads to exit. - * We don't want them attempting to activate new partitions as we - * try to deactivate the existing ones. + * Let the heartbeat checker thread and the discovery thread + * (if one is running) know that they should exit. Also wake up + * the heartbeat checker thread in case it's sleeping. */ xpc_exiting = 1; wake_up_interruptible(&xpc_act_IRQ_wq); - /* wait for the heartbeat checker thread to mark itself inactive */ - down(&xpc_hb_checker_exited); + /* ignore all incoming interrupts */ + free_irq(SGI_XPC_ACTIVATE, NULL); - /* wait for the discovery thread to mark itself inactive */ + /* wait for the discovery thread to exit */ down(&xpc_discovery_exited); + /* wait for the heartbeat checker thread to exit */ + down(&xpc_hb_checker_exited); + - msleep_interruptible(300); + /* sleep for a 1/3 of a second or so */ + (void) msleep_interruptible(300); /* wait for all partitions to become inactive */ + printmsg_time = jiffies; + do { active_part_count = 0; for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { part = &xpc_partitions[partid]; - if (part->act_state != XPC_P_INACTIVE) { - active_part_count++; - XPC_DEACTIVATE_PARTITION(part, xpcUnloading); + if (xpc_partition_disengaged(part) && + part->act_state == XPC_P_INACTIVE) { + continue; } + + active_part_count++; + + XPC_DEACTIVATE_PARTITION(part, reason); + } + + if (active_part_count == 0) { + break; + } + + if (jiffies >= printmsg_time) { + dev_info(xpc_part, "waiting for partitions to " + "deactivate/disengage, active count=%d, 
remote " + "engaged=0x%lx\n", active_part_count, + xpc_partition_engaged(1UL << partid)); + + printmsg_time = jiffies + + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ); } - if (active_part_count) - msleep_interruptible(300); - } while (active_part_count > 0); + /* sleep for a 1/3 of a second or so */ + (void) msleep_interruptible(300); + + } while (1); + + DBUG_ON(xpc_partition_engaged(-1UL)); + /* indicate to others that our reserved page is uninitialized */ + xpc_rsvd_page->vars_pa = 0; + + /* now it's time to eliminate our heartbeat */ + del_timer_sync(&xpc_hb_timer); + DBUG_ON(xpc_vars->heartbeating_to_mask != 0); + + /* take ourselves off of the reboot_notifier_list */ + (void) unregister_reboot_notifier(&xpc_reboot_notifier); + /* close down protections for IPI operations */ xpc_restrict_IPI_ops(); @@ -876,6 +1010,34 @@ xpc_do_exit(void) } +/* + * This function is called when the system is being rebooted. + */ +static int +xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused) +{ + enum xpc_retval reason; + + + switch (event) { + case SYS_RESTART: + reason = xpcSystemReboot; + break; + case SYS_HALT: + reason = xpcSystemHalt; + break; + case SYS_POWER_OFF: + reason = xpcSystemPoweroff; + break; + default: + reason = xpcSystemGoingDown; + } + + xpc_do_exit(reason); + return NOTIFY_DONE; +} + + int __init xpc_init(void) { @@ -885,13 +1047,17 @@ xpc_init(void) pid_t pid; + if (!ia64_platform_is("sn2")) { + return -ENODEV; + } + /* * xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng - * both a partition's reserved page and its XPC variables. Its size was - * based on the size of a reserved page. So we need to ensure that the - * XPC variables will fit as well. + * various portions of a partition's reserved page. Its size is based + * on the size of the reserved page header and part_nasids mask. So we + * need to ensure that the other items will fit as well. 
*/ - if (XPC_VARS_ALIGNED_SIZE > XPC_RSVD_PAGE_ALIGNED_SIZE) { + if (XPC_RP_VARS_SIZE > XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES) { dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n"); return -EPERM; } @@ -920,6 +1086,12 @@ xpc_init(void) spin_lock_init(&part->act_lock); part->act_state = XPC_P_INACTIVE; XPC_SET_REASON(part, 0, 0); + + init_timer(&part->disengage_request_timer); + part->disengage_request_timer.function = + xpc_timeout_partition_disengage_request; + part->disengage_request_timer.data = (unsigned long) part; + part->setup_state = XPC_P_UNSET; init_waitqueue_head(&part->teardown_wq); atomic_set(&part->references, 0); @@ -976,6 +1148,13 @@ xpc_init(void) } + /* add ourselves to the reboot_notifier_list */ + ret = register_reboot_notifier(&xpc_reboot_notifier); + if (ret != 0) { + dev_warn(xpc_part, "can't register reboot notifier\n"); + } + + /* * Set the beating to other partitions into motion. This is * the last requirement for other partitions' discovery to @@ -997,6 +1176,9 @@ xpc_init(void) /* indicate to others that our reserved page is uninitialized */ xpc_rsvd_page->vars_pa = 0; + /* take ourselves off of the reboot_notifier_list */ + (void) unregister_reboot_notifier(&xpc_reboot_notifier); + del_timer_sync(&xpc_hb_timer); free_irq(SGI_XPC_ACTIVATE, NULL); xpc_restrict_IPI_ops(); @@ -1020,7 +1202,7 @@ xpc_init(void) /* mark this new thread as a non-starter */ up(&xpc_discovery_exited); - xpc_do_exit(); + xpc_do_exit(xpcUnloading); return -EBUSY; } @@ -1039,7 +1221,7 @@ module_init(xpc_init); void __exit xpc_exit(void) { - xpc_do_exit(); + xpc_do_exit(xpcUnloading); } module_exit(xpc_exit); @@ -1056,3 +1238,7 @@ module_param(xpc_hb_check_interval, int, 0); MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between " "heartbeat checks."); +module_param(xpc_disengage_request_timelimit, int, 0); +MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait " + "for disengage request to complete."); + diff --git 
a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c index 578265ea9e67..581e113d2d37 100644 --- a/arch/ia64/sn/kernel/xpc_partition.c +++ b/arch/ia64/sn/kernel/xpc_partition.c @@ -44,16 +44,19 @@ static u64 xpc_sh2_IPI_access3; /* original protection values for each node */ -u64 xpc_prot_vec[MAX_COMPACT_NODES]; +u64 xpc_prot_vec[MAX_NUMNODES]; -/* this partition's reserved page */ +/* this partition's reserved page pointers */ struct xpc_rsvd_page *xpc_rsvd_page; - -/* this partition's XPC variables (within the reserved page) */ +static u64 *xpc_part_nasids; +static u64 *xpc_mach_nasids; struct xpc_vars *xpc_vars; struct xpc_vars_part *xpc_vars_part; +static int xp_nasid_mask_bytes; /* actual size in bytes of nasid mask */ +static int xp_nasid_mask_words; /* actual size in words of nasid mask */ + /* * For performance reasons, each entry of xpc_partitions[] is cacheline @@ -65,20 +68,16 @@ struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1]; /* - * Generic buffer used to store a local copy of the remote partitions - * reserved page or XPC variables. + * Generic buffer used to store a local copy of portions of a remote + * partition's reserved page (either its header and part_nasids mask, + * or its vars). * * xpc_discovery runs only once and is a seperate thread that is * very likely going to be processing in parallel with receiving * interrupts. */ -char ____cacheline_aligned - xpc_remote_copy_buffer[XPC_RSVD_PAGE_ALIGNED_SIZE]; - - -/* systune related variables */ -int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL; -int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_TIMEOUT; +char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE + + XP_NASID_MASK_BYTES]; /* @@ -86,13 +85,16 @@ int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_TIMEOUT; * for that nasid. This function returns 0 on any error. 
*/ static u64 -xpc_get_rsvd_page_pa(int nasid, u64 buf, u64 buf_size) +xpc_get_rsvd_page_pa(int nasid) { bte_result_t bte_res; s64 status; u64 cookie = 0; u64 rp_pa = nasid; /* seed with nasid */ u64 len = 0; + u64 buf = buf; + u64 buf_len = 0; + void *buf_base = NULL; while (1) { @@ -108,13 +110,22 @@ xpc_get_rsvd_page_pa(int nasid, u64 buf, u64 buf_size) break; } - if (len > buf_size) { - dev_err(xpc_part, "len (=0x%016lx) > buf_size\n", len); - status = SALRET_ERROR; - break; + if (L1_CACHE_ALIGN(len) > buf_len) { + if (buf_base != NULL) { + kfree(buf_base); + } + buf_len = L1_CACHE_ALIGN(len); + buf = (u64) xpc_kmalloc_cacheline_aligned(buf_len, + GFP_KERNEL, &buf_base); + if (buf_base == NULL) { + dev_err(xpc_part, "unable to kmalloc " + "len=0x%016lx\n", buf_len); + status = SALRET_ERROR; + break; + } } - bte_res = xp_bte_copy(rp_pa, ia64_tpa(buf), buf_size, + bte_res = xp_bte_copy(rp_pa, ia64_tpa(buf), buf_len, (BTE_NOTIFY | BTE_WACQUIRE), NULL); if (bte_res != BTE_SUCCESS) { dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res); @@ -123,6 +134,10 @@ xpc_get_rsvd_page_pa(int nasid, u64 buf, u64 buf_size) } } + if (buf_base != NULL) { + kfree(buf_base); + } + if (status != SALRET_OK) { rp_pa = 0; } @@ -141,15 +156,15 @@ xpc_rsvd_page_init(void) { struct xpc_rsvd_page *rp; AMO_t *amos_page; - u64 rp_pa, next_cl, nasid_array = 0; + u64 rp_pa, nasid_array = 0; int i, ret; /* get the local reserved page's address */ - rp_pa = xpc_get_rsvd_page_pa(cnodeid_to_nasid(0), - (u64) xpc_remote_copy_buffer, - XPC_RSVD_PAGE_ALIGNED_SIZE); + preempt_disable(); + rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id())); + preempt_enable(); if (rp_pa == 0) { dev_err(xpc_part, "SAL failed to locate the reserved page\n"); return NULL; @@ -164,12 +179,19 @@ xpc_rsvd_page_init(void) rp->version = XPC_RP_VERSION; - /* - * Place the XPC variables on the cache line following the - * reserved page structure. 
- */ - next_cl = (u64) rp + XPC_RSVD_PAGE_ALIGNED_SIZE; - xpc_vars = (struct xpc_vars *) next_cl; + /* establish the actual sizes of the nasid masks */ + if (rp->SAL_version == 1) { + /* SAL_version 1 didn't set the nasids_size field */ + rp->nasids_size = 128; + } + xp_nasid_mask_bytes = rp->nasids_size; + xp_nasid_mask_words = xp_nasid_mask_bytes / 8; + + /* setup the pointers to the various items in the reserved page */ + xpc_part_nasids = XPC_RP_PART_NASIDS(rp); + xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp); + xpc_vars = XPC_RP_VARS(rp); + xpc_vars_part = XPC_RP_VARS_PART(rp); /* * Before clearing xpc_vars, see if a page of AMOs had been previously @@ -221,33 +243,32 @@ xpc_rsvd_page_init(void) amos_page = (AMO_t *) TO_AMO((u64) amos_page); } + /* clear xpc_vars */ memset(xpc_vars, 0, sizeof(struct xpc_vars)); - /* - * Place the XPC per partition specific variables on the cache line - * following the XPC variables structure. - */ - next_cl += XPC_VARS_ALIGNED_SIZE; - memset((u64 *) next_cl, 0, sizeof(struct xpc_vars_part) * - XP_MAX_PARTITIONS); - xpc_vars_part = (struct xpc_vars_part *) next_cl; - xpc_vars->vars_part_pa = __pa(next_cl); - xpc_vars->version = XPC_V_VERSION; xpc_vars->act_nasid = cpuid_to_nasid(0); xpc_vars->act_phys_cpuid = cpu_physical_id(0); + xpc_vars->vars_part_pa = __pa(xpc_vars_part); + xpc_vars->amos_page_pa = ia64_tpa((u64) amos_page); xpc_vars->amos_page = amos_page; /* save for next load of XPC */ - /* - * Initialize the activation related AMO variables. 
- */ - xpc_vars->act_amos = xpc_IPI_init(XP_MAX_PARTITIONS); - for (i = 1; i < XP_NASID_MASK_WORDS; i++) { - xpc_IPI_init(i + XP_MAX_PARTITIONS); + /* clear xpc_vars_part */ + memset((u64 *) xpc_vars_part, 0, sizeof(struct xpc_vars_part) * + XP_MAX_PARTITIONS); + + /* initialize the activate IRQ related AMO variables */ + for (i = 0; i < xp_nasid_mask_words; i++) { + (void) xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i); } - /* export AMO page's physical address to other partitions */ - xpc_vars->amos_page_pa = ia64_tpa((u64) xpc_vars->amos_page); + + /* initialize the engaged remote partitions related AMO variables */ + (void) xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO); + (void) xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO); + + /* timestamp of when reserved page was setup by XPC */ + rp->stamp = CURRENT_TIME; /* * This signifies to the remote partition that our reserved @@ -387,6 +408,11 @@ xpc_check_remote_hb(void) remote_vars = (struct xpc_vars *) xpc_remote_copy_buffer; for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { + + if (xpc_exiting) { + break; + } + if (partid == sn_partition_id) { continue; } @@ -401,7 +427,7 @@ xpc_check_remote_hb(void) /* pull the remote_hb cache line */ bres = xp_bte_copy(part->remote_vars_pa, ia64_tpa((u64) remote_vars), - XPC_VARS_ALIGNED_SIZE, + XPC_RP_VARS_SIZE, (BTE_NOTIFY | BTE_WACQUIRE), NULL); if (bres != BTE_SUCCESS) { XPC_DEACTIVATE_PARTITION(part, @@ -417,7 +443,7 @@ xpc_check_remote_hb(void) if (((remote_vars->heartbeat == part->last_heartbeat) && (remote_vars->kdb_status == 0)) || - !XPC_HB_ALLOWED(sn_partition_id, remote_vars)) { + !xpc_hb_allowed(sn_partition_id, remote_vars)) { XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat); continue; @@ -429,31 +455,31 @@ xpc_check_remote_hb(void) /* - * Get a copy of the remote partition's rsvd page. + * Get a copy of a portion of the remote partition's rsvd page. 
* * remote_rp points to a buffer that is cacheline aligned for BTE copies and - * assumed to be of size XPC_RSVD_PAGE_ALIGNED_SIZE. + * is large enough to contain a copy of their reserved page header and + * part_nasids mask. */ static enum xpc_retval xpc_get_remote_rp(int nasid, u64 *discovered_nasids, - struct xpc_rsvd_page *remote_rp, u64 *remote_rsvd_page_pa) + struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa) { int bres, i; /* get the reserved page's physical address */ - *remote_rsvd_page_pa = xpc_get_rsvd_page_pa(nasid, (u64) remote_rp, - XPC_RSVD_PAGE_ALIGNED_SIZE); - if (*remote_rsvd_page_pa == 0) { + *remote_rp_pa = xpc_get_rsvd_page_pa(nasid); + if (*remote_rp_pa == 0) { return xpcNoRsvdPageAddr; } - /* pull over the reserved page structure */ + /* pull over the reserved page header and part_nasids mask */ - bres = xp_bte_copy(*remote_rsvd_page_pa, ia64_tpa((u64) remote_rp), - XPC_RSVD_PAGE_ALIGNED_SIZE, + bres = xp_bte_copy(*remote_rp_pa, ia64_tpa((u64) remote_rp), + XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE), NULL); if (bres != BTE_SUCCESS) { return xpc_map_bte_errors(bres); @@ -461,8 +487,11 @@ xpc_get_remote_rp(int nasid, u64 *discovered_nasids, if (discovered_nasids != NULL) { - for (i = 0; i < XP_NASID_MASK_WORDS; i++) { - discovered_nasids[i] |= remote_rp->part_nasids[i]; + u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp); + + + for (i = 0; i < xp_nasid_mask_words; i++) { + discovered_nasids[i] |= remote_part_nasids[i]; } } @@ -489,10 +518,10 @@ xpc_get_remote_rp(int nasid, u64 *discovered_nasids, /* - * Get a copy of the remote partition's XPC variables. + * Get a copy of the remote partition's XPC variables from the reserved page. * * remote_vars points to a buffer that is cacheline aligned for BTE copies and - * assumed to be of size XPC_VARS_ALIGNED_SIZE. + * assumed to be of size XPC_RP_VARS_SIZE. 
*/ static enum xpc_retval xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars) @@ -508,7 +537,7 @@ xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars) /* pull over the cross partition variables */ bres = xp_bte_copy(remote_vars_pa, ia64_tpa((u64) remote_vars), - XPC_VARS_ALIGNED_SIZE, + XPC_RP_VARS_SIZE, (BTE_NOTIFY | BTE_WACQUIRE), NULL); if (bres != BTE_SUCCESS) { return xpc_map_bte_errors(bres); @@ -524,7 +553,56 @@ xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars) /* - * Prior code has determine the nasid which generated an IPI. Inspect + * Update the remote partition's info. + */ +static void +xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version, + struct timespec *remote_rp_stamp, u64 remote_rp_pa, + u64 remote_vars_pa, struct xpc_vars *remote_vars) +{ + part->remote_rp_version = remote_rp_version; + dev_dbg(xpc_part, " remote_rp_version = 0x%016lx\n", + part->remote_rp_version); + + part->remote_rp_stamp = *remote_rp_stamp; + dev_dbg(xpc_part, " remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n", + part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec); + + part->remote_rp_pa = remote_rp_pa; + dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa); + + part->remote_vars_pa = remote_vars_pa; + dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n", + part->remote_vars_pa); + + part->last_heartbeat = remote_vars->heartbeat; + dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n", + part->last_heartbeat); + + part->remote_vars_part_pa = remote_vars->vars_part_pa; + dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n", + part->remote_vars_part_pa); + + part->remote_act_nasid = remote_vars->act_nasid; + dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n", + part->remote_act_nasid); + + part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid; + dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n", + part->remote_act_phys_cpuid); + + part->remote_amos_page_pa = 
remote_vars->amos_page_pa; + dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n", + part->remote_amos_page_pa); + + part->remote_vars_version = remote_vars->version; + dev_dbg(xpc_part, " remote_vars_version = 0x%x\n", + part->remote_vars_version); +} + + +/* + * Prior code has determined the nasid which generated an IPI. Inspect * that nasid to determine if its partition needs to be activated or * deactivated. * @@ -542,8 +620,12 @@ xpc_identify_act_IRQ_req(int nasid) { struct xpc_rsvd_page *remote_rp; struct xpc_vars *remote_vars; - u64 remote_rsvd_page_pa; + u64 remote_rp_pa; u64 remote_vars_pa; + int remote_rp_version; + int reactivate = 0; + int stamp_diff; + struct timespec remote_rp_stamp = { 0, 0 }; partid_t partid; struct xpc_partition *part; enum xpc_retval ret; @@ -553,7 +635,7 @@ xpc_identify_act_IRQ_req(int nasid) remote_rp = (struct xpc_rsvd_page *) xpc_remote_copy_buffer; - ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rsvd_page_pa); + ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa); if (ret != xpcSuccess) { dev_warn(xpc_part, "unable to get reserved page from nasid %d, " "which sent interrupt, reason=%d\n", nasid, ret); @@ -561,6 +643,10 @@ xpc_identify_act_IRQ_req(int nasid) } remote_vars_pa = remote_rp->vars_pa; + remote_rp_version = remote_rp->version; + if (XPC_SUPPORTS_RP_STAMP(remote_rp_version)) { + remote_rp_stamp = remote_rp->stamp; + } partid = remote_rp->partid; part = &xpc_partitions[partid]; @@ -586,44 +672,117 @@ xpc_identify_act_IRQ_req(int nasid) "%ld:0x%lx\n", (int) nasid, (int) partid, part->act_IRQ_rcvd, remote_vars->heartbeat, remote_vars->heartbeating_to_mask); + if (xpc_partition_disengaged(part) && + part->act_state == XPC_P_INACTIVE) { - if (part->act_state == XPC_P_INACTIVE) { + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); - part->remote_rp_pa = remote_rsvd_page_pa; - dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", - 
part->remote_rp_pa); + if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) { + if (xpc_partition_disengage_requested(1UL << partid)) { + /* + * Other side is waiting on us to disengage, + * even though we already have. + */ + return; + } + } else { + /* other side doesn't support disengage requests */ + xpc_clear_partition_disengage_request(1UL << partid); + } - part->remote_vars_pa = remote_vars_pa; - dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n", - part->remote_vars_pa); + xpc_activate_partition(part); + return; + } - part->last_heartbeat = remote_vars->heartbeat; - dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n", - part->last_heartbeat); + DBUG_ON(part->remote_rp_version == 0); + DBUG_ON(part->remote_vars_version == 0); + + if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) { + DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part-> + remote_vars_version)); + + if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) { + DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars-> + version)); + /* see if the other side rebooted */ + if (part->remote_amos_page_pa == + remote_vars->amos_page_pa && + xpc_hb_allowed(sn_partition_id, + remote_vars)) { + /* doesn't look that way, so ignore the IPI */ + return; + } + } - part->remote_vars_part_pa = remote_vars->vars_part_pa; - dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n", - part->remote_vars_part_pa); + /* + * Other side rebooted and previous XPC didn't support the + * disengage request, so we don't need to do anything special. 
+ */ - part->remote_act_nasid = remote_vars->act_nasid; - dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n", - part->remote_act_nasid); + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); + part->reactivate_nasid = nasid; + XPC_DEACTIVATE_PARTITION(part, xpcReactivating); + return; + } - part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid; - dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n", - part->remote_act_phys_cpuid); + DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)); - part->remote_amos_page_pa = remote_vars->amos_page_pa; - dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n", - part->remote_amos_page_pa); + if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) { + DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version)); - xpc_activate_partition(part); + /* + * Other side rebooted and previous XPC did support the + * disengage request, but the new one doesn't. + */ + + xpc_clear_partition_engaged(1UL << partid); + xpc_clear_partition_disengage_request(1UL << partid); - } else if (part->remote_amos_page_pa != remote_vars->amos_page_pa || - !XPC_HB_ALLOWED(sn_partition_id, remote_vars)) { + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); + reactivate = 1; + + } else { + DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version)); + stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp, + &remote_rp_stamp); + if (stamp_diff != 0) { + DBUG_ON(stamp_diff >= 0); + + /* + * Other side rebooted and the previous XPC did support + * the disengage request, as does the new one. 
+ */ + + DBUG_ON(xpc_partition_engaged(1UL << partid)); + DBUG_ON(xpc_partition_disengage_requested(1UL << + partid)); + + xpc_update_partition_info(part, remote_rp_version, + &remote_rp_stamp, remote_rp_pa, + remote_vars_pa, remote_vars); + reactivate = 1; + } + } + + if (!xpc_partition_disengaged(part)) { + /* still waiting on other side to disengage from us */ + return; + } + + if (reactivate) { part->reactivate_nasid = nasid; XPC_DEACTIVATE_PARTITION(part, xpcReactivating); + + } else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) && + xpc_partition_disengage_requested(1UL << partid)) { + XPC_DEACTIVATE_PARTITION(part, xpcOtherGoingDown); } } @@ -643,14 +802,17 @@ xpc_identify_act_IRQ_sender(void) u64 nasid; /* remote nasid */ int n_IRQs_detected = 0; AMO_t *act_amos; - struct xpc_rsvd_page *rp = (struct xpc_rsvd_page *) xpc_rsvd_page; - act_amos = xpc_vars->act_amos; + act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS; /* scan through act AMO variable looking for non-zero entries */ - for (word = 0; word < XP_NASID_MASK_WORDS; word++) { + for (word = 0; word < xp_nasid_mask_words; word++) { + + if (xpc_exiting) { + break; + } nasid_mask = xpc_IPI_receive(&act_amos[word]); if (nasid_mask == 0) { @@ -668,7 +830,7 @@ xpc_identify_act_IRQ_sender(void) * remote nasid in our reserved pages machine mask. * This is used in the event of module reload. */ - rp->mach_nasids[word] |= nasid_mask; + xpc_mach_nasids[word] |= nasid_mask; /* locate the nasid(s) which sent interrupts */ @@ -688,6 +850,55 @@ xpc_identify_act_IRQ_sender(void) /* + * See if the other side has responded to a partition disengage request + * from us. 
+ */ +int +xpc_partition_disengaged(struct xpc_partition *part) +{ + partid_t partid = XPC_PARTID(part); + int disengaged; + + + disengaged = (xpc_partition_engaged(1UL << partid) == 0); + if (part->disengage_request_timeout) { + if (!disengaged) { + if (jiffies < part->disengage_request_timeout) { + /* timelimit hasn't been reached yet */ + return 0; + } + + /* + * Other side hasn't responded to our disengage + * request in a timely fashion, so assume it's dead. + */ + + xpc_clear_partition_engaged(1UL << partid); + disengaged = 1; + } + part->disengage_request_timeout = 0; + + /* cancel the timer function, provided it's not us */ + if (!in_interrupt()) { + del_singleshot_timer_sync(&part-> + disengage_request_timer); + } + + DBUG_ON(part->act_state != XPC_P_DEACTIVATING && + part->act_state != XPC_P_INACTIVE); + if (part->act_state != XPC_P_INACTIVE) { + xpc_wakeup_channel_mgr(part); + } + + if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) { + xpc_cancel_partition_disengage_request(part); + } + } + return disengaged; +} + + +/* * Mark specified partition as active. 
*/ enum xpc_retval @@ -721,7 +932,6 @@ xpc_deactivate_partition(const int line, struct xpc_partition *part, enum xpc_retval reason) { unsigned long irq_flags; - partid_t partid = XPC_PARTID(part); spin_lock_irqsave(&part->act_lock, irq_flags); @@ -749,17 +959,27 @@ xpc_deactivate_partition(const int line, struct xpc_partition *part, spin_unlock_irqrestore(&part->act_lock, irq_flags); - XPC_DISALLOW_HB(partid, xpc_vars); + if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) { + xpc_request_partition_disengage(part); + xpc_IPI_send_disengage(part); - dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n", partid, - reason); + /* set a timelimit on the disengage request */ + part->disengage_request_timeout = jiffies + + (xpc_disengage_request_timelimit * HZ); + part->disengage_request_timer.expires = + part->disengage_request_timeout; + add_timer(&part->disengage_request_timer); + } + + dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n", + XPC_PARTID(part), reason); - xpc_partition_down(part, reason); + xpc_partition_going_down(part, reason); } /* - * Mark specified partition as active. + * Mark specified partition as inactive. 
*/ void xpc_mark_partition_inactive(struct xpc_partition *part) @@ -792,9 +1012,10 @@ xpc_discovery(void) void *remote_rp_base; struct xpc_rsvd_page *remote_rp; struct xpc_vars *remote_vars; - u64 remote_rsvd_page_pa; + u64 remote_rp_pa; u64 remote_vars_pa; int region; + int region_size; int max_regions; int nasid; struct xpc_rsvd_page *rp; @@ -804,7 +1025,8 @@ xpc_discovery(void) enum xpc_retval ret; - remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RSVD_PAGE_ALIGNED_SIZE, + remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE + + xp_nasid_mask_bytes, GFP_KERNEL, &remote_rp_base); if (remote_rp == NULL) { return; @@ -812,13 +1034,13 @@ xpc_discovery(void) remote_vars = (struct xpc_vars *) remote_rp; - discovered_nasids = kmalloc(sizeof(u64) * XP_NASID_MASK_WORDS, + discovered_nasids = kmalloc(sizeof(u64) * xp_nasid_mask_words, GFP_KERNEL); if (discovered_nasids == NULL) { kfree(remote_rp_base); return; } - memset(discovered_nasids, 0, sizeof(u64) * XP_NASID_MASK_WORDS); + memset(discovered_nasids, 0, sizeof(u64) * xp_nasid_mask_words); rp = (struct xpc_rsvd_page *) xpc_rsvd_page; @@ -827,11 +1049,19 @@ xpc_discovery(void) * nodes that can comprise an access protection grouping. The access * protection is in regards to memory, IOI and IPI. */ -//>>> move the next two #defines into either include/asm-ia64/sn/arch.h or -//>>> include/asm-ia64/sn/addrs.h -#define SH1_MAX_REGIONS 64 -#define SH2_MAX_REGIONS 256 - max_regions = is_shub2() ? 
SH2_MAX_REGIONS : SH1_MAX_REGIONS; + max_regions = 64; + region_size = sn_region_size; + + switch (region_size) { + case 128: + max_regions *= 2; + case 64: + max_regions *= 2; + case 32: + max_regions *= 2; + region_size = 16; + DBUG_ON(!is_shub2()); + } for (region = 0; region < max_regions; region++) { @@ -841,8 +1071,8 @@ xpc_discovery(void) dev_dbg(xpc_part, "searching region %d\n", region); - for (nasid = (region * sn_region_size * 2); - nasid < ((region + 1) * sn_region_size * 2); + for (nasid = (region * region_size * 2); + nasid < ((region + 1) * region_size * 2); nasid += 2) { if ((volatile int) xpc_exiting) { @@ -852,14 +1082,14 @@ xpc_discovery(void) dev_dbg(xpc_part, "checking nasid %d\n", nasid); - if (XPC_NASID_IN_ARRAY(nasid, rp->part_nasids)) { + if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) { dev_dbg(xpc_part, "PROM indicates Nasid %d is " "part of the local partition; skipping " "region\n", nasid); break; } - if (!(XPC_NASID_IN_ARRAY(nasid, rp->mach_nasids))) { + if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) { dev_dbg(xpc_part, "PROM indicates Nasid %d was " "not on Numa-Link network at reset\n", nasid); @@ -877,7 +1107,7 @@ xpc_discovery(void) /* pull over the reserved page structure */ ret = xpc_get_remote_rp(nasid, discovered_nasids, - remote_rp, &remote_rsvd_page_pa); + remote_rp, &remote_rp_pa); if (ret != xpcSuccess) { dev_dbg(xpc_part, "unable to get reserved page " "from nasid %d, reason=%d\n", nasid, @@ -948,6 +1178,13 @@ xpc_discovery(void) remote_vars->act_nasid, remote_vars->act_phys_cpuid); + if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars-> + version)) { + part->remote_amos_page_pa = + remote_vars->amos_page_pa; + xpc_mark_partition_disengaged(part); + xpc_cancel_partition_disengage_request(part); + } xpc_IPI_send_activate(remote_vars); } } @@ -974,12 +1211,12 @@ xpc_initiate_partid_to_nasids(partid_t partid, void *nasid_mask) return xpcPartitionDown; } - part_nasid_pa = part->remote_rp_pa + - (u64) &((struct xpc_rsvd_page *) 
0)->part_nasids; + memset(nasid_mask, 0, XP_NASID_MASK_BYTES); + + part_nasid_pa = (u64) XPC_RP_PART_NASIDS(part->remote_rp_pa); bte_res = xp_bte_copy(part_nasid_pa, ia64_tpa((u64) nasid_mask), - L1_CACHE_ALIGN(XP_NASID_MASK_BYTES), - (BTE_NOTIFY | BTE_WACQUIRE), NULL); + xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE), NULL); return xpc_map_bte_errors(bte_res); } diff --git a/arch/ia64/sn/kernel/xpnet.c b/arch/ia64/sn/kernel/xpnet.c index 78c13d676fa6..e5c6d3c0a8e9 100644 --- a/arch/ia64/sn/kernel/xpnet.c +++ b/arch/ia64/sn/kernel/xpnet.c @@ -130,7 +130,7 @@ struct net_device *xpnet_device; */ static u64 xpnet_broadcast_partitions; /* protect above */ -static spinlock_t xpnet_broadcast_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(xpnet_broadcast_lock); /* * Since the Block Transfer Engine (BTE) is being used for the transfer @@ -636,6 +636,10 @@ xpnet_init(void) int result = -ENOMEM; + if (!ia64_platform_is("sn2")) { + return -ENODEV; + } + dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME); /* |