summaryrefslogtreecommitdiff
path: root/arch/ia64/sn/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/ia64/sn/kernel')
-rw-r--r--arch/ia64/sn/kernel/bte.c83
-rw-r--r--arch/ia64/sn/kernel/huberror.c2
-rw-r--r--arch/ia64/sn/kernel/io_init.c60
-rw-r--r--arch/ia64/sn/kernel/irq.c77
-rw-r--r--arch/ia64/sn/kernel/setup.c199
-rw-r--r--arch/ia64/sn/kernel/sn2/ptc_deadlock.S13
-rw-r--r--arch/ia64/sn/kernel/sn2/sn2_smp.c283
-rw-r--r--arch/ia64/sn/kernel/sn2/sn_hwperf.c317
-rw-r--r--arch/ia64/sn/kernel/sn2/sn_proc_fs.c6
-rw-r--r--arch/ia64/sn/kernel/sn2/timer_interrupt.c22
-rw-r--r--arch/ia64/sn/kernel/tiocx.c69
-rw-r--r--arch/ia64/sn/kernel/xpc.h368
-rw-r--r--arch/ia64/sn/kernel/xpc_channel.c329
-rw-r--r--arch/ia64/sn/kernel/xpc_main.c334
-rw-r--r--arch/ia64/sn/kernel/xpc_partition.c475
-rw-r--r--arch/ia64/sn/kernel/xpnet.c6
16 files changed, 1922 insertions, 721 deletions
diff --git a/arch/ia64/sn/kernel/bte.c b/arch/ia64/sn/kernel/bte.c
index 647deae9bfcd..d71f4de44f79 100644
--- a/arch/ia64/sn/kernel/bte.c
+++ b/arch/ia64/sn/kernel/bte.c
@@ -29,16 +29,30 @@
/* two interfaces on two btes */
#define MAX_INTERFACES_TO_TRY 4
+#define MAX_NODES_TO_TRY 2
static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
{
nodepda_t *tmp_nodepda;
+ if (nasid_to_cnodeid(nasid) == -1)
+ return (struct bteinfo_s *)NULL;;
+
tmp_nodepda = NODEPDA(nasid_to_cnodeid(nasid));
return &tmp_nodepda->bte_if[interface];
}
+static inline void bte_start_transfer(struct bteinfo_s *bte, u64 len, u64 mode)
+{
+ if (is_shub2()) {
+ BTE_CTRL_STORE(bte, (IBLS_BUSY | ((len) | (mode) << 24)));
+ } else {
+ BTE_LNSTAT_STORE(bte, len);
+ BTE_CTRL_STORE(bte, mode);
+ }
+}
+
/************************************************************************
* Block Transfer Engine copy related functions.
*
@@ -67,13 +81,15 @@ bte_result_t bte_copy(u64 src, u64 dest, u64 len, u64 mode, void *notification)
{
u64 transfer_size;
u64 transfer_stat;
+ u64 notif_phys_addr;
struct bteinfo_s *bte;
bte_result_t bte_status;
unsigned long irq_flags;
unsigned long itc_end = 0;
- struct bteinfo_s *btes_to_try[MAX_INTERFACES_TO_TRY];
- int bte_if_index;
- int bte_pri, bte_sec;
+ int nasid_to_try[MAX_NODES_TO_TRY];
+ int my_nasid = cpuid_to_nasid(raw_smp_processor_id());
+ int bte_if_index, nasid_index;
+ int bte_first, btes_per_node = BTES_PER_NODE;
BTE_PRINTK(("bte_copy(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%p)\n",
src, dest, len, mode, notification));
@@ -86,36 +102,26 @@ bte_result_t bte_copy(u64 src, u64 dest, u64 len, u64 mode, void *notification)
(src & L1_CACHE_MASK) || (dest & L1_CACHE_MASK));
BUG_ON(!(len < ((BTE_LEN_MASK + 1) << L1_CACHE_SHIFT)));
- /* CPU 0 (per node) tries bte0 first, CPU 1 try bte1 first */
- if (cpuid_to_subnode(smp_processor_id()) == 0) {
- bte_pri = 0;
- bte_sec = 1;
- } else {
- bte_pri = 1;
- bte_sec = 0;
- }
+ /*
+ * Start with interface corresponding to cpu number
+ */
+ bte_first = raw_smp_processor_id() % btes_per_node;
if (mode & BTE_USE_DEST) {
/* try remote then local */
- btes_to_try[0] = bte_if_on_node(NASID_GET(dest), bte_pri);
- btes_to_try[1] = bte_if_on_node(NASID_GET(dest), bte_sec);
+ nasid_to_try[0] = NASID_GET(dest);
if (mode & BTE_USE_ANY) {
- btes_to_try[2] = bte_if_on_node(get_nasid(), bte_pri);
- btes_to_try[3] = bte_if_on_node(get_nasid(), bte_sec);
+ nasid_to_try[1] = my_nasid;
} else {
- btes_to_try[2] = NULL;
- btes_to_try[3] = NULL;
+ nasid_to_try[1] = (int)NULL;
}
} else {
/* try local then remote */
- btes_to_try[0] = bte_if_on_node(get_nasid(), bte_pri);
- btes_to_try[1] = bte_if_on_node(get_nasid(), bte_sec);
+ nasid_to_try[0] = my_nasid;
if (mode & BTE_USE_ANY) {
- btes_to_try[2] = bte_if_on_node(NASID_GET(dest), bte_pri);
- btes_to_try[3] = bte_if_on_node(NASID_GET(dest), bte_sec);
+ nasid_to_try[1] = NASID_GET(dest);
} else {
- btes_to_try[2] = NULL;
- btes_to_try[3] = NULL;
+ nasid_to_try[1] = (int)NULL;
}
}
@@ -123,11 +129,12 @@ retry_bteop:
do {
local_irq_save(irq_flags);
- bte_if_index = 0;
+ bte_if_index = bte_first;
+ nasid_index = 0;
/* Attempt to lock one of the BTE interfaces. */
- while (bte_if_index < MAX_INTERFACES_TO_TRY) {
- bte = btes_to_try[bte_if_index++];
+ while (nasid_index < MAX_NODES_TO_TRY) {
+ bte = bte_if_on_node(nasid_to_try[nasid_index],bte_if_index);
if (bte == NULL) {
continue;
@@ -143,6 +150,15 @@ retry_bteop:
break;
}
}
+
+ bte_if_index = (bte_if_index + 1) % btes_per_node; /* Next interface */
+ if (bte_if_index == bte_first) {
+ /*
+ * We've tried all interfaces on this node
+ */
+ nasid_index++;
+ }
+
bte = NULL;
}
@@ -169,7 +185,13 @@ retry_bteop:
/* Initialize the notification to a known value. */
*bte->most_rcnt_na = BTE_WORD_BUSY;
+ notif_phys_addr = TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na));
+ if (is_shub2()) {
+ src = SH2_TIO_PHYS_TO_DMA(src);
+ dest = SH2_TIO_PHYS_TO_DMA(dest);
+ notif_phys_addr = SH2_TIO_PHYS_TO_DMA(notif_phys_addr);
+ }
/* Set the source and destination registers */
BTE_PRINTKV(("IBSA = 0x%lx)\n", (TO_PHYS(src))));
BTE_SRC_STORE(bte, TO_PHYS(src));
@@ -177,14 +199,12 @@ retry_bteop:
BTE_DEST_STORE(bte, TO_PHYS(dest));
/* Set the notification register */
- BTE_PRINTKV(("IBNA = 0x%lx)\n",
- TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na))));
- BTE_NOTIF_STORE(bte,
- TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na)));
+ BTE_PRINTKV(("IBNA = 0x%lx)\n", notif_phys_addr));
+ BTE_NOTIF_STORE(bte, notif_phys_addr);
/* Initiate the transfer */
BTE_PRINTK(("IBCT = 0x%lx)\n", BTE_VALID_MODE(mode)));
- BTE_START_TRANSFER(bte, transfer_size, BTE_VALID_MODE(mode));
+ bte_start_transfer(bte, transfer_size, BTE_VALID_MODE(mode));
itc_end = ia64_get_itc() + (40000000 * local_cpu_data->cyc_per_usec);
@@ -195,6 +215,7 @@ retry_bteop:
}
while ((transfer_stat = *bte->most_rcnt_na) == BTE_WORD_BUSY) {
+ cpu_relax();
if (ia64_get_itc() > itc_end) {
BTE_PRINTK(("BTE timeout nasid 0x%x bte%d IBLS = 0x%lx na 0x%lx\n",
NASID_GET(bte->bte_base_addr), bte->bte_num,
diff --git a/arch/ia64/sn/kernel/huberror.c b/arch/ia64/sn/kernel/huberror.c
index 5c39b43ba3c0..5c5eb01c50f0 100644
--- a/arch/ia64/sn/kernel/huberror.c
+++ b/arch/ia64/sn/kernel/huberror.c
@@ -76,7 +76,7 @@ void hubiio_crb_free(struct hubdev_info *hubdev_info, int crbnum)
*/
REMOTE_HUB_S(hubdev_info->hdi_nasid, IIO_ICDR, (IIO_ICDR_PND | crbnum));
while (REMOTE_HUB_L(hubdev_info->hdi_nasid, IIO_ICDR) & IIO_ICDR_PND)
- udelay(1);
+ cpu_relax();
}
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index a6649baf629a..b4f5053f5e1b 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -18,11 +18,10 @@
#include <asm/sn/simulator.h>
#include <asm/sn/sn_sal.h>
#include <asm/sn/tioca_provider.h>
+#include <asm/sn/tioce_provider.h>
#include "xtalk/hubdev.h"
#include "xtalk/xwidgetdev.h"
-nasid_t master_nasid = INVALID_NASID; /* Partition Master */
-
static struct list_head sn_sysdata_list;
/* sysdata list struct */
@@ -44,6 +43,9 @@ int sn_ioif_inited = 0; /* SN I/O infrastructure initialized? */
struct sn_pcibus_provider *sn_pci_provider[PCIIO_ASIC_MAX_TYPES]; /* indexed by asic type */
+static int max_segment_number = 0; /* Default highest segment number */
+static int max_pcibus_number = 255; /* Default highest pci bus number */
+
/*
* Hooks and struct for unsupported pci providers
*/
@@ -157,13 +159,28 @@ static void sn_fixup_ionodes(void)
uint64_t nasid;
int i, widget;
- for (i = 0; i < numionodes; i++) {
+ /*
+ * Get SGI Specific HUB chipset information.
+ * Inform Prom that this kernel can support domain bus numbering.
+ */
+ for (i = 0; i < num_cnodes; i++) {
hubdev = (struct hubdev_info *)(NODEPDA(i)->pdinfo);
nasid = cnodeid_to_nasid(i);
+ hubdev->max_segment_number = 0xffffffff;
+ hubdev->max_pcibus_number = 0xff;
status = sal_get_hubdev_info(nasid, (uint64_t) __pa(hubdev));
if (status)
continue;
+ /* Save the largest Domain and pcibus numbers found. */
+ if (hubdev->max_segment_number) {
+ /*
+ * Dealing with a Prom that supports segments.
+ */
+ max_segment_number = hubdev->max_segment_number;
+ max_pcibus_number = hubdev->max_pcibus_number;
+ }
+
/* Attach the error interrupt handlers */
if (nasid & 1)
ice_error_init(hubdev);
@@ -203,6 +220,7 @@ static void sn_fixup_ionodes(void)
continue;
}
+ spin_lock_init(&sn_flush_device_list->sfdl_flush_lock);
hubdev->hdi_flush_nasid_list.widget_p[widget] =
sn_flush_device_list;
}
@@ -229,7 +247,7 @@ void sn_pci_unfixup_slot(struct pci_dev *dev)
void sn_pci_fixup_slot(struct pci_dev *dev)
{
int idx;
- int segment = 0;
+ int segment = pci_domain_nr(dev->bus);
int status = 0;
struct pcibus_bussoft *bs;
struct pci_bus *host_pci_bus;
@@ -282,9 +300,9 @@ void sn_pci_fixup_slot(struct pci_dev *dev)
* PCI host_pci_dev struct and set up host bus linkages
*/
- bus_no = SN_PCIDEV_INFO(dev)->pdi_slot_host_handle >> 32;
+ bus_no = (SN_PCIDEV_INFO(dev)->pdi_slot_host_handle >> 32) & 0xff;
devfn = SN_PCIDEV_INFO(dev)->pdi_slot_host_handle & 0xffffffff;
- host_pci_bus = pci_find_bus(pci_domain_nr(dev->bus), bus_no);
+ host_pci_bus = pci_find_bus(segment, bus_no);
host_pci_dev = pci_get_slot(host_pci_bus, devfn);
SN_PCIDEV_INFO(dev)->host_pci_dev = host_pci_dev;
@@ -322,7 +340,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
struct pci_controller *controller;
struct pcibus_bussoft *prom_bussoft_ptr;
struct hubdev_info *hubdev_info;
- void *provider_soft;
+ void *provider_soft = NULL;
struct sn_pcibus_provider *provider;
status = sal_get_pcibus_info((u64) segment, (u64) busnum,
@@ -332,13 +350,14 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
prom_bussoft_ptr = __va(prom_bussoft_ptr);
controller = kcalloc(1,sizeof(struct pci_controller), GFP_KERNEL);
+ controller->segment = segment;
if (!controller)
BUG();
if (bus == NULL) {
bus = pci_scan_bus(busnum, &pci_root_ops, controller);
if (bus == NULL)
- return; /* error, or bus already scanned */
+ goto error_return; /* error, or bus already scanned */
bus->sysdata = NULL;
}
@@ -351,28 +370,30 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
*/
if (prom_bussoft_ptr->bs_asic_type >= PCIIO_ASIC_MAX_TYPES)
- return; /* unsupported asic type */
+ goto error_return; /* unsupported asic type */
if (prom_bussoft_ptr->bs_asic_type == PCIIO_ASIC_TYPE_PPB)
goto error_return; /* no further fixup necessary */
provider = sn_pci_provider[prom_bussoft_ptr->bs_asic_type];
if (provider == NULL)
- return; /* no provider registerd for this asic */
+ goto error_return; /* no provider registerd for this asic */
- provider_soft = NULL;
+ bus->sysdata = controller;
if (provider->bus_fixup)
provider_soft = (*provider->bus_fixup) (prom_bussoft_ptr, controller);
- if (provider_soft == NULL)
- return; /* fixup failed or not applicable */
+ if (provider_soft == NULL) {
+ /* fixup failed or not applicable */
+ bus->sysdata = NULL;
+ goto error_return;
+ }
/*
* Generic bus fixup goes here. Don't reference prom_bussoft_ptr
* after this point.
*/
- bus->sysdata = controller;
PCI_CONTROLLER(bus)->platform_data = provider_soft;
nasid = NASID_GET(SN_PCIBUS_BUSSOFT(bus)->bs_base);
cnode = nasid_to_cnodeid(nasid);
@@ -387,7 +408,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
if (controller->node >= num_online_nodes()) {
struct pcibus_bussoft *b = SN_PCIBUS_BUSSOFT(bus);
- printk(KERN_WARNING "Device ASIC=%u XID=%u PBUSNUM=%lu"
+ printk(KERN_WARNING "Device ASIC=%u XID=%u PBUSNUM=%u"
"L_IO=%lx L_MEM=%lx BASE=%lx\n",
b->bs_asic_type, b->bs_xid, b->bs_persist_busnum,
b->bs_legacy_io, b->bs_legacy_mem, b->bs_base);
@@ -408,7 +429,7 @@ void sn_bus_store_sysdata(struct pci_dev *dev)
{
struct sysdata_el *element;
- element = kcalloc(1, sizeof(struct sysdata_el), GFP_KERNEL);
+ element = kzalloc(sizeof(struct sysdata_el), GFP_KERNEL);
if (!element) {
dev_dbg(dev, "%s: out of memory!\n", __FUNCTION__);
return;
@@ -442,6 +463,7 @@ sn_sysdata_free_start:
static int __init sn_pci_init(void)
{
int i = 0;
+ int j = 0;
struct pci_dev *pci_dev = NULL;
extern void sn_init_cpei_timer(void);
#ifdef CONFIG_PROC_FS
@@ -461,6 +483,7 @@ static int __init sn_pci_init(void)
pcibr_init_provider();
tioca_init_provider();
+ tioce_init_provider();
/*
* This is needed to avoid bounce limit checks in the blk layer
@@ -476,8 +499,9 @@ static int __init sn_pci_init(void)
#endif
/* busses are not known yet ... */
- for (i = 0; i < PCI_BUSES_TO_SCAN; i++)
- sn_pci_controller_fixup(0, i, NULL);
+ for (i = 0; i <= max_segment_number; i++)
+ for (j = 0; j <= max_pcibus_number; j++)
+ sn_pci_controller_fixup(i, j, NULL);
/*
* Generic Linux PCI Layer has created the pci_bus and pci_dev
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 84d276a14ecb..01d18b7b5bb3 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -5,7 +5,7 @@
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
*/
#include <linux/irq.h>
@@ -23,7 +23,7 @@ static void force_interrupt(int irq);
static void register_intr_pda(struct sn_irq_info *sn_irq_info);
static void unregister_intr_pda(struct sn_irq_info *sn_irq_info);
-extern int sn_force_interrupt_flag;
+int sn_force_interrupt_flag = 1;
extern int sn_ioif_inited;
static struct list_head **sn_irq_lh;
static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */
@@ -76,16 +76,14 @@ static void sn_enable_irq(unsigned int irq)
static void sn_ack_irq(unsigned int irq)
{
- uint64_t event_occurred, mask = 0;
- int nasid;
+ u64 event_occurred, mask = 0;
irq = irq & 0xff;
- nasid = get_nasid();
event_occurred =
- HUB_L((uint64_t *) GLOBAL_MMR_ADDR(nasid, SH_EVENT_OCCURRED));
+ HUB_L((u64*)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED));
mask = event_occurred & SH_ALL_INT_MASK;
- HUB_S((uint64_t *) GLOBAL_MMR_ADDR(nasid, SH_EVENT_OCCURRED_ALIAS),
- mask);
+ HUB_S((u64*)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS),
+ mask);
__set_bit(irq, (volatile void *)pda->sn_in_service_ivecs);
move_irq(irq);
@@ -93,15 +91,12 @@ static void sn_ack_irq(unsigned int irq)
static void sn_end_irq(unsigned int irq)
{
- int nasid;
int ivec;
- uint64_t event_occurred;
+ u64 event_occurred;
ivec = irq & 0xff;
if (ivec == SGI_UART_VECTOR) {
- nasid = get_nasid();
- event_occurred = HUB_L((uint64_t *) GLOBAL_MMR_ADDR
- (nasid, SH_EVENT_OCCURRED));
+ event_occurred = HUB_L((u64*)LOCAL_MMR_ADDR (SH_EVENT_OCCURRED));
/* If the UART bit is set here, we may have received an
* interrupt from the UART that the driver missed. To
* make sure, we IPI ourselves to force us to look again.
@@ -132,6 +127,7 @@ static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
int local_widget, status;
nasid_t local_nasid;
struct sn_irq_info *new_irq_info;
+ struct sn_pcibus_provider *pci_provider;
new_irq_info = kmalloc(sizeof(struct sn_irq_info), GFP_ATOMIC);
if (new_irq_info == NULL)
@@ -171,8 +167,9 @@ static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
new_irq_info->irq_cpuid = cpuid;
register_intr_pda(new_irq_info);
- if (IS_PCI_BRIDGE_ASIC(new_irq_info->irq_bridge_type))
- pcibr_change_devices_irq(new_irq_info);
+ pci_provider = sn_pci_provider[new_irq_info->irq_bridge_type];
+ if (pci_provider && pci_provider->target_interrupt)
+ (pci_provider->target_interrupt)(new_irq_info);
spin_lock(&sn_irq_info_lock);
list_replace_rcu(&sn_irq_info->list, &new_irq_info->list);
@@ -317,6 +314,16 @@ void sn_irq_unfixup(struct pci_dev *pci_dev)
pci_dev_put(pci_dev);
}
+static inline void
+sn_call_force_intr_provider(struct sn_irq_info *sn_irq_info)
+{
+ struct sn_pcibus_provider *pci_provider;
+
+ pci_provider = sn_pci_provider[sn_irq_info->irq_bridge_type];
+ if (pci_provider && pci_provider->force_interrupt)
+ (*pci_provider->force_interrupt)(sn_irq_info);
+}
+
static void force_interrupt(int irq)
{
struct sn_irq_info *sn_irq_info;
@@ -325,11 +332,9 @@ static void force_interrupt(int irq)
return;
rcu_read_lock();
- list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[irq], list) {
- if (IS_PCI_BRIDGE_ASIC(sn_irq_info->irq_bridge_type) &&
- (sn_irq_info->irq_bridge != NULL))
- pcibr_force_interrupt(sn_irq_info);
- }
+ list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[irq], list)
+ sn_call_force_intr_provider(sn_irq_info);
+
rcu_read_unlock();
}
@@ -351,6 +356,14 @@ static void sn_check_intr(int irq, struct sn_irq_info *sn_irq_info)
struct pcidev_info *pcidev_info;
struct pcibus_info *pcibus_info;
+ /*
+ * Bridge types attached to TIO (anything but PIC) do not need this WAR
+ * since they do not target Shub II interrupt registers. If that
+ * ever changes, this check needs to accomodate.
+ */
+ if (sn_irq_info->irq_bridge_type != PCIIO_ASIC_TYPE_PIC)
+ return;
+
pcidev_info = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
if (!pcidev_info)
return;
@@ -377,16 +390,12 @@ static void sn_check_intr(int irq, struct sn_irq_info *sn_irq_info)
break;
}
if (!test_bit(irr_bit, &irr_reg)) {
- if (!test_bit(irq, pda->sn_soft_irr)) {
- if (!test_bit(irq, pda->sn_in_service_ivecs)) {
- regval &= 0xff;
- if (sn_irq_info->irq_int_bit & regval &
- sn_irq_info->irq_last_intr) {
- regval &=
- ~(sn_irq_info->
- irq_int_bit & regval);
- pcibr_force_interrupt(sn_irq_info);
- }
+ if (!test_bit(irq, pda->sn_in_service_ivecs)) {
+ regval &= 0xff;
+ if (sn_irq_info->irq_int_bit & regval &
+ sn_irq_info->irq_last_intr) {
+ regval &= ~(sn_irq_info->irq_int_bit & regval);
+ sn_call_force_intr_provider(sn_irq_info);
}
}
}
@@ -404,13 +413,7 @@ void sn_lb_int_war_check(void)
rcu_read_lock();
for (i = pda->sn_first_irq; i <= pda->sn_last_irq; i++) {
list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[i], list) {
- /*
- * Only call for PCI bridges that are fully
- * initialized.
- */
- if (IS_PCI_BRIDGE_ASIC(sn_irq_info->irq_bridge_type) &&
- (sn_irq_info->irq_bridge != NULL))
- sn_check_intr(i, sn_irq_info);
+ sn_check_intr(i, sn_irq_info);
}
}
rcu_read_unlock();
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 7c7fe441d623..0fb579ef18c2 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -49,6 +49,7 @@
#include <asm/sn/clksupport.h>
#include <asm/sn/sn_sal.h>
#include <asm/sn/geo.h>
+#include <asm/sn/sn_feature_sets.h>
#include "xtalk/xwidgetdev.h"
#include "xtalk/hubdev.h"
#include <asm/sn/klconfig.h>
@@ -56,9 +57,7 @@
DEFINE_PER_CPU(struct pda_s, pda_percpu);
-#define MAX_PHYS_MEMORY (1UL << 49) /* 1 TB */
-
-lboard_t *root_lboard[MAX_COMPACT_NODES];
+#define MAX_PHYS_MEMORY (1UL << IA64_MAX_PHYS_BITS) /* Max physical address supported */
extern void bte_init_node(nodepda_t *, cnodeid_t);
@@ -80,8 +79,6 @@ EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid);
DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda);
EXPORT_PER_CPU_SYMBOL(__sn_nodepda);
-partid_t sn_partid = -1;
-EXPORT_SYMBOL(sn_partid);
char sn_system_serial_number_string[128];
EXPORT_SYMBOL(sn_system_serial_number_string);
u64 sn_partition_serial_number;
@@ -98,14 +95,15 @@ u8 sn_region_size;
EXPORT_SYMBOL(sn_region_size);
int sn_prom_type; /* 0=hardware, 1=medusa/realprom, 2=medusa/fakeprom */
-short physical_node_map[MAX_PHYSNODE_ID];
+short physical_node_map[MAX_NUMALINK_NODES];
+static unsigned long sn_prom_features[MAX_PROM_FEATURE_SETS];
EXPORT_SYMBOL(physical_node_map);
-int numionodes;
+int num_cnodes;
static void sn_init_pdas(char **);
-static void scan_for_ionodes(void);
+static void build_cnode_tables(void);
static nodepda_t *nodepdaindr[MAX_COMPACT_NODES];
@@ -140,19 +138,6 @@ char drive_info[4 * 16];
#endif
/*
- * Get nasid of current cpu early in boot before nodepda is initialized
- */
-static int
-boot_get_nasid(void)
-{
- int nasid;
-
- if (ia64_sn_get_sapic_info(get_sapicid(), &nasid, NULL, NULL))
- BUG();
- return nasid;
-}
-
-/*
* This routine can only be used during init, since
* smp_boot_data is an init data structure.
* We have to use smp_boot_data.cpu_phys_id to find
@@ -223,7 +208,6 @@ void __init early_sn_setup(void)
}
extern int platform_intr_list[];
-extern nasid_t master_nasid;
static int __initdata shub_1_1_found = 0;
/*
@@ -269,11 +253,13 @@ static void __init sn_check_for_wars(void)
void __init sn_setup(char **cmdline_p)
{
long status, ticks_per_sec, drift;
- int pxm;
u32 version = sn_sal_rev();
extern void sn_cpu_init(void);
- ia64_sn_plat_set_error_handling_features();
+ ia64_sn_plat_set_error_handling_features(); // obsolete
+ ia64_sn_set_os_feature(OSF_MCA_SLV_TO_OS_INIT_SLV);
+ ia64_sn_set_os_feature(OSF_FEAT_LOG_SBES);
+
#if defined(CONFIG_VT) && defined(CONFIG_VGA_CONSOLE)
/*
@@ -297,11 +283,10 @@ void __init sn_setup(char **cmdline_p)
MAX_DMA_ADDRESS = PAGE_OFFSET + MAX_PHYS_MEMORY;
- memset(physical_node_map, -1, sizeof(physical_node_map));
- for (pxm = 0; pxm < MAX_PXM_DOMAINS; pxm++)
- if (pxm_to_nid_map[pxm] != -1)
- physical_node_map[pxm_to_nasid(pxm)] =
- pxm_to_nid_map[pxm];
+ /*
+ * Build the tables for managing cnodes.
+ */
+ build_cnode_tables();
/*
* Old PROMs do not provide an ACPI FADT. Disable legacy keyboard
@@ -316,18 +301,6 @@ void __init sn_setup(char **cmdline_p)
printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF);
- /*
- * Confirm the SAL we're running on is recent enough...
- */
- if (version < SN_SAL_MIN_VERSION) {
- printk(KERN_ERR "This kernel needs SGI SAL version >= "
- "%x.%02x\n", SN_SAL_MIN_VERSION >> 8,
- SN_SAL_MIN_VERSION & 0x00FF);
- panic("PROM version too old\n");
- }
-
- master_nasid = boot_get_nasid();
-
status =
ia64_sal_freq_base(SAL_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
&drift);
@@ -385,15 +358,6 @@ static void __init sn_init_pdas(char **cmdline_p)
{
cnodeid_t cnode;
- memset(sn_cnodeid_to_nasid, -1,
- sizeof(__ia64_per_cpu_var(__sn_cnodeid_to_nasid)));
- for_each_online_node(cnode)
- sn_cnodeid_to_nasid[cnode] =
- pxm_to_nasid(nid_to_pxm_map[cnode]);
-
- numionodes = num_online_nodes();
- scan_for_ionodes();
-
/*
* Allocate & initalize the nodepda for each node.
*/
@@ -403,12 +367,13 @@ static void __init sn_init_pdas(char **cmdline_p)
memset(nodepdaindr[cnode], 0, sizeof(nodepda_t));
memset(nodepdaindr[cnode]->phys_cpuid, -1,
sizeof(nodepdaindr[cnode]->phys_cpuid));
+ spin_lock_init(&nodepdaindr[cnode]->ptc_lock);
}
/*
* Allocate & initialize nodepda for TIOs. For now, put them on node 0.
*/
- for (cnode = num_online_nodes(); cnode < numionodes; cnode++) {
+ for (cnode = num_online_nodes(); cnode < num_cnodes; cnode++) {
nodepdaindr[cnode] =
alloc_bootmem_node(NODE_DATA(0), sizeof(nodepda_t));
memset(nodepdaindr[cnode], 0, sizeof(nodepda_t));
@@ -417,7 +382,7 @@ static void __init sn_init_pdas(char **cmdline_p)
/*
* Now copy the array of nodepda pointers to each nodepda.
*/
- for (cnode = 0; cnode < numionodes; cnode++)
+ for (cnode = 0; cnode < num_cnodes; cnode++)
memcpy(nodepdaindr[cnode]->pernode_pdaindr, nodepdaindr,
sizeof(nodepdaindr));
@@ -434,7 +399,7 @@ static void __init sn_init_pdas(char **cmdline_p)
* Initialize the per node hubdev. This includes IO Nodes and
* headless/memless nodes.
*/
- for (cnode = 0; cnode < numionodes; cnode++) {
+ for (cnode = 0; cnode < num_cnodes; cnode++) {
hubdev_init_node(nodepdaindr[cnode], cnode);
}
}
@@ -481,6 +446,10 @@ void __init sn_cpu_init(void)
if (nodepdaindr[0] == NULL)
return;
+ for (i = 0; i < MAX_PROM_FEATURE_SETS; i++)
+ if (ia64_sn_get_prom_feature_set(i, &sn_prom_features[i]) != 0)
+ break;
+
cpuid = smp_processor_id();
cpuphyid = get_sapicid();
@@ -532,8 +501,8 @@ void __init sn_cpu_init(void)
*/
{
u64 pio1[] = {SH1_PIO_WRITE_STATUS_0, 0, SH1_PIO_WRITE_STATUS_1, 0};
- u64 pio2[] = {SH2_PIO_WRITE_STATUS_0, SH2_PIO_WRITE_STATUS_1,
- SH2_PIO_WRITE_STATUS_2, SH2_PIO_WRITE_STATUS_3};
+ u64 pio2[] = {SH2_PIO_WRITE_STATUS_0, SH2_PIO_WRITE_STATUS_2,
+ SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3};
u64 *pio;
pio = is_shub1() ? pio1 : pio2;
pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]);
@@ -555,87 +524,58 @@ void __init sn_cpu_init(void)
}
/*
- * Scan klconfig for ionodes. Add the nasids to the
- * physical_node_map and the pda and increment numionodes.
+ * Build tables for converting between NASIDs and cnodes.
*/
+static inline int __init board_needs_cnode(int type)
+{
+ return (type == KLTYPE_SNIA || type == KLTYPE_TIO);
+}
-static void __init scan_for_ionodes(void)
+void __init build_cnode_tables(void)
{
- int nasid = 0;
+ int nasid;
+ int node;
lboard_t *brd;
- /* fakeprom does not support klgraph */
- if (IS_RUNNING_ON_FAKE_PROM())
- return;
-
- /* Setup ionodes with memory */
- for (nasid = 0; nasid < MAX_PHYSNODE_ID; nasid += 2) {
- char *klgraph_header;
- cnodeid_t cnodeid;
-
- if (physical_node_map[nasid] == -1)
- continue;
+ memset(physical_node_map, -1, sizeof(physical_node_map));
+ memset(sn_cnodeid_to_nasid, -1,
+ sizeof(__ia64_per_cpu_var(__sn_cnodeid_to_nasid)));
- cnodeid = -1;
- klgraph_header = __va(ia64_sn_get_klconfig_addr(nasid));
- if (!klgraph_header) {
- BUG(); /* All nodes must have klconfig tables! */
- }
- cnodeid = nasid_to_cnodeid(nasid);
- root_lboard[cnodeid] = (lboard_t *)
- NODE_OFFSET_TO_LBOARD((nasid),
- ((kl_config_hdr_t
- *) (klgraph_header))->
- ch_board_info);
+ /*
+ * First populate the tables with C/M bricks. This ensures that
+ * cnode == node for all C & M bricks.
+ */
+ for_each_online_node(node) {
+ nasid = pxm_to_nasid(nid_to_pxm_map[node]);
+ sn_cnodeid_to_nasid[node] = nasid;
+ physical_node_map[nasid] = node;
}
- /* Scan headless/memless IO Nodes. */
- for (nasid = 0; nasid < MAX_PHYSNODE_ID; nasid += 2) {
- /* if there's no nasid, don't try to read the klconfig on the node */
- if (physical_node_map[nasid] == -1)
- continue;
- brd = find_lboard_any((lboard_t *)
- root_lboard[nasid_to_cnodeid(nasid)],
- KLTYPE_SNIA);
- if (brd) {
- brd = KLCF_NEXT_ANY(brd); /* Skip this node's lboard */
- if (!brd)
- continue;
- }
-
- brd = find_lboard_any(brd, KLTYPE_SNIA);
+ /*
+ * num_cnodes is total number of C/M/TIO bricks. Because of the 256 node
+ * limit on the number of nodes, we can't use the generic node numbers
+ * for this. Note that num_cnodes is incremented below as TIOs or
+ * headless/memoryless nodes are discovered.
+ */
+ num_cnodes = num_online_nodes();
- while (brd) {
- sn_cnodeid_to_nasid[numionodes] = brd->brd_nasid;
- physical_node_map[brd->brd_nasid] = numionodes;
- root_lboard[numionodes] = brd;
- numionodes++;
- brd = KLCF_NEXT_ANY(brd);
- if (!brd)
- break;
-
- brd = find_lboard_any(brd, KLTYPE_SNIA);
- }
- }
+ /* fakeprom does not support klgraph */
+ if (IS_RUNNING_ON_FAKE_PROM())
+ return;
- /* Scan for TIO nodes. */
- for (nasid = 0; nasid < MAX_PHYSNODE_ID; nasid += 2) {
- /* if there's no nasid, don't try to read the klconfig on the node */
- if (physical_node_map[nasid] == -1)
- continue;
- brd = find_lboard_any((lboard_t *)
- root_lboard[nasid_to_cnodeid(nasid)],
- KLTYPE_TIO);
+ /* Find TIOs & headless/memoryless nodes and add them to the tables */
+ for_each_online_node(node) {
+ kl_config_hdr_t *klgraph_header;
+ nasid = cnodeid_to_nasid(node);
+ if ((klgraph_header = ia64_sn_get_klconfig_addr(nasid)) == NULL)
+ BUG();
+ brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info);
while (brd) {
- sn_cnodeid_to_nasid[numionodes] = brd->brd_nasid;
- physical_node_map[brd->brd_nasid] = numionodes;
- root_lboard[numionodes] = brd;
- numionodes++;
- brd = KLCF_NEXT_ANY(brd);
- if (!brd)
- break;
-
- brd = find_lboard_any(brd, KLTYPE_TIO);
+ if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) {
+ sn_cnodeid_to_nasid[num_cnodes] = brd->brd_nasid;
+ physical_node_map[brd->brd_nasid] = num_cnodes++;
+ }
+ brd = find_lboard_next(brd);
}
}
}
@@ -652,3 +592,12 @@ nasid_slice_to_cpuid(int nasid, int slice)
return -1;
}
+
+int sn_prom_feature_available(int id)
+{
+ if (id >= BITS_PER_LONG * MAX_PROM_FEATURE_SETS)
+ return 0;
+ return test_bit(id, sn_prom_features);
+}
+EXPORT_SYMBOL(sn_prom_feature_available);
+
diff --git a/arch/ia64/sn/kernel/sn2/ptc_deadlock.S b/arch/ia64/sn/kernel/sn2/ptc_deadlock.S
index 96cb71d15682..3fa95065a446 100644
--- a/arch/ia64/sn/kernel/sn2/ptc_deadlock.S
+++ b/arch/ia64/sn/kernel/sn2/ptc_deadlock.S
@@ -3,7 +3,7 @@
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
- * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
*/
#include <asm/types.h>
@@ -11,7 +11,7 @@
#define DEADLOCKBIT SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_SHFT
#define WRITECOUNTMASK SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK
-#define ALIAS_OFFSET (SH1_PIO_WRITE_STATUS_0_ALIAS-SH1_PIO_WRITE_STATUS_0)
+#define ALIAS_OFFSET 8
.global sn2_ptc_deadlock_recovery_core
@@ -36,13 +36,15 @@ sn2_ptc_deadlock_recovery_core:
extr.u piowcphy=piowc,0,61;; // Convert piowc to uncached physical address
dep piowcphy=-1,piowcphy,63,1
movl mask=WRITECOUNTMASK
+ mov r8=r0
1:
add scr2=ALIAS_OFFSET,piowc // Address of WRITE_STATUS alias register
- mov scr1=7;; // Clear DEADLOCK, WRITE_ERROR, MULTI_WRITE_ERROR
- st8.rel [scr2]=scr1;;
+ ;;
+ ld8.acq scr1=[scr2];;
5: ld8.acq scr1=[piowc];; // Wait for PIOs to complete.
+ hint @pause
and scr2=scr1,mask;; // mask of writecount bits
cmp.ne p6,p0=zeroval,scr2
(p6) br.cond.sptk 5b
@@ -57,6 +59,7 @@ sn2_ptc_deadlock_recovery_core:
st8.rel [ptc0]=data0 // Write PTC0 & wait for completion.
5: ld8.acq scr1=[piowcphy];; // Wait for PIOs to complete.
+ hint @pause
and scr2=scr1,mask;; // mask of writecount bits
cmp.ne p6,p0=zeroval,scr2
(p6) br.cond.sptk 5b;;
@@ -67,6 +70,7 @@ sn2_ptc_deadlock_recovery_core:
(p7) st8.rel [ptc1]=data1;; // Now write PTC1.
5: ld8.acq scr1=[piowcphy];; // Wait for PIOs to complete.
+ hint @pause
and scr2=scr1,mask;; // mask of writecount bits
cmp.ne p6,p0=zeroval,scr2
(p6) br.cond.sptk 5b
@@ -77,6 +81,7 @@ sn2_ptc_deadlock_recovery_core:
srlz.i;;
////////////// END PHYSICAL MODE ////////////////////
+(p8) add r8=1,r8
(p8) br.cond.spnt 1b;; // Repeat if DEADLOCK occurred.
br.ret.sptk rp
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 7af05a7ac743..49b530c39a42 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -5,7 +5,7 @@
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
- * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/init.h>
@@ -20,6 +20,8 @@
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/nodemask.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <asm/processor.h>
#include <asm/irq.h>
@@ -39,12 +41,120 @@
#include <asm/sn/nodepda.h>
#include <asm/sn/rw_mmr.h>
-void sn2_ptc_deadlock_recovery(volatile unsigned long *, unsigned long data0,
- volatile unsigned long *, unsigned long data1);
+DEFINE_PER_CPU(struct ptc_stats, ptcstats);
+DECLARE_PER_CPU(struct ptc_stats, ptcstats);
static __cacheline_aligned DEFINE_SPINLOCK(sn2_global_ptc_lock);
-static unsigned long sn2_ptc_deadlock_count;
+void sn2_ptc_deadlock_recovery(short *, short, int, volatile unsigned long *, unsigned long data0,
+ volatile unsigned long *, unsigned long data1);
+
+#ifdef DEBUG_PTC
+/*
+ * ptctest:
+ *
+ * xyz - 3 digit hex number:
+ * x - Force PTC purges to use shub:
+ * 0 - no force
+ * 1 - force
+ * y - interupt enable
+ * 0 - disable interrupts
+ * 1 - leave interuupts enabled
+ * z - type of lock:
+ * 0 - global lock
+ * 1 - node local lock
+ * 2 - no lock
+ *
+ * Note: on shub1, only ptctest == 0 is supported. Don't try other values!
+ */
+
+static unsigned int sn2_ptctest = 0;
+
+static int __init ptc_test(char *str)
+{
+ get_option(&str, &sn2_ptctest);
+ return 1;
+}
+__setup("ptctest=", ptc_test);
+
+static inline int ptc_lock(unsigned long *flagp)
+{
+ unsigned long opt = sn2_ptctest & 255;
+
+ switch (opt) {
+ case 0x00:
+ spin_lock_irqsave(&sn2_global_ptc_lock, *flagp);
+ break;
+ case 0x01:
+ spin_lock_irqsave(&sn_nodepda->ptc_lock, *flagp);
+ break;
+ case 0x02:
+ local_irq_save(*flagp);
+ break;
+ case 0x10:
+ spin_lock(&sn2_global_ptc_lock);
+ break;
+ case 0x11:
+ spin_lock(&sn_nodepda->ptc_lock);
+ break;
+ case 0x12:
+ break;
+ default:
+ BUG();
+ }
+ return opt;
+}
+
+static inline void ptc_unlock(unsigned long flags, int opt)
+{
+ switch (opt) {
+ case 0x00:
+ spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
+ break;
+ case 0x01:
+ spin_unlock_irqrestore(&sn_nodepda->ptc_lock, flags);
+ break;
+ case 0x02:
+ local_irq_restore(flags);
+ break;
+ case 0x10:
+ spin_unlock(&sn2_global_ptc_lock);
+ break;
+ case 0x11:
+ spin_unlock(&sn_nodepda->ptc_lock);
+ break;
+ case 0x12:
+ break;
+ default:
+ BUG();
+ }
+}
+#else
+
+#define sn2_ptctest 0
+
+static inline int ptc_lock(unsigned long *flagp)
+{
+ spin_lock_irqsave(&sn2_global_ptc_lock, *flagp);
+ return 0;
+}
+
+static inline void ptc_unlock(unsigned long flags, int opt)
+{
+ spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
+}
+#endif
+
+struct ptc_stats {
+ unsigned long ptc_l;
+ unsigned long change_rid;
+ unsigned long shub_ptc_flushes;
+ unsigned long nodes_flushed;
+ unsigned long deadlocks;
+ unsigned long lock_itc_clocks;
+ unsigned long shub_itc_clocks;
+ unsigned long shub_itc_clocks_max;
+};
static inline unsigned long wait_piowc(void)
{
@@ -67,6 +177,7 @@ void sn_tlb_migrate_finish(struct mm_struct *mm)
/**
* sn2_global_tlb_purge - globally purge translation cache of virtual address range
+ * @mm: mm_struct containing virtual address range
* @start: start of virtual address range
* @end: end of virtual address range
* @nbits: specifies number of bytes to purge per instruction (num = 1<<(nbits & 0xfc))
@@ -78,21 +189,22 @@ void sn_tlb_migrate_finish(struct mm_struct *mm)
* - cpu_vm_mask is a bit mask that indicates which cpus have loaded the context.
* - cpu_vm_mask is converted into a nodemask of the nodes containing the
* cpus in cpu_vm_mask.
- * - if only one bit is set in cpu_vm_mask & it is the current cpu,
- * then only the local TLB needs to be flushed. This flushing can be done
- * using ptc.l. This is the common case & avoids the global spinlock.
+ * - if only one bit is set in cpu_vm_mask & it is the current cpu & the
+ * process is purging its own virtual address range, then only the
+ * local TLB needs to be flushed. This flushing can be done using
+ * ptc.l. This is the common case & avoids the global spinlock.
* - if multiple cpus have loaded the context, then flushing has to be
* done with ptc.g/MMRs under protection of the global ptc_lock.
*/
void
-sn2_global_tlb_purge(unsigned long start, unsigned long end,
- unsigned long nbits)
+sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long nbits)
{
- int i, shub1, cnode, mynasid, cpu, lcpu = 0, nasid, flushed = 0;
+ int i, opt, shub1, cnode, mynasid, cpu, lcpu = 0, nasid, flushed = 0;
+ int mymm = (mm == current->active_mm);
volatile unsigned long *ptc0, *ptc1;
- unsigned long flags = 0, data0 = 0, data1 = 0;
- struct mm_struct *mm = current->active_mm;
+ unsigned long itc, itc2, flags, data0 = 0, data1 = 0, rr_value;
short nasids[MAX_NUMNODES], nix;
nodemask_t nodes_flushed;
@@ -106,33 +218,41 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
i++;
}
+ if (i == 0)
+ return;
+
preempt_disable();
- if (likely(i == 1 && lcpu == smp_processor_id())) {
+ if (likely(i == 1 && lcpu == smp_processor_id() && mymm)) {
do {
ia64_ptcl(start, nbits << 2);
start += (1UL << nbits);
} while (start < end);
ia64_srlz_i();
+ __get_cpu_var(ptcstats).ptc_l++;
preempt_enable();
return;
}
- if (atomic_read(&mm->mm_users) == 1) {
+ if (atomic_read(&mm->mm_users) == 1 && mymm) {
flush_tlb_mm(mm);
+ __get_cpu_var(ptcstats).change_rid++;
preempt_enable();
return;
}
+ itc = ia64_get_itc();
nix = 0;
for_each_node_mask(cnode, nodes_flushed)
nasids[nix++] = cnodeid_to_nasid(cnode);
+ rr_value = (mm->context << 3) | REGION_NUMBER(start);
+
shub1 = is_shub1();
if (shub1) {
data0 = (1UL << SH1_PTC_0_A_SHFT) |
(nbits << SH1_PTC_0_PS_SHFT) |
- ((ia64_get_rr(start) >> 8) << SH1_PTC_0_RID_SHFT) |
+ (rr_value << SH1_PTC_0_RID_SHFT) |
(1UL << SH1_PTC_0_START_SHFT);
ptc0 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH1_PTC_0);
ptc1 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH1_PTC_1);
@@ -141,14 +261,19 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
(nbits << SH2_PTC_PS_SHFT) |
(1UL << SH2_PTC_START_SHFT);
ptc0 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH2_PTC +
- ((ia64_get_rr(start) >> 8) << SH2_PTC_RID_SHFT) );
+ (rr_value << SH2_PTC_RID_SHFT));
ptc1 = NULL;
}
mynasid = get_nasid();
- spin_lock_irqsave(&sn2_global_ptc_lock, flags);
+ itc = ia64_get_itc();
+ opt = ptc_lock(&flags);
+ itc2 = ia64_get_itc();
+ __get_cpu_var(ptcstats).lock_itc_clocks += itc2 - itc;
+ __get_cpu_var(ptcstats).shub_ptc_flushes++;
+ __get_cpu_var(ptcstats).nodes_flushed += nix;
do {
if (shub1)
@@ -157,7 +282,7 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
data0 = (data0 & ~SH2_PTC_ADDR_MASK) | (start & SH2_PTC_ADDR_MASK);
for (i = 0; i < nix; i++) {
nasid = nasids[i];
- if (unlikely(nasid == mynasid)) {
+ if ((!(sn2_ptctest & 3)) && unlikely(nasid == mynasid && mymm)) {
ia64_ptcga(start, nbits << 2);
ia64_srlz_i();
} else {
@@ -169,18 +294,22 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
flushed = 1;
}
}
-
if (flushed
&& (wait_piowc() &
- SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK)) {
- sn2_ptc_deadlock_recovery(ptc0, data0, ptc1, data1);
+ (SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK))) {
+ sn2_ptc_deadlock_recovery(nasids, nix, mynasid, ptc0, data0, ptc1, data1);
}
start += (1UL << nbits);
} while (start < end);
- spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
+ itc2 = ia64_get_itc() - itc2;
+ __get_cpu_var(ptcstats).shub_itc_clocks += itc2;
+ if (itc2 > __get_cpu_var(ptcstats).shub_itc_clocks_max)
+ __get_cpu_var(ptcstats).shub_itc_clocks_max = itc2;
+
+ ptc_unlock(flags, opt);
preempt_enable();
}
@@ -192,31 +321,29 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
* TLB flush transaction. The recovery sequence is somewhat tricky & is
* coded in assembly language.
*/
-void sn2_ptc_deadlock_recovery(volatile unsigned long *ptc0, unsigned long data0,
+void sn2_ptc_deadlock_recovery(short *nasids, short nix, int mynasid, volatile unsigned long *ptc0, unsigned long data0,
volatile unsigned long *ptc1, unsigned long data1)
{
extern void sn2_ptc_deadlock_recovery_core(volatile unsigned long *, unsigned long,
volatile unsigned long *, unsigned long, volatile unsigned long *, unsigned long);
- int cnode, mycnode, nasid;
- volatile unsigned long *piows;
- volatile unsigned long zeroval;
+ short nasid, i;
+ unsigned long *piows, zeroval;
- sn2_ptc_deadlock_count++;
+ __get_cpu_var(ptcstats).deadlocks++;
- piows = pda->pio_write_status_addr;
+ piows = (unsigned long *) pda->pio_write_status_addr;
zeroval = pda->pio_write_status_val;
- mycnode = numa_node_id();
-
- for_each_online_node(cnode) {
- if (is_headless_node(cnode) || cnode == mycnode)
+ for (i=0; i < nix; i++) {
+ nasid = nasids[i];
+ if (!(sn2_ptctest & 3) && nasid == mynasid)
continue;
- nasid = cnodeid_to_nasid(cnode);
ptc0 = CHANGE_NASID(nasid, ptc0);
if (ptc1)
ptc1 = CHANGE_NASID(nasid, ptc1);
sn2_ptc_deadlock_recovery_core(ptc0, data0, ptc1, data1, piows, zeroval);
}
+
}
/**
@@ -293,3 +420,93 @@ void sn2_send_IPI(int cpuid, int vector, int delivery_mode, int redirect)
sn_send_IPI_phys(nasid, physid, vector, delivery_mode);
}
+
+#ifdef CONFIG_PROC_FS
+
+#define PTC_BASENAME "sgi_sn/ptc_statistics"
+
+static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
+{
+ if (*offset < NR_CPUS)
+ return offset;
+ return NULL;
+}
+
+static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset)
+{
+ (*offset)++;
+ if (*offset < NR_CPUS)
+ return offset;
+ return NULL;
+}
+
+static void sn2_ptc_seq_stop(struct seq_file *file, void *data)
+{
+}
+
+static int sn2_ptc_seq_show(struct seq_file *file, void *data)
+{
+ struct ptc_stats *stat;
+ int cpu;
+
+ cpu = *(loff_t *) data;
+
+ if (!cpu) {
+ seq_printf(file, "# ptc_l change_rid shub_ptc_flushes shub_nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max\n");
+ seq_printf(file, "# ptctest %d\n", sn2_ptctest);
+ }
+
+ if (cpu < NR_CPUS && cpu_online(cpu)) {
+ stat = &per_cpu(ptcstats, cpu);
+ seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
+ stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
+ stat->deadlocks,
+ 1000 * stat->lock_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
+ 1000 * stat->shub_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
+ 1000 * stat->shub_itc_clocks_max / per_cpu(cpu_info, cpu).cyc_per_usec);
+ }
+
+ return 0;
+}
+
+static struct seq_operations sn2_ptc_seq_ops = {
+ .start = sn2_ptc_seq_start,
+ .next = sn2_ptc_seq_next,
+ .stop = sn2_ptc_seq_stop,
+ .show = sn2_ptc_seq_show
+};
+
+int sn2_ptc_proc_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &sn2_ptc_seq_ops);
+}
+
+static struct file_operations proc_sn2_ptc_operations = {
+ .open = sn2_ptc_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct proc_dir_entry *proc_sn2_ptc;
+
+static int __init sn2_ptc_init(void)
+{
+ if (!(proc_sn2_ptc = create_proc_entry(PTC_BASENAME, 0444, NULL))) {
+ printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME);
+ return -EINVAL;
+ }
+ proc_sn2_ptc->proc_fops = &proc_sn2_ptc_operations;
+ spin_lock_init(&sn2_global_ptc_lock);
+ return 0;
+}
+
+static void __exit sn2_ptc_exit(void)
+{
+ remove_proc_entry(PTC_BASENAME, NULL);
+}
+
+module_init(sn2_ptc_init);
+module_exit(sn2_ptc_exit);
+#endif /* CONFIG_PROC_FS */
+
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 833e700fdac9..6c6fbca3229c 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -36,7 +36,6 @@
#include <asm/topology.h>
#include <asm/smp.h>
#include <asm/semaphore.h>
-#include <asm/segment.h>
#include <asm/uaccess.h>
#include <asm/sal.h>
#include <asm/sn/io.h>
@@ -59,7 +58,7 @@ static int sn_hwperf_enum_objects(int *nobj, struct sn_hwperf_object_info **ret)
struct sn_hwperf_object_info *objbuf = NULL;
if ((e = sn_hwperf_init()) < 0) {
- printk("sn_hwperf_init failed: err %d\n", e);
+ printk(KERN_ERR "sn_hwperf_init failed: err %d\n", e);
goto out;
}
@@ -111,7 +110,7 @@ static int sn_hwperf_geoid_to_cnode(char *location)
if (sn_hwperf_location_to_bpos(location, &rack, &bay, &slot, &slab))
return -1;
- for (cnode = 0; cnode < numionodes; cnode++) {
+ for_each_node(cnode) {
geoid = cnodeid_get_geoid(cnode);
module_id = geo_module(geoid);
this_rack = MODULE_GET_RACK(module_id);
@@ -124,11 +123,13 @@ static int sn_hwperf_geoid_to_cnode(char *location)
}
}
- return cnode < numionodes ? cnode : -1;
+ return node_possible(cnode) ? cnode : -1;
}
static int sn_hwperf_obj_to_cnode(struct sn_hwperf_object_info * obj)
{
+ if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj))
+ BUG();
if (!obj->sn_hwp_this_part)
return -1;
return sn_hwperf_geoid_to_cnode(obj->location);
@@ -174,31 +175,199 @@ static const char *sn_hwperf_get_slabname(struct sn_hwperf_object_info *obj,
return slabname;
}
-static void print_pci_topology(struct seq_file *s,
- struct sn_hwperf_object_info *obj, int *ordinal,
- u64 rack, u64 bay, u64 slot, u64 slab)
+static void print_pci_topology(struct seq_file *s)
+{
+ char *p;
+ size_t sz;
+ int e;
+
+ for (sz = PAGE_SIZE; sz < 16 * PAGE_SIZE; sz += PAGE_SIZE) {
+ if (!(p = (char *)kmalloc(sz, GFP_KERNEL)))
+ break;
+ e = ia64_sn_ioif_get_pci_topology(__pa(p), sz);
+ if (e == SALRET_OK)
+ seq_puts(s, p);
+ kfree(p);
+ if (e == SALRET_OK || e == SALRET_NOT_IMPLEMENTED)
+ break;
+ }
+}
+
+static inline int sn_hwperf_has_cpus(cnodeid_t node)
+{
+ return node_online(node) && nr_cpus_node(node);
+}
+
+static inline int sn_hwperf_has_mem(cnodeid_t node)
+{
+ return node_online(node) && NODE_DATA(node)->node_present_pages;
+}
+
+static struct sn_hwperf_object_info *
+sn_hwperf_findobj_id(struct sn_hwperf_object_info *objbuf,
+ int nobj, int id)
{
- char *p1;
- char *p2;
- char *pg;
-
- if (!(pg = (char *)get_zeroed_page(GFP_KERNEL)))
- return; /* ignore */
- if (ia64_sn_ioif_get_pci_topology(rack, bay, slot, slab,
- __pa(pg), PAGE_SIZE) == SN_HWPERF_OP_OK) {
- for (p1=pg; *p1 && p1 < pg + PAGE_SIZE;) {
- if (!(p2 = strchr(p1, '\n')))
+ int i;
+ struct sn_hwperf_object_info *p = objbuf;
+
+ for (i=0; i < nobj; i++, p++) {
+ if (p->id == id)
+ return p;
+ }
+
+ return NULL;
+
+}
+
+static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objbuf,
+ int nobj, cnodeid_t node, cnodeid_t *near_mem_node, cnodeid_t *near_cpu_node)
+{
+ int e;
+ struct sn_hwperf_object_info *nodeobj = NULL;
+ struct sn_hwperf_object_info *op;
+ struct sn_hwperf_object_info *dest;
+ struct sn_hwperf_object_info *router;
+ struct sn_hwperf_port_info ptdata[16];
+ int sz, i, j;
+ cnodeid_t c;
+ int found_mem = 0;
+ int found_cpu = 0;
+
+ if (!node_possible(node))
+ return -EINVAL;
+
+ if (sn_hwperf_has_cpus(node)) {
+ if (near_cpu_node)
+ *near_cpu_node = node;
+ found_cpu++;
+ }
+
+ if (sn_hwperf_has_mem(node)) {
+ if (near_mem_node)
+ *near_mem_node = node;
+ found_mem++;
+ }
+
+ if (found_cpu && found_mem)
+ return 0; /* trivially successful */
+
+ /* find the argument node object */
+ for (i=0, op=objbuf; i < nobj; i++, op++) {
+ if (!SN_HWPERF_IS_NODE(op) && !SN_HWPERF_IS_IONODE(op))
+ continue;
+ if (node == sn_hwperf_obj_to_cnode(op)) {
+ nodeobj = op;
+ break;
+ }
+ }
+ if (!nodeobj) {
+ e = -ENOENT;
+ goto err;
+ }
+
+ /* get it's interconnect topology */
+ sz = op->ports * sizeof(struct sn_hwperf_port_info);
+ if (sz > sizeof(ptdata))
+ BUG();
+ e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
+ SN_HWPERF_ENUM_PORTS, nodeobj->id, sz,
+ (u64)&ptdata, 0, 0, NULL);
+ if (e != SN_HWPERF_OP_OK) {
+ e = -EINVAL;
+ goto err;
+ }
+
+ /* find nearest node with cpus and nearest memory */
+ for (router=NULL, j=0; j < op->ports; j++) {
+ dest = sn_hwperf_findobj_id(objbuf, nobj, ptdata[j].conn_id);
+ if (!dest || SN_HWPERF_FOREIGN(dest) ||
+ !SN_HWPERF_IS_NODE(dest) || SN_HWPERF_IS_IONODE(dest)) {
+ continue;
+ }
+ c = sn_hwperf_obj_to_cnode(dest);
+ if (!found_cpu && sn_hwperf_has_cpus(c)) {
+ if (near_cpu_node)
+ *near_cpu_node = c;
+ found_cpu++;
+ }
+ if (!found_mem && sn_hwperf_has_mem(c)) {
+ if (near_mem_node)
+ *near_mem_node = c;
+ found_mem++;
+ }
+ if (SN_HWPERF_IS_ROUTER(dest))
+ router = dest;
+ }
+
+ if (router && (!found_cpu || !found_mem)) {
+ /* search for a node connected to the same router */
+ sz = router->ports * sizeof(struct sn_hwperf_port_info);
+ if (sz > sizeof(ptdata))
+ BUG();
+ e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
+ SN_HWPERF_ENUM_PORTS, router->id, sz,
+ (u64)&ptdata, 0, 0, NULL);
+ if (e != SN_HWPERF_OP_OK) {
+ e = -EINVAL;
+ goto err;
+ }
+ for (j=0; j < router->ports; j++) {
+ dest = sn_hwperf_findobj_id(objbuf, nobj,
+ ptdata[j].conn_id);
+ if (!dest || dest->id == node ||
+ SN_HWPERF_FOREIGN(dest) ||
+ !SN_HWPERF_IS_NODE(dest) ||
+ SN_HWPERF_IS_IONODE(dest)) {
+ continue;
+ }
+ c = sn_hwperf_obj_to_cnode(dest);
+ if (!found_cpu && sn_hwperf_has_cpus(c)) {
+ if (near_cpu_node)
+ *near_cpu_node = c;
+ found_cpu++;
+ }
+ if (!found_mem && sn_hwperf_has_mem(c)) {
+ if (near_mem_node)
+ *near_mem_node = c;
+ found_mem++;
+ }
+ if (found_cpu && found_mem)
+ break;
+ }
+ }
+
+ if (!found_cpu || !found_mem) {
+ /* resort to _any_ node with CPUs and memory */
+ for (i=0, op=objbuf; i < nobj; i++, op++) {
+ if (SN_HWPERF_FOREIGN(op) ||
+ SN_HWPERF_IS_IONODE(op) ||
+ !SN_HWPERF_IS_NODE(op)) {
+ continue;
+ }
+ c = sn_hwperf_obj_to_cnode(op);
+ if (!found_cpu && sn_hwperf_has_cpus(c)) {
+ if (near_cpu_node)
+ *near_cpu_node = c;
+ found_cpu++;
+ }
+ if (!found_mem && sn_hwperf_has_mem(c)) {
+ if (near_mem_node)
+ *near_mem_node = c;
+ found_mem++;
+ }
+ if (found_cpu && found_mem)
break;
- *p2 = '\0';
- seq_printf(s, "pcibus %d %s-%s\n",
- *ordinal, obj->location, p1);
- (*ordinal)++;
- p1 = p2 + 1;
}
}
- free_page((unsigned long)pg);
+
+ if (!found_cpu || !found_mem)
+ e = -ENODATA;
+
+err:
+ return e;
}
+
static int sn_topology_show(struct seq_file *s, void *d)
{
int sz;
@@ -215,7 +384,6 @@ static int sn_topology_show(struct seq_file *s, void *d)
struct sn_hwperf_object_info *p;
struct sn_hwperf_object_info *obj = d; /* this object */
struct sn_hwperf_object_info *objs = s->private; /* all objects */
- int rack, bay, slot, slab;
u8 shubtype;
u8 system_size;
u8 sharing_size;
@@ -225,7 +393,6 @@ static int sn_topology_show(struct seq_file *s, void *d)
u8 region_size;
u16 nasid_mask;
int nasid_msb;
- int pci_bus_ordinal = 0;
if (obj == objs) {
seq_printf(s, "# sn_topology version 2\n");
@@ -253,6 +420,8 @@ static int sn_topology_show(struct seq_file *s, void *d)
shubtype ? "shub2" : "shub1",
(u64)nasid_mask << nasid_shift, nasid_msb, nasid_shift,
system_size, sharing_size, coher, region_size);
+
+ print_pci_topology(s);
}
if (SN_HWPERF_FOREIGN(obj)) {
@@ -272,11 +441,24 @@ static int sn_topology_show(struct seq_file *s, void *d)
if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj))
seq_putc(s, '\n');
else {
+ cnodeid_t near_mem = -1;
+ cnodeid_t near_cpu = -1;
+
seq_printf(s, ", nasid 0x%x", cnodeid_to_nasid(ordinal));
- for (i=0; i < numionodes; i++) {
- seq_printf(s, i ? ":%d" : ", dist %d",
- node_distance(ordinal, i));
+
+ if (sn_hwperf_get_nearest_node_objdata(objs, sn_hwperf_obj_cnt,
+ ordinal, &near_mem, &near_cpu) == 0) {
+ seq_printf(s, ", near_mem_nodeid %d, near_cpu_nodeid %d",
+ near_mem, near_cpu);
+ }
+
+ if (!SN_HWPERF_IS_IONODE(obj)) {
+ for_each_online_node(i) {
+ seq_printf(s, i ? ":%d" : ", dist %d",
+ node_distance(ordinal, i));
+ }
}
+
seq_putc(s, '\n');
/*
@@ -294,23 +476,12 @@ static int sn_topology_show(struct seq_file *s, void *d)
for_each_online_cpu(j) {
seq_printf(s, j ? ":%d" : ", dist %d",
node_distance(
- cpuid_to_cnodeid(i),
- cpuid_to_cnodeid(j)));
+ cpu_to_node(i),
+ cpu_to_node(j)));
}
seq_putc(s, '\n');
}
}
-
- /*
- * PCI busses attached to this node, if any
- */
- if (sn_hwperf_location_to_bpos(obj->location,
- &rack, &bay, &slot, &slab)) {
- /* export pci bus info */
- print_pci_topology(s, obj, &pci_bus_ordinal,
- rack, bay, slot, slab);
-
- }
}
if (obj->ports) {
@@ -572,6 +743,8 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg)
if ((r = sn_hwperf_enum_objects(&nobj, &objs)) == 0) {
memset(p, 0, a.sz);
for (i = 0; i < nobj; i++) {
+ if (!SN_HWPERF_IS_NODE(objs + i))
+ continue;
node = sn_hwperf_obj_to_cnode(objs + i);
for_each_online_cpu(j) {
if (node != cpu_to_node(j))
@@ -598,7 +771,7 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg)
case SN_HWPERF_GET_NODE_NASID:
if (a.sz != sizeof(u64) ||
- (node = a.arg) < 0 || node >= numionodes) {
+ (node = a.arg) < 0 || !node_possible(node)) {
r = -EINVAL;
goto error;
}
@@ -627,6 +800,14 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg)
vfree(objs);
goto error;
}
+
+ if (!SN_HWPERF_IS_NODE(objs + i) &&
+ !SN_HWPERF_IS_IONODE(objs + i)) {
+ r = -ENOENT;
+ vfree(objs);
+ goto error;
+ }
+
*(u64 *)p = (u64)sn_hwperf_obj_to_cnode(objs + i);
vfree(objs);
}
@@ -692,6 +873,7 @@ static int sn_hwperf_init(void)
/* single threaded, once-only initialization */
down(&sn_hwperf_init_mutex);
+
if (sn_hwperf_salheap) {
up(&sn_hwperf_init_mutex);
return e;
@@ -742,19 +924,6 @@ out:
sn_hwperf_salheap = NULL;
sn_hwperf_obj_cnt = 0;
}
-
- if (!e) {
- /*
- * Register a dynamic misc device for ioctl. Platforms
- * supporting hotplug will create /dev/sn_hwperf, else
- * user can to look up the minor number in /proc/misc.
- */
- if ((e = misc_register(&sn_hwperf_dev)) != 0) {
- printk(KERN_ERR "sn_hwperf_init: misc register "
- "for \"sn_hwperf\" failed, err %d\n", e);
- }
- }
-
up(&sn_hwperf_init_mutex);
return e;
}
@@ -782,3 +951,41 @@ int sn_topology_release(struct inode *inode, struct file *file)
vfree(seq->private);
return seq_release(inode, file);
}
+
+int sn_hwperf_get_nearest_node(cnodeid_t node,
+ cnodeid_t *near_mem_node, cnodeid_t *near_cpu_node)
+{
+ int e;
+ int nobj;
+ struct sn_hwperf_object_info *objbuf;
+
+ if ((e = sn_hwperf_enum_objects(&nobj, &objbuf)) == 0) {
+ e = sn_hwperf_get_nearest_node_objdata(objbuf, nobj,
+ node, near_mem_node, near_cpu_node);
+ vfree(objbuf);
+ }
+
+ return e;
+}
+
+static int __devinit sn_hwperf_misc_register_init(void)
+{
+ int e;
+
+ sn_hwperf_init();
+
+ /*
+ * Register a dynamic misc device for hwperf ioctls. Platforms
+ * supporting hotplug will create /dev/sn_hwperf, else user
+ * can to look up the minor number in /proc/misc.
+ */
+ if ((e = misc_register(&sn_hwperf_dev)) != 0) {
+ printk(KERN_ERR "sn_hwperf_misc_register_init: failed to "
+ "register misc device for \"%s\"\n", sn_hwperf_dev.name);
+ }
+
+ return e;
+}
+
+device_initcall(sn_hwperf_misc_register_init); /* after misc_init() */
+EXPORT_SYMBOL(sn_hwperf_get_nearest_node);
diff --git a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
index 6a80fca807b9..a06719d752a0 100644
--- a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
+++ b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
@@ -3,7 +3,7 @@
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
- * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/config.h>
#include <asm/uaccess.h>
@@ -15,7 +15,7 @@
static int partition_id_show(struct seq_file *s, void *p)
{
- seq_printf(s, "%d\n", sn_local_partid());
+ seq_printf(s, "%d\n", sn_partition_id);
return 0;
}
@@ -52,7 +52,7 @@ static int licenseID_open(struct inode *inode, struct file *file)
* the bridge chip. The hardware will then send an interrupt message if the
* interrupt line is active. This mimics a level sensitive interrupt.
*/
-int sn_force_interrupt_flag = 1;
+extern int sn_force_interrupt_flag;
static int sn_force_interrupt_show(struct seq_file *s, void *p)
{
diff --git a/arch/ia64/sn/kernel/sn2/timer_interrupt.c b/arch/ia64/sn/kernel/sn2/timer_interrupt.c
index cde7375390b0..adf5db2e2afe 100644
--- a/arch/ia64/sn/kernel/sn2/timer_interrupt.c
+++ b/arch/ia64/sn/kernel/sn2/timer_interrupt.c
@@ -1,7 +1,7 @@
/*
*
*
- * Copyright (c) 2003 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2 of the GNU General Public License
@@ -50,14 +50,16 @@ void sn_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
LED_CPU_HEARTBEAT, LED_CPU_HEARTBEAT);
}
- if (enable_shub_wars_1_1()) {
- /* Bugfix code for SHUB 1.1 */
- if (pda->pio_shub_war_cam_addr)
- *pda->pio_shub_war_cam_addr = 0x8000000000000010UL;
+ if (is_shub1()) {
+ if (enable_shub_wars_1_1()) {
+ /* Bugfix code for SHUB 1.1 */
+ if (pda->pio_shub_war_cam_addr)
+ *pda->pio_shub_war_cam_addr = 0x8000000000000010UL;
+ }
+ if (pda->sn_lb_int_war_ticks == 0)
+ sn_lb_int_war_check();
+ pda->sn_lb_int_war_ticks++;
+ if (pda->sn_lb_int_war_ticks >= SN_LB_INT_WAR_INTERVAL)
+ pda->sn_lb_int_war_ticks = 0;
}
- if (pda->sn_lb_int_war_ticks == 0)
- sn_lb_int_war_check();
- pda->sn_lb_int_war_ticks++;
- if (pda->sn_lb_int_war_ticks >= SN_LB_INT_WAR_INTERVAL)
- pda->sn_lb_int_war_ticks = 0;
}
diff --git a/arch/ia64/sn/kernel/tiocx.c b/arch/ia64/sn/kernel/tiocx.c
index 254fe15c064b..0d8592a745a7 100644
--- a/arch/ia64/sn/kernel/tiocx.c
+++ b/arch/ia64/sn/kernel/tiocx.c
@@ -183,15 +183,16 @@ int cx_driver_unregister(struct cx_drv *cx_driver)
* @part_num: device's part number
* @mfg_num: device's manufacturer number
* @hubdev: hub info associated with this device
+ * @bt: board type of the device
*
*/
int
cx_device_register(nasid_t nasid, int part_num, int mfg_num,
- struct hubdev_info *hubdev)
+ struct hubdev_info *hubdev, int bt)
{
struct cx_dev *cx_dev;
- cx_dev = kcalloc(1, sizeof(struct cx_dev), GFP_KERNEL);
+ cx_dev = kzalloc(sizeof(struct cx_dev), GFP_KERNEL);
DBG("cx_dev= 0x%p\n", cx_dev);
if (cx_dev == NULL)
return -ENOMEM;
@@ -200,6 +201,7 @@ cx_device_register(nasid_t nasid, int part_num, int mfg_num,
cx_dev->cx_id.mfg_num = mfg_num;
cx_dev->cx_id.nasid = nasid;
cx_dev->hubdev = hubdev;
+ cx_dev->bt = bt;
cx_dev->dev.parent = NULL;
cx_dev->dev.bus = &tiocx_bus_type;
@@ -238,7 +240,8 @@ static int cx_device_reload(struct cx_dev *cx_dev)
{
cx_device_unregister(cx_dev);
return cx_device_register(cx_dev->cx_id.nasid, cx_dev->cx_id.part_num,
- cx_dev->cx_id.mfg_num, cx_dev->hubdev);
+ cx_dev->cx_id.mfg_num, cx_dev->hubdev,
+ cx_dev->bt);
}
static inline uint64_t tiocx_intr_alloc(nasid_t nasid, int widget,
@@ -365,26 +368,20 @@ static void tio_corelet_reset(nasid_t nasid, int corelet)
udelay(2000);
}
-static int tiocx_btchar_get(int nasid)
+static int is_fpga_tio(int nasid, int *bt)
{
- moduleid_t module_id;
- geoid_t geoid;
- int cnodeid;
-
- cnodeid = nasid_to_cnodeid(nasid);
- geoid = cnodeid_get_geoid(cnodeid);
- module_id = geo_module(geoid);
- return MODULE_GET_BTCHAR(module_id);
-}
+ int ioboard_type;
-static int is_fpga_brick(int nasid)
-{
- switch (tiocx_btchar_get(nasid)) {
+ ioboard_type = ia64_sn_sysctl_ioboard_get(nasid);
+
+ switch (ioboard_type) {
case L1_BRICKTYPE_SA:
case L1_BRICKTYPE_ATHENA:
- case L1_BRICKTYPE_DAYTONA:
+ case L1_BOARDTYPE_DAYTONA:
+ *bt = ioboard_type;
return 1;
}
+
return 0;
}
@@ -407,16 +404,22 @@ static int tiocx_reload(struct cx_dev *cx_dev)
if (bitstream_loaded(nasid)) {
uint64_t cx_id;
-
- cx_id =
- *(volatile uint64_t *)(TIO_SWIN_BASE(nasid, TIOCX_CORELET) +
+ int rv;
+
+ rv = ia64_sn_sysctl_tio_clock_reset(nasid);
+ if (rv) {
+ printk(KERN_ALERT "CX port JTAG reset failed.\n");
+ } else {
+ cx_id = *(volatile uint64_t *)
+ (TIO_SWIN_BASE(nasid, TIOCX_CORELET) +
WIDGET_ID);
- part_num = XWIDGET_PART_NUM(cx_id);
- mfg_num = XWIDGET_MFG_NUM(cx_id);
- DBG("part= 0x%x, mfg= 0x%x\n", part_num, mfg_num);
- /* just ignore it if it's a CE */
- if (part_num == TIO_CE_ASIC_PARTNUM)
- return 0;
+ part_num = XWIDGET_PART_NUM(cx_id);
+ mfg_num = XWIDGET_MFG_NUM(cx_id);
+ DBG("part= 0x%x, mfg= 0x%x\n", part_num, mfg_num);
+ /* just ignore it if it's a CE */
+ if (part_num == TIO_CE_ASIC_PARTNUM)
+ return 0;
+ }
}
cx_dev->cx_id.part_num = part_num;
@@ -436,10 +439,10 @@ static ssize_t show_cxdev_control(struct device *dev, struct device_attribute *a
{
struct cx_dev *cx_dev = to_cx_dev(dev);
- return sprintf(buf, "0x%x 0x%x 0x%x %d\n",
+ return sprintf(buf, "0x%x 0x%x 0x%x 0x%x\n",
cx_dev->cx_id.nasid,
cx_dev->cx_id.part_num, cx_dev->cx_id.mfg_num,
- tiocx_btchar_get(cx_dev->cx_id.nasid));
+ cx_dev->bt);
}
static ssize_t store_cxdev_control(struct device *dev, struct device_attribute *attr, const char *buf,
@@ -486,13 +489,13 @@ static int __init tiocx_init(void)
bus_register(&tiocx_bus_type);
- for (cnodeid = 0; cnodeid < MAX_COMPACT_NODES; cnodeid++) {
+ for (cnodeid = 0; cnodeid < num_cnodes; cnodeid++) {
nasid_t nasid;
+ int bt;
- if ((nasid = cnodeid_to_nasid(cnodeid)) < 0)
- break; /* No more nasids .. bail out of loop */
+ nasid = cnodeid_to_nasid(cnodeid);
- if ((nasid & 0x1) && is_fpga_brick(nasid)) {
+ if ((nasid & 0x1) && is_fpga_tio(nasid, &bt)) {
struct hubdev_info *hubdev;
struct xwidget_info *widgetp;
@@ -512,7 +515,7 @@ static int __init tiocx_init(void)
if (cx_device_register
(nasid, widgetp->xwi_hwid.part_num,
- widgetp->xwi_hwid.mfg_num, hubdev) < 0)
+ widgetp->xwi_hwid.mfg_num, hubdev, bt) < 0)
return -ENXIO;
else
found_tiocx_device++;
diff --git a/arch/ia64/sn/kernel/xpc.h b/arch/ia64/sn/kernel/xpc.h
index d0ee635daf2e..fbcedc7c27fa 100644
--- a/arch/ia64/sn/kernel/xpc.h
+++ b/arch/ia64/sn/kernel/xpc.h
@@ -57,7 +57,7 @@
#define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2)
#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */
-#define XPC_HB_CHECK_DEFAULT_TIMEOUT 20 /* check HB every x secs */
+#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */
/* define the process name of HB checker and the CPU it is pinned to */
#define XPC_HB_CHECK_THREAD_NAME "xpc_hb"
@@ -67,34 +67,82 @@
#define XPC_DISCOVERY_THREAD_NAME "xpc_discovery"
-#define XPC_HB_ALLOWED(_p, _v) ((_v)->heartbeating_to_mask & (1UL << (_p)))
-#define XPC_ALLOW_HB(_p, _v) (_v)->heartbeating_to_mask |= (1UL << (_p))
-#define XPC_DISALLOW_HB(_p, _v) (_v)->heartbeating_to_mask &= (~(1UL << (_p)))
-
-
/*
- * Reserved Page provided by SAL.
+ * the reserved page
+ *
+ * SAL reserves one page of memory per partition for XPC. Though a full page
+ * in length (16384 bytes), its starting address is not page aligned, but it
+ * is cacheline aligned. The reserved page consists of the following:
+ *
+ * reserved page header
+ *
+ * The first cacheline of the reserved page contains the header
+ * (struct xpc_rsvd_page). Before SAL initialization has completed,
+ * SAL has set up the following fields of the reserved page header:
+ * SAL_signature, SAL_version, partid, and nasids_size. The other
+ * fields are set up by XPC. (xpc_rsvd_page points to the local
+ * partition's reserved page.)
*
- * SAL provides one page per partition of reserved memory. When SAL
- * initialization is complete, SAL_signature, SAL_version, partid,
- * part_nasids, and mach_nasids are set.
+ * part_nasids mask
+ * mach_nasids mask
+ *
+ * SAL also sets up two bitmaps (or masks), one that reflects the actual
+ * nasids in this partition (part_nasids), and the other that reflects
+ * the actual nasids in the entire machine (mach_nasids). We're only
+ * interested in the even numbered nasids (which contain the processors
+ * and/or memory), so we only need half as many bits to represent the
+ * nasids. The part_nasids mask is located starting at the first cacheline
+ * following the reserved page header. The mach_nasids mask follows right
+ * after the part_nasids mask. The size in bytes of each mask is reflected
+ * by the reserved page header field 'nasids_size'. (Local partition's
+ * mask pointers are xpc_part_nasids and xpc_mach_nasids.)
+ *
+ * vars
+ * vars part
+ *
+ * Immediately following the mach_nasids mask are the XPC variables
+ * required by other partitions. First are those that are generic to all
+ * partitions (vars), followed on the next available cacheline by those
+ * which are partition specific (vars part). These are setup by XPC.
+ * (Local partition's vars pointers are xpc_vars and xpc_vars_part.)
*
* Note: Until vars_pa is set, the partition XPC code has not been initialized.
*/
struct xpc_rsvd_page {
- u64 SAL_signature; /* SAL unique signature */
- u64 SAL_version; /* SAL specified version */
- u8 partid; /* partition ID from SAL */
+ u64 SAL_signature; /* SAL: unique signature */
+ u64 SAL_version; /* SAL: version */
+ u8 partid; /* SAL: partition ID */
u8 version;
- u8 pad[6]; /* pad to u64 align */
+ u8 pad1[6]; /* align to next u64 in cacheline */
volatile u64 vars_pa;
- u64 part_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned;
- u64 mach_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned;
+ struct timespec stamp; /* time when reserved page was setup by XPC */
+ u64 pad2[9]; /* align to last u64 in cacheline */
+ u64 nasids_size; /* SAL: size of each nasid mask in bytes */
};
-#define XPC_RP_VERSION _XPC_VERSION(1,0) /* version 1.0 of the reserved page */
-#define XPC_RSVD_PAGE_ALIGNED_SIZE \
- (L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page)))
+#define XPC_RP_VERSION _XPC_VERSION(1,1) /* version 1.1 of the reserved page */
+
+#define XPC_SUPPORTS_RP_STAMP(_version) \
+ (_version >= _XPC_VERSION(1,1))
+
+/*
+ * compare stamps - the return value is:
+ *
+ * < 0, if stamp1 < stamp2
+ * = 0, if stamp1 == stamp2
+ * > 0, if stamp1 > stamp2
+ */
+static inline int
+xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2)
+{
+ int ret;
+
+
+ if ((ret = stamp1->tv_sec - stamp2->tv_sec) == 0) {
+ ret = stamp1->tv_nsec - stamp2->tv_nsec;
+ }
+ return ret;
+}
/*
@@ -121,11 +169,58 @@ struct xpc_vars {
u64 vars_part_pa;
u64 amos_page_pa; /* paddr of page of AMOs from MSPEC driver */
AMO_t *amos_page; /* vaddr of page of AMOs from MSPEC driver */
- AMO_t *act_amos; /* pointer to the first activation AMO */
};
-#define XPC_V_VERSION _XPC_VERSION(3,0) /* version 3.0 of the cross vars */
-#define XPC_VARS_ALIGNED_SIZE (L1_CACHE_ALIGN(sizeof(struct xpc_vars)))
+#define XPC_V_VERSION _XPC_VERSION(3,1) /* version 3.1 of the cross vars */
+
+#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \
+ (_version >= _XPC_VERSION(3,1))
+
+
+static inline int
+xpc_hb_allowed(partid_t partid, struct xpc_vars *vars)
+{
+ return ((vars->heartbeating_to_mask & (1UL << partid)) != 0);
+}
+
+static inline void
+xpc_allow_hb(partid_t partid, struct xpc_vars *vars)
+{
+ u64 old_mask, new_mask;
+
+ do {
+ old_mask = vars->heartbeating_to_mask;
+ new_mask = (old_mask | (1UL << partid));
+ } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
+ old_mask);
+}
+
+static inline void
+xpc_disallow_hb(partid_t partid, struct xpc_vars *vars)
+{
+ u64 old_mask, new_mask;
+
+ do {
+ old_mask = vars->heartbeating_to_mask;
+ new_mask = (old_mask & ~(1UL << partid));
+ } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
+ old_mask);
+}
+
+
+/*
+ * The AMOs page consists of a number of AMO variables which are divided into
+ * four groups, The first two groups are used to identify an IRQ's sender.
+ * These two groups consist of 64 and 128 AMO variables respectively. The last
+ * two groups, consisting of just one AMO variable each, are used to identify
+ * the remote partitions that are currently engaged (from the viewpoint of
+ * the XPC running on the remote partition).
+ */
+#define XPC_NOTIFY_IRQ_AMOS 0
+#define XPC_ACTIVATE_IRQ_AMOS (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS)
+#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS)
+#define XPC_DISENGAGE_REQUEST_AMO (XPC_ENGAGED_PARTITIONS_AMO + 1)
+
/*
* The following structure describes the per partition specific variables.
@@ -165,6 +260,16 @@ struct xpc_vars_part {
#define XPC_VP_MAGIC2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */
+/* the reserved page sizes and offsets */
+
+#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
+#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars))
+
+#define XPC_RP_PART_NASIDS(_rp) (u64 *) ((u8 *) _rp + XPC_RP_HEADER_SIZE)
+#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + xp_nasid_mask_words)
+#define XPC_RP_VARS(_rp) ((struct xpc_vars *) XPC_RP_MACH_NASIDS(_rp) + xp_nasid_mask_words)
+#define XPC_RP_VARS_PART(_rp) (struct xpc_vars_part *) ((u8 *) XPC_RP_VARS(rp) + XPC_RP_VARS_SIZE)
+
/*
* Functions registered by add_timer() or called by kernel_thread() only
@@ -349,6 +454,9 @@ struct xpc_channel {
atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */
wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */
+ u8 delayed_IPI_flags; /* IPI flags received, but delayed */
+ /* action until channel disconnected */
+
/* queue of msg senders who want to be notified when msg received */
atomic_t n_to_notify; /* #of msg senders to notify */
@@ -358,7 +466,7 @@ struct xpc_channel {
void *key; /* pointer to user's key */
struct semaphore msg_to_pull_sema; /* next msg to pull serialization */
- struct semaphore teardown_sema; /* wait for teardown completion */
+ struct semaphore wdisconnect_sema; /* wait for channel disconnect */
struct xpc_openclose_args *local_openclose_args; /* args passed on */
/* opening or closing of channel */
@@ -410,6 +518,8 @@ struct xpc_channel {
#define XPC_C_DISCONNECTED 0x00002000 /* channel is disconnected */
#define XPC_C_DISCONNECTING 0x00004000 /* channel is being disconnected */
+#define XPC_C_DISCONNECTCALLOUT 0x00008000 /* chan disconnected callout made */
+#define XPC_C_WDISCONNECT 0x00010000 /* waiting for channel disconnect */
@@ -422,6 +532,8 @@ struct xpc_partition {
/* XPC HB infrastructure */
+ u8 remote_rp_version; /* version# of partition's rsvd pg */
+ struct timespec remote_rp_stamp;/* time when rsvd pg was initialized */
u64 remote_rp_pa; /* phys addr of partition's rsvd pg */
u64 remote_vars_pa; /* phys addr of partition's vars */
u64 remote_vars_part_pa; /* phys addr of partition's vars part */
@@ -432,14 +544,18 @@ struct xpc_partition {
u32 act_IRQ_rcvd; /* IRQs since activation */
spinlock_t act_lock; /* protect updating of act_state */
u8 act_state; /* from XPC HB viewpoint */
+ u8 remote_vars_version; /* version# of partition's vars */
enum xpc_retval reason; /* reason partition is deactivating */
int reason_line; /* line# deactivation initiated from */
int reactivate_nasid; /* nasid in partition to reactivate */
+ unsigned long disengage_request_timeout; /* timeout in jiffies */
+ struct timer_list disengage_request_timer;
+
/* XPC infrastructure referencing and teardown control */
- volatile u8 setup_state; /* infrastructure setup state */
+ volatile u8 setup_state; /* infrastructure setup state */
wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */
atomic_t references; /* #of references to infrastructure */
@@ -454,6 +570,7 @@ struct xpc_partition {
u8 nchannels; /* #of defined channels supported */
atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */
+ atomic_t nchannels_engaged;/* #of channels engaged with remote part */
struct xpc_channel *channels;/* array of channel structures */
void *local_GPs_base; /* base address of kmalloc'd space */
@@ -518,6 +635,7 @@ struct xpc_partition {
#define XPC_P_TORNDOWN 0x03 /* infrastructure is torndown */
+
/*
* struct xpc_partition IPI_timer #of seconds to wait before checking for
* dropped IPIs. These occur whenever an IPI amo write doesn't complete until
@@ -526,6 +644,13 @@ struct xpc_partition {
#define XPC_P_DROPPED_IPI_WAIT (0.25 * HZ)
+/* number of seconds to wait for other partitions to disengage */
+#define XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT 90
+
+/* interval in seconds to print 'waiting disengagement' messages */
+#define XPC_DISENGAGE_PRINTMSG_INTERVAL 10
+
+
#define XPC_PARTID(_p) ((partid_t) ((_p) - &xpc_partitions[0]))
@@ -534,24 +659,20 @@ struct xpc_partition {
extern struct xpc_registration xpc_registrations[];
-/* >>> found in xpc_main.c only */
+/* found in xpc_main.c */
extern struct device *xpc_part;
extern struct device *xpc_chan;
+extern int xpc_disengage_request_timelimit;
extern irqreturn_t xpc_notify_IRQ_handler(int, void *, struct pt_regs *);
extern void xpc_dropped_IPI_check(struct xpc_partition *);
+extern void xpc_activate_partition(struct xpc_partition *);
extern void xpc_activate_kthreads(struct xpc_channel *, int);
extern void xpc_create_kthreads(struct xpc_channel *, int);
extern void xpc_disconnect_wait(int);
-/* found in xpc_main.c and efi-xpc.c */
-extern void xpc_activate_partition(struct xpc_partition *);
-
-
/* found in xpc_partition.c */
extern int xpc_exiting;
-extern int xpc_hb_interval;
-extern int xpc_hb_check_interval;
extern struct xpc_vars *xpc_vars;
extern struct xpc_rsvd_page *xpc_rsvd_page;
extern struct xpc_vars_part *xpc_vars_part;
@@ -561,6 +682,7 @@ extern struct xpc_rsvd_page *xpc_rsvd_page_init(void);
extern void xpc_allow_IPI_ops(void);
extern void xpc_restrict_IPI_ops(void);
extern int xpc_identify_act_IRQ_sender(void);
+extern int xpc_partition_disengaged(struct xpc_partition *);
extern enum xpc_retval xpc_mark_partition_active(struct xpc_partition *);
extern void xpc_mark_partition_inactive(struct xpc_partition *);
extern void xpc_discovery(void);
@@ -585,8 +707,8 @@ extern void xpc_connected_callout(struct xpc_channel *);
extern void xpc_deliver_msg(struct xpc_channel *);
extern void xpc_disconnect_channel(const int, struct xpc_channel *,
enum xpc_retval, unsigned long *);
-extern void xpc_disconnected_callout(struct xpc_channel *);
-extern void xpc_partition_down(struct xpc_partition *, enum xpc_retval);
+extern void xpc_disconnecting_callout(struct xpc_channel *);
+extern void xpc_partition_going_down(struct xpc_partition *, enum xpc_retval);
extern void xpc_teardown_infrastructure(struct xpc_partition *);
@@ -674,6 +796,157 @@ xpc_part_ref(struct xpc_partition *part)
/*
+ * This next set of inlines are used to keep track of when a partition is
+ * potentially engaged in accessing memory belonging to another partition.
+ */
+
+static inline void
+xpc_mark_partition_engaged(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+ AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
+ (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t)));
+
+
+ local_irq_save(irq_flags);
+
+ /* set bit corresponding to our partid in remote partition's AMO */
+ FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR,
+ (1UL << sn_partition_id));
+ /*
+ * We must always use the nofault function regardless of whether we
+ * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+ * didn't, we'd never know that the other partition is down and would
+ * keep sending IPIs and AMOs to it until the heartbeat times out.
+ */
+ (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
+ variable), xp_nofault_PIOR_target));
+
+ local_irq_restore(irq_flags);
+}
+
+static inline void
+xpc_mark_partition_disengaged(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+ AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
+ (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t)));
+
+
+ local_irq_save(irq_flags);
+
+ /* clear bit corresponding to our partid in remote partition's AMO */
+ FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
+ ~(1UL << sn_partition_id));
+ /*
+ * We must always use the nofault function regardless of whether we
+ * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+ * didn't, we'd never know that the other partition is down and would
+ * keep sending IPIs and AMOs to it until the heartbeat times out.
+ */
+ (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
+ variable), xp_nofault_PIOR_target));
+
+ local_irq_restore(irq_flags);
+}
+
+static inline void
+xpc_request_partition_disengage(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+ AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
+ (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
+
+
+ local_irq_save(irq_flags);
+
+ /* set bit corresponding to our partid in remote partition's AMO */
+ FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR,
+ (1UL << sn_partition_id));
+ /*
+ * We must always use the nofault function regardless of whether we
+ * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+ * didn't, we'd never know that the other partition is down and would
+ * keep sending IPIs and AMOs to it until the heartbeat times out.
+ */
+ (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
+ variable), xp_nofault_PIOR_target));
+
+ local_irq_restore(irq_flags);
+}
+
+static inline void
+xpc_cancel_partition_disengage_request(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+ AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
+ (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
+
+
+ local_irq_save(irq_flags);
+
+ /* clear bit corresponding to our partid in remote partition's AMO */
+ FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
+ ~(1UL << sn_partition_id));
+ /*
+ * We must always use the nofault function regardless of whether we
+ * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+ * didn't, we'd never know that the other partition is down and would
+ * keep sending IPIs and AMOs to it until the heartbeat times out.
+ */
+ (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
+ variable), xp_nofault_PIOR_target));
+
+ local_irq_restore(irq_flags);
+}
+
+static inline u64
+xpc_partition_engaged(u64 partid_mask)
+{
+ AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
+
+
+ /* return our partition's AMO variable ANDed with partid_mask */
+ return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) &
+ partid_mask);
+}
+
+static inline u64
+xpc_partition_disengage_requested(u64 partid_mask)
+{
+ AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
+
+
+ /* return our partition's AMO variable ANDed with partid_mask */
+ return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) &
+ partid_mask);
+}
+
+static inline void
+xpc_clear_partition_engaged(u64 partid_mask)
+{
+ AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
+
+
+ /* clear bit(s) based on partid_mask in our partition's AMO */
+ FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
+ ~partid_mask);
+}
+
+static inline void
+xpc_clear_partition_disengage_request(u64 partid_mask)
+{
+ AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
+
+
+ /* clear bit(s) based on partid_mask in our partition's AMO */
+ FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
+ ~partid_mask);
+}
+
+
+
+/*
* The following set of macros and inlines are used for the sending and
* receiving of IPIs (also known as IRQs). There are two flavors of IPIs,
* one that is associated with partition activity (SGI_XPC_ACTIVATE) and
@@ -722,13 +995,13 @@ xpc_IPI_send(AMO_t *amo, u64 flag, int nasid, int phys_cpuid, int vector)
* Flag the appropriate AMO variable and send an IPI to the specified node.
*/
static inline void
-xpc_activate_IRQ_send(u64 amos_page, int from_nasid, int to_nasid,
+xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid,
int to_phys_cpuid)
{
int w_index = XPC_NASID_W_INDEX(from_nasid);
int b_index = XPC_NASID_B_INDEX(from_nasid);
- AMO_t *amos = (AMO_t *) __va(amos_page +
- (XP_MAX_PARTITIONS * sizeof(AMO_t)));
+ AMO_t *amos = (AMO_t *) __va(amos_page_pa +
+ (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t)));
(void) xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid,
@@ -756,6 +1029,13 @@ xpc_IPI_send_reactivate(struct xpc_partition *part)
xpc_vars->act_nasid, xpc_vars->act_phys_cpuid);
}
+static inline void
+xpc_IPI_send_disengage(struct xpc_partition *part)
+{
+ xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
+ part->remote_act_nasid, part->remote_act_phys_cpuid);
+}
+
/*
* IPIs associated with SGI_XPC_NOTIFY IRQ.
@@ -836,6 +1116,7 @@ xpc_notify_IRQ_send_local(struct xpc_channel *ch, u8 ipi_flag,
/* given an AMO variable and a channel#, get its associated IPI flags */
#define XPC_GET_IPI_FLAGS(_amo, _c) ((u8) (((_amo) >> ((_c) * 8)) & 0xff))
+#define XPC_SET_IPI_FLAGS(_amo, _c, _f) (_amo) |= ((u64) (_f) << ((_c) * 8))
#define XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(_amo) ((_amo) & 0x0f0f0f0f0f0f0f0f)
#define XPC_ANY_MSG_IPI_FLAGS_SET(_amo) ((_amo) & 0x1010101010101010)
@@ -903,17 +1184,18 @@ xpc_IPI_send_local_msgrequest(struct xpc_channel *ch)
* cacheable mapping for the entire region. This will prevent speculative
* reading of cached copies of our lines from being issued which will cause
* a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64
- * (XP_MAX_PARTITIONS) AMO variables for message notification (xpc_main.c)
- * and an additional 16 AMO variables for partition activation (xpc_hb.c).
+ * AMO variables (based on XP_MAX_PARTITIONS) for message notification and an
+ * additional 128 AMO variables (based on XP_NASID_MASK_WORDS) for partition
+ * activation and 2 AMO variables for partition deactivation.
*/
static inline AMO_t *
-xpc_IPI_init(partid_t partid)
+xpc_IPI_init(int index)
{
- AMO_t *part_amo = xpc_vars->amos_page + partid;
+ AMO_t *amo = xpc_vars->amos_page + index;
- xpc_IPI_receive(part_amo);
- return part_amo;
+ (void) xpc_IPI_receive(amo); /* clear AMO variable */
+ return amo;
}
@@ -939,7 +1221,7 @@ xpc_map_bte_errors(bte_result_t error)
static inline void *
-xpc_kmalloc_cacheline_aligned(size_t size, int flags, void **base)
+xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
{
/* see if kmalloc will give us cachline aligned memory by default */
*base = kmalloc(size, flags);
diff --git a/arch/ia64/sn/kernel/xpc_channel.c b/arch/ia64/sn/kernel/xpc_channel.c
index 94698bea7be0..abf4fc2a87bb 100644
--- a/arch/ia64/sn/kernel/xpc_channel.c
+++ b/arch/ia64/sn/kernel/xpc_channel.c
@@ -57,6 +57,7 @@ xpc_initialize_channels(struct xpc_partition *part, partid_t partid)
spin_lock_init(&ch->lock);
sema_init(&ch->msg_to_pull_sema, 1); /* mutex */
+ sema_init(&ch->wdisconnect_sema, 0); /* event wait */
atomic_set(&ch->n_on_msg_allocate_wq, 0);
init_waitqueue_head(&ch->msg_allocate_wq);
@@ -166,6 +167,7 @@ xpc_setup_infrastructure(struct xpc_partition *part)
xpc_initialize_channels(part, partid);
atomic_set(&part->nchannels_active, 0);
+ atomic_set(&part->nchannels_engaged, 0);
/* local_IPI_amo were set to 0 by an earlier memset() */
@@ -555,8 +557,6 @@ xpc_allocate_msgqueues(struct xpc_channel *ch)
sema_init(&ch->notify_queue[i].sema, 0);
}
- sema_init(&ch->teardown_sema, 0); /* event wait */
-
spin_lock_irqsave(&ch->lock, irq_flags);
ch->flags |= XPC_C_SETUP;
spin_unlock_irqrestore(&ch->lock, irq_flags);
@@ -626,6 +626,55 @@ xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
/*
+ * Notify those who wanted to be notified upon delivery of their message.
+ */
+static void
+xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put)
+{
+ struct xpc_notify *notify;
+ u8 notify_type;
+ s64 get = ch->w_remote_GP.get - 1;
+
+
+ while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
+
+ notify = &ch->notify_queue[get % ch->local_nentries];
+
+ /*
+ * See if the notify entry indicates it was associated with
+ * a message who's sender wants to be notified. It is possible
+ * that it is, but someone else is doing or has done the
+ * notification.
+ */
+ notify_type = notify->type;
+ if (notify_type == 0 ||
+ cmpxchg(&notify->type, notify_type, 0) !=
+ notify_type) {
+ continue;
+ }
+
+ DBUG_ON(notify_type != XPC_N_CALL);
+
+ atomic_dec(&ch->n_to_notify);
+
+ if (notify->func != NULL) {
+ dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, "
+ "msg_number=%ld, partid=%d, channel=%d\n",
+ (void *) notify, get, ch->partid, ch->number);
+
+ notify->func(reason, ch->partid, ch->number,
+ notify->key);
+
+ dev_dbg(xpc_chan, "notify->func() returned, "
+ "notify=0x%p, msg_number=%ld, partid=%d, "
+ "channel=%d\n", (void *) notify, get,
+ ch->partid, ch->number);
+ }
+ }
+}
+
+
+/*
* Free up message queues and other stuff that were allocated for the specified
* channel.
*
@@ -669,9 +718,6 @@ xpc_free_msgqueues(struct xpc_channel *ch)
ch->remote_msgqueue = NULL;
kfree(ch->notify_queue);
ch->notify_queue = NULL;
-
- /* in case someone is waiting for the teardown to complete */
- up(&ch->teardown_sema);
}
}
@@ -683,7 +729,7 @@ static void
xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
{
struct xpc_partition *part = &xpc_partitions[ch->partid];
- u32 ch_flags = ch->flags;
+ u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED);
DBUG_ON(!spin_is_locked(&ch->lock));
@@ -701,12 +747,13 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
}
DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0);
- /* it's now safe to free the channel's message queues */
-
- xpc_free_msgqueues(ch);
- DBUG_ON(ch->flags & XPC_C_SETUP);
+ if (part->act_state == XPC_P_DEACTIVATING) {
+ /* can't proceed until the other side disengages from us */
+ if (xpc_partition_engaged(1UL << ch->partid)) {
+ return;
+ }
- if (part->act_state != XPC_P_DEACTIVATING) {
+ } else {
/* as long as the other side is up do the full protocol */
@@ -724,16 +771,42 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
}
}
+ /* wake those waiting for notify completion */
+ if (atomic_read(&ch->n_to_notify) > 0) {
+ /* >>> we do callout while holding ch->lock */
+ xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put);
+ }
+
/* both sides are disconnected now */
- ch->flags = XPC_C_DISCONNECTED; /* clear all flags, but this one */
+ /* it's now safe to free the channel's message queues */
+ xpc_free_msgqueues(ch);
+
+ /* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */
+ ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
atomic_dec(&part->nchannels_active);
- if (ch_flags & XPC_C_WASCONNECTED) {
+ if (channel_was_connected) {
dev_info(xpc_chan, "channel %d to partition %d disconnected, "
"reason=%d\n", ch->number, ch->partid, ch->reason);
}
+
+ if (ch->flags & XPC_C_WDISCONNECT) {
+ spin_unlock_irqrestore(&ch->lock, *irq_flags);
+ up(&ch->wdisconnect_sema);
+ spin_lock_irqsave(&ch->lock, *irq_flags);
+
+ } else if (ch->delayed_IPI_flags) {
+ if (part->act_state != XPC_P_DEACTIVATING) {
+ /* time to take action on any delayed IPI flags */
+ spin_lock(&part->IPI_lock);
+ XPC_SET_IPI_FLAGS(part->local_IPI_amo, ch->number,
+ ch->delayed_IPI_flags);
+ spin_unlock(&part->IPI_lock);
+ }
+ ch->delayed_IPI_flags = 0;
+ }
}
@@ -754,6 +827,19 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
spin_lock_irqsave(&ch->lock, irq_flags);
+again:
+
+ if ((ch->flags & XPC_C_DISCONNECTED) &&
+ (ch->flags & XPC_C_WDISCONNECT)) {
+ /*
+ * Delay processing IPI flags until thread waiting disconnect
+ * has had a chance to see that the channel is disconnected.
+ */
+ ch->delayed_IPI_flags |= IPI_flags;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return;
+ }
+
if (IPI_flags & XPC_IPI_CLOSEREQUEST) {
@@ -764,7 +850,7 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
/*
* If RCLOSEREQUEST is set, we're probably waiting for
* RCLOSEREPLY. We should find it and a ROPENREQUEST packed
- * with this RCLOSEQREUQEST in the IPI_flags.
+ * with this RCLOSEREQUEST in the IPI_flags.
*/
if (ch->flags & XPC_C_RCLOSEREQUEST) {
@@ -779,14 +865,22 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
/* both sides have finished disconnecting */
xpc_process_disconnect(ch, &irq_flags);
+ DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+ goto again;
}
if (ch->flags & XPC_C_DISCONNECTED) {
- // >>> explain this section
-
if (!(IPI_flags & XPC_IPI_OPENREQUEST)) {
- DBUG_ON(part->act_state !=
- XPC_P_DEACTIVATING);
+ if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo,
+ ch_number) & XPC_IPI_OPENREQUEST)) {
+
+ DBUG_ON(ch->delayed_IPI_flags != 0);
+ spin_lock(&part->IPI_lock);
+ XPC_SET_IPI_FLAGS(part->local_IPI_amo,
+ ch_number,
+ XPC_IPI_CLOSEREQUEST);
+ spin_unlock(&part->IPI_lock);
+ }
spin_unlock_irqrestore(&ch->lock, irq_flags);
return;
}
@@ -816,9 +910,13 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
}
XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
- } else {
- xpc_process_disconnect(ch, &irq_flags);
+
+ DBUG_ON(IPI_flags & XPC_IPI_CLOSEREPLY);
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return;
}
+
+ xpc_process_disconnect(ch, &irq_flags);
}
@@ -834,7 +932,20 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
}
DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
- DBUG_ON(!(ch->flags & XPC_C_RCLOSEREQUEST));
+
+ if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
+ if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, ch_number)
+ & XPC_IPI_CLOSEREQUEST)) {
+
+ DBUG_ON(ch->delayed_IPI_flags != 0);
+ spin_lock(&part->IPI_lock);
+ XPC_SET_IPI_FLAGS(part->local_IPI_amo,
+ ch_number, XPC_IPI_CLOSEREPLY);
+ spin_unlock(&part->IPI_lock);
+ }
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return;
+ }
ch->flags |= XPC_C_RCLOSEREPLY;
@@ -852,8 +963,14 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
"channel=%d\n", args->msg_size, args->local_nentries,
ch->partid, ch->number);
- if ((ch->flags & XPC_C_DISCONNECTING) ||
- part->act_state == XPC_P_DEACTIVATING) {
+ if (part->act_state == XPC_P_DEACTIVATING ||
+ (ch->flags & XPC_C_ROPENREQUEST)) {
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return;
+ }
+
+ if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
+ ch->delayed_IPI_flags |= XPC_IPI_OPENREQUEST;
spin_unlock_irqrestore(&ch->lock, irq_flags);
return;
}
@@ -867,8 +984,11 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
* msg_size = size of channel's messages in bytes
* local_nentries = remote partition's local_nentries
*/
- DBUG_ON(args->msg_size == 0);
- DBUG_ON(args->local_nentries == 0);
+ if (args->msg_size == 0 || args->local_nentries == 0) {
+ /* assume OPENREQUEST was delayed by mistake */
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return;
+ }
ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING);
ch->remote_nentries = args->local_nentries;
@@ -906,7 +1026,13 @@ xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
spin_unlock_irqrestore(&ch->lock, irq_flags);
return;
}
- DBUG_ON(!(ch->flags & XPC_C_OPENREQUEST));
+ if (!(ch->flags & XPC_C_OPENREQUEST)) {
+ XPC_DISCONNECT_CHANNEL(ch, xpcOpenCloseError,
+ &irq_flags);
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return;
+ }
+
DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
DBUG_ON(ch->flags & XPC_C_CONNECTED);
@@ -960,8 +1086,8 @@ xpc_connect_channel(struct xpc_channel *ch)
struct xpc_registration *registration = &xpc_registrations[ch->number];
- if (down_interruptible(&registration->sema) != 0) {
- return xpcInterrupted;
+ if (down_trylock(&registration->sema) != 0) {
+ return xpcRetry;
}
if (!XPC_CHANNEL_REGISTERED(ch->number)) {
@@ -1040,55 +1166,6 @@ xpc_connect_channel(struct xpc_channel *ch)
/*
- * Notify those who wanted to be notified upon delivery of their message.
- */
-static void
-xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put)
-{
- struct xpc_notify *notify;
- u8 notify_type;
- s64 get = ch->w_remote_GP.get - 1;
-
-
- while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
-
- notify = &ch->notify_queue[get % ch->local_nentries];
-
- /*
- * See if the notify entry indicates it was associated with
- * a message who's sender wants to be notified. It is possible
- * that it is, but someone else is doing or has done the
- * notification.
- */
- notify_type = notify->type;
- if (notify_type == 0 ||
- cmpxchg(&notify->type, notify_type, 0) !=
- notify_type) {
- continue;
- }
-
- DBUG_ON(notify_type != XPC_N_CALL);
-
- atomic_dec(&ch->n_to_notify);
-
- if (notify->func != NULL) {
- dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, "
- "msg_number=%ld, partid=%d, channel=%d\n",
- (void *) notify, get, ch->partid, ch->number);
-
- notify->func(reason, ch->partid, ch->number,
- notify->key);
-
- dev_dbg(xpc_chan, "notify->func() returned, "
- "notify=0x%p, msg_number=%ld, partid=%d, "
- "channel=%d\n", (void *) notify, get,
- ch->partid, ch->number);
- }
- }
-}
-
-
-/*
* Clear some of the msg flags in the local message queue.
*/
static inline void
@@ -1240,6 +1317,7 @@ xpc_process_channel_activity(struct xpc_partition *part)
u64 IPI_amo, IPI_flags;
struct xpc_channel *ch;
int ch_number;
+ u32 ch_flags;
IPI_amo = xpc_get_IPI_flags(part);
@@ -1266,8 +1344,9 @@ xpc_process_channel_activity(struct xpc_partition *part)
xpc_process_openclose_IPI(part, ch_number, IPI_flags);
}
+ ch_flags = ch->flags; /* need an atomic snapshot of flags */
- if (ch->flags & XPC_C_DISCONNECTING) {
+ if (ch_flags & XPC_C_DISCONNECTING) {
spin_lock_irqsave(&ch->lock, irq_flags);
xpc_process_disconnect(ch, &irq_flags);
spin_unlock_irqrestore(&ch->lock, irq_flags);
@@ -1278,9 +1357,9 @@ xpc_process_channel_activity(struct xpc_partition *part)
continue;
}
- if (!(ch->flags & XPC_C_CONNECTED)) {
- if (!(ch->flags & XPC_C_OPENREQUEST)) {
- DBUG_ON(ch->flags & XPC_C_SETUP);
+ if (!(ch_flags & XPC_C_CONNECTED)) {
+ if (!(ch_flags & XPC_C_OPENREQUEST)) {
+ DBUG_ON(ch_flags & XPC_C_SETUP);
(void) xpc_connect_channel(ch);
} else {
spin_lock_irqsave(&ch->lock, irq_flags);
@@ -1305,8 +1384,8 @@ xpc_process_channel_activity(struct xpc_partition *part)
/*
- * XPC's heartbeat code calls this function to inform XPC that a partition has
- * gone down. XPC responds by tearing down the XPartition Communication
+ * XPC's heartbeat code calls this function to inform XPC that a partition is
+ * going down. XPC responds by tearing down the XPartition Communication
* infrastructure used for the just downed partition.
*
* XPC's heartbeat code will never call this function and xpc_partition_up()
@@ -1314,7 +1393,7 @@ xpc_process_channel_activity(struct xpc_partition *part)
* at the same time.
*/
void
-xpc_partition_down(struct xpc_partition *part, enum xpc_retval reason)
+xpc_partition_going_down(struct xpc_partition *part, enum xpc_retval reason)
{
unsigned long irq_flags;
int ch_number;
@@ -1330,12 +1409,11 @@ xpc_partition_down(struct xpc_partition *part, enum xpc_retval reason)
}
- /* disconnect all channels associated with the downed partition */
+ /* disconnect channels associated with the partition going down */
for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
ch = &part->channels[ch_number];
-
xpc_msgqueue_ref(ch);
spin_lock_irqsave(&ch->lock, irq_flags);
@@ -1370,6 +1448,7 @@ xpc_teardown_infrastructure(struct xpc_partition *part)
* this partition.
*/
+ DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
DBUG_ON(atomic_read(&part->nchannels_active) != 0);
DBUG_ON(part->setup_state != XPC_P_SETUP);
part->setup_state = XPC_P_WTEARDOWN;
@@ -1428,19 +1507,11 @@ xpc_initiate_connect(int ch_number)
if (xpc_part_ref(part)) {
ch = &part->channels[ch_number];
- if (!(ch->flags & XPC_C_DISCONNECTING)) {
- DBUG_ON(ch->flags & XPC_C_OPENREQUEST);
- DBUG_ON(ch->flags & XPC_C_CONNECTED);
- DBUG_ON(ch->flags & XPC_C_SETUP);
-
- /*
- * Initiate the establishment of a connection
- * on the newly registered channel to the
- * remote partition.
- */
- xpc_wakeup_channel_mgr(part);
- }
-
+ /*
+ * Initiate the establishment of a connection on the
+ * newly registered channel to the remote partition.
+ */
+ xpc_wakeup_channel_mgr(part);
xpc_part_deref(part);
}
}
@@ -1450,9 +1521,6 @@ xpc_initiate_connect(int ch_number)
void
xpc_connected_callout(struct xpc_channel *ch)
{
- unsigned long irq_flags;
-
-
/* let the registerer know that a connection has been established */
if (ch->func != NULL) {
@@ -1465,10 +1533,6 @@ xpc_connected_callout(struct xpc_channel *ch)
dev_dbg(xpc_chan, "ch->func() returned, reason=xpcConnected, "
"partid=%d, channel=%d\n", ch->partid, ch->number);
}
-
- spin_lock_irqsave(&ch->lock, irq_flags);
- ch->flags |= XPC_C_CONNECTCALLOUT;
- spin_unlock_irqrestore(&ch->lock, irq_flags);
}
@@ -1506,8 +1570,12 @@ xpc_initiate_disconnect(int ch_number)
spin_lock_irqsave(&ch->lock, irq_flags);
- XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering,
+ if (!(ch->flags & XPC_C_DISCONNECTED)) {
+ ch->flags |= XPC_C_WDISCONNECT;
+
+ XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering,
&irq_flags);
+ }
spin_unlock_irqrestore(&ch->lock, irq_flags);
@@ -1523,8 +1591,9 @@ xpc_initiate_disconnect(int ch_number)
/*
* To disconnect a channel, and reflect it back to all who may be waiting.
*
- * >>> An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
- * >>> xpc_free_msgqueues().
+ * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
+ * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by
+ * xpc_disconnect_wait().
*
* THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN.
*/
@@ -1532,7 +1601,7 @@ void
xpc_disconnect_channel(const int line, struct xpc_channel *ch,
enum xpc_retval reason, unsigned long *irq_flags)
{
- u32 flags;
+ u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED);
DBUG_ON(!spin_is_locked(&ch->lock));
@@ -1547,61 +1616,53 @@ xpc_disconnect_channel(const int line, struct xpc_channel *ch,
XPC_SET_REASON(ch, reason, line);
- flags = ch->flags;
+ ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
/* some of these may not have been set */
ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY |
XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
XPC_C_CONNECTING | XPC_C_CONNECTED);
- ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
xpc_IPI_send_closerequest(ch, irq_flags);
- if (flags & XPC_C_CONNECTED) {
+ if (channel_was_connected) {
ch->flags |= XPC_C_WASCONNECTED;
}
+ spin_unlock_irqrestore(&ch->lock, *irq_flags);
+
+ /* wake all idle kthreads so they can exit */
if (atomic_read(&ch->kthreads_idle) > 0) {
- /* wake all idle kthreads so they can exit */
wake_up_all(&ch->idle_wq);
}
- spin_unlock_irqrestore(&ch->lock, *irq_flags);
-
-
/* wake those waiting to allocate an entry from the local msg queue */
-
if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) {
wake_up(&ch->msg_allocate_wq);
}
- /* wake those waiting for notify completion */
-
- if (atomic_read(&ch->n_to_notify) > 0) {
- xpc_notify_senders(ch, reason, ch->w_local_GP.put);
- }
-
spin_lock_irqsave(&ch->lock, *irq_flags);
}
void
-xpc_disconnected_callout(struct xpc_channel *ch)
+xpc_disconnecting_callout(struct xpc_channel *ch)
{
/*
- * Let the channel's registerer know that the channel is now
+ * Let the channel's registerer know that the channel is being
* disconnected. We don't want to do this if the registerer was never
- * informed of a connection being made, unless the disconnect was for
- * abnormal reasons.
+ * informed of a connection being made.
*/
if (ch->func != NULL) {
- dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, "
- "channel=%d\n", ch->reason, ch->partid, ch->number);
+ dev_dbg(xpc_chan, "ch->func() called, reason=xpcDisconnecting,"
+ " partid=%d, channel=%d\n", ch->partid, ch->number);
- ch->func(ch->reason, ch->partid, ch->number, NULL, ch->key);
+ ch->func(xpcDisconnecting, ch->partid, ch->number, NULL,
+ ch->key);
- dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, "
- "channel=%d\n", ch->reason, ch->partid, ch->number);
+ dev_dbg(xpc_chan, "ch->func() returned, reason="
+ "xpcDisconnecting, partid=%d, channel=%d\n",
+ ch->partid, ch->number);
}
}
@@ -1848,7 +1909,7 @@ xpc_send_msg(struct xpc_channel *ch, struct xpc_msg *msg, u8 notify_type,
xpc_notify_func func, void *key)
{
enum xpc_retval ret = xpcSuccess;
- struct xpc_notify *notify = NULL; // >>> to keep the compiler happy!!
+ struct xpc_notify *notify = notify;
s64 put, msg_number = msg->number;
diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c
index bb1d5cf30440..cece3c7c69be 100644
--- a/arch/ia64/sn/kernel/xpc_main.c
+++ b/arch/ia64/sn/kernel/xpc_main.c
@@ -54,6 +54,7 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/delay.h>
+#include <linux/reboot.h>
#include <asm/sn/intr.h>
#include <asm/sn/sn_sal.h>
#include <asm/uaccess.h>
@@ -82,11 +83,17 @@ struct device *xpc_chan = &xpc_chan_dbg_subname;
/* systune related variables for /proc/sys directories */
-static int xpc_hb_min = 1;
-static int xpc_hb_max = 10;
+static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
+static int xpc_hb_min_interval = 1;
+static int xpc_hb_max_interval = 10;
-static int xpc_hb_check_min = 10;
-static int xpc_hb_check_max = 120;
+static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
+static int xpc_hb_check_min_interval = 10;
+static int xpc_hb_check_max_interval = 120;
+
+int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
+static int xpc_disengage_request_min_timelimit = 0;
+static int xpc_disengage_request_max_timelimit = 120;
static ctl_table xpc_sys_xpc_hb_dir[] = {
{
@@ -99,7 +106,8 @@ static ctl_table xpc_sys_xpc_hb_dir[] = {
&proc_dointvec_minmax,
&sysctl_intvec,
NULL,
- &xpc_hb_min, &xpc_hb_max
+ &xpc_hb_min_interval,
+ &xpc_hb_max_interval
},
{
2,
@@ -111,7 +119,8 @@ static ctl_table xpc_sys_xpc_hb_dir[] = {
&proc_dointvec_minmax,
&sysctl_intvec,
NULL,
- &xpc_hb_check_min, &xpc_hb_check_max
+ &xpc_hb_check_min_interval,
+ &xpc_hb_check_max_interval
},
{0}
};
@@ -124,6 +133,19 @@ static ctl_table xpc_sys_xpc_dir[] = {
0555,
xpc_sys_xpc_hb_dir
},
+ {
+ 2,
+ "disengage_request_timelimit",
+ &xpc_disengage_request_timelimit,
+ sizeof(int),
+ 0644,
+ NULL,
+ &proc_dointvec_minmax,
+ &sysctl_intvec,
+ NULL,
+ &xpc_disengage_request_min_timelimit,
+ &xpc_disengage_request_max_timelimit
+ },
{0}
};
static ctl_table xpc_sys_dir[] = {
@@ -148,10 +170,10 @@ static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
static unsigned long xpc_hb_check_timeout;
-/* xpc_hb_checker thread exited notification */
+/* notification that the xpc_hb_checker thread has exited */
static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited);
-/* xpc_discovery thread exited notification */
+/* notification that the xpc_discovery thread has exited */
static DECLARE_MUTEX_LOCKED(xpc_discovery_exited);
@@ -161,6 +183,30 @@ static struct timer_list xpc_hb_timer;
static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
+static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_reboot_notifier = {
+ .notifier_call = xpc_system_reboot,
+};
+
+
+/*
+ * Timer function to enforce the timelimit on the partition disengage request.
+ */
+static void
+xpc_timeout_partition_disengage_request(unsigned long data)
+{
+ struct xpc_partition *part = (struct xpc_partition *) data;
+
+
+ DBUG_ON(jiffies < part->disengage_request_timeout);
+
+ (void) xpc_partition_disengaged(part);
+
+ DBUG_ON(part->disengage_request_timeout != 0);
+ DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
+}
+
+
/*
* Notify the heartbeat check thread that an IRQ has been received.
*/
@@ -214,12 +260,6 @@ xpc_hb_checker(void *ignore)
while (!(volatile int) xpc_exiting) {
- /* wait for IRQ or timeout */
- (void) wait_event_interruptible(xpc_act_IRQ_wq,
- (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
- jiffies >= xpc_hb_check_timeout ||
- (volatile int) xpc_exiting));
-
dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
"been received\n",
(int) (xpc_hb_check_timeout - jiffies),
@@ -240,6 +280,7 @@ xpc_hb_checker(void *ignore)
}
+ /* check for outstanding IRQs */
new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
force_IRQ = 0;
@@ -257,12 +298,18 @@ xpc_hb_checker(void *ignore)
xpc_hb_check_timeout = jiffies +
(xpc_hb_check_interval * HZ);
}
+
+ /* wait for IRQ or timeout */
+ (void) wait_event_interruptible(xpc_act_IRQ_wq,
+ (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
+ jiffies >= xpc_hb_check_timeout ||
+ (volatile int) xpc_exiting));
}
dev_dbg(xpc_part, "heartbeat checker is exiting\n");
- /* mark this thread as inactive */
+ /* mark this thread as having exited */
up(&xpc_hb_checker_exited);
return 0;
}
@@ -282,7 +329,7 @@ xpc_initiate_discovery(void *ignore)
dev_dbg(xpc_part, "discovery thread is exiting\n");
- /* mark this thread as inactive */
+ /* mark this thread as having exited */
up(&xpc_discovery_exited);
return 0;
}
@@ -309,7 +356,7 @@ xpc_make_first_contact(struct xpc_partition *part)
"partition %d\n", XPC_PARTID(part));
/* wait a 1/4 of a second or so */
- msleep_interruptible(250);
+ (void) msleep_interruptible(250);
if (part->act_state == XPC_P_DEACTIVATING) {
return part->reason;
@@ -336,7 +383,8 @@ static void
xpc_channel_mgr(struct xpc_partition *part)
{
while (part->act_state != XPC_P_DEACTIVATING ||
- atomic_read(&part->nchannels_active) > 0) {
+ atomic_read(&part->nchannels_active) > 0 ||
+ !xpc_partition_disengaged(part)) {
xpc_process_channel_activity(part);
@@ -360,7 +408,8 @@ xpc_channel_mgr(struct xpc_partition *part)
(volatile u64) part->local_IPI_amo != 0 ||
((volatile u8) part->act_state ==
XPC_P_DEACTIVATING &&
- atomic_read(&part->nchannels_active) == 0)));
+ atomic_read(&part->nchannels_active) == 0 &&
+ xpc_partition_disengaged(part))));
atomic_set(&part->channel_mgr_requests, 1);
// >>> Does it need to wakeup periodically as well? In case we
@@ -482,7 +531,7 @@ xpc_activating(void *__partid)
return 0;
}
- XPC_ALLOW_HB(partid, xpc_vars);
+ xpc_allow_hb(partid, xpc_vars);
xpc_IPI_send_activated(part);
@@ -492,6 +541,7 @@ xpc_activating(void *__partid)
*/
(void) xpc_partition_up(part);
+ xpc_disallow_hb(partid, xpc_vars);
xpc_mark_partition_inactive(part);
if (part->reason == xpcReactivating) {
@@ -670,6 +720,7 @@ xpc_daemonize_kthread(void *args)
struct xpc_partition *part = &xpc_partitions[partid];
struct xpc_channel *ch;
int n_needed;
+ unsigned long irq_flags;
daemonize("xpc%02dc%d", partid, ch_number);
@@ -680,11 +731,14 @@ xpc_daemonize_kthread(void *args)
ch = &part->channels[ch_number];
if (!(ch->flags & XPC_C_DISCONNECTING)) {
- DBUG_ON(!(ch->flags & XPC_C_CONNECTED));
/* let registerer know that connection has been established */
- if (atomic_read(&ch->kthreads_assigned) == 1) {
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ if (!(ch->flags & XPC_C_CONNECTCALLOUT)) {
+ ch->flags |= XPC_C_CONNECTCALLOUT;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
xpc_connected_callout(ch);
/*
@@ -699,16 +753,28 @@ xpc_daemonize_kthread(void *args)
!(ch->flags & XPC_C_DISCONNECTING)) {
xpc_activate_kthreads(ch, n_needed);
}
+ } else {
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
}
xpc_kthread_waitmsgs(part, ch);
}
- if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
- ((ch->flags & XPC_C_CONNECTCALLOUT) ||
- (ch->reason != xpcUnregistering &&
- ch->reason != xpcOtherUnregistering))) {
- xpc_disconnected_callout(ch);
+ if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ if ((ch->flags & XPC_C_CONNECTCALLOUT) &&
+ !(ch->flags & XPC_C_DISCONNECTCALLOUT)) {
+ ch->flags |= XPC_C_DISCONNECTCALLOUT;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ xpc_disconnecting_callout(ch);
+ } else {
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ }
+ if (atomic_dec_return(&part->nchannels_engaged) == 0) {
+ xpc_mark_partition_disengaged(part);
+ xpc_IPI_send_disengage(part);
+ }
}
@@ -740,12 +806,33 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed)
unsigned long irq_flags;
pid_t pid;
u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
+ struct xpc_partition *part = &xpc_partitions[ch->partid];
while (needed-- > 0) {
+
+ /*
+ * The following is done on behalf of the newly created
+ * kthread. That kthread is responsible for doing the
+ * counterpart to the following before it exits.
+ */
+ (void) xpc_part_ref(part);
+ xpc_msgqueue_ref(ch);
+ if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
+ atomic_inc_return(&part->nchannels_engaged) == 1) {
+ xpc_mark_partition_engaged(part);
+ }
+
pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0);
if (pid < 0) {
/* the fork failed */
+ if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
+ atomic_dec_return(&part->nchannels_engaged) == 0) {
+ xpc_mark_partition_disengaged(part);
+ xpc_IPI_send_disengage(part);
+ }
+ xpc_msgqueue_deref(ch);
+ xpc_part_deref(part);
if (atomic_read(&ch->kthreads_assigned) <
ch->kthreads_idle_limit) {
@@ -765,14 +852,6 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed)
break;
}
- /*
- * The following is done on behalf of the newly created
- * kthread. That kthread is responsible for doing the
- * counterpart to the following before it exits.
- */
- (void) xpc_part_ref(&xpc_partitions[ch->partid]);
- xpc_msgqueue_ref(ch);
- atomic_inc(&ch->kthreads_assigned);
ch->kthreads_created++; // >>> temporary debug only!!!
}
}
@@ -781,88 +860,143 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed)
void
xpc_disconnect_wait(int ch_number)
{
+ unsigned long irq_flags;
partid_t partid;
struct xpc_partition *part;
struct xpc_channel *ch;
+ int wakeup_channel_mgr;
/* now wait for all callouts to the caller's function to cease */
for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
part = &xpc_partitions[partid];
- if (xpc_part_ref(part)) {
- ch = &part->channels[ch_number];
+ if (!xpc_part_ref(part)) {
+ continue;
+ }
-// >>> how do we keep from falling into the window between our check and going
-// >>> down and coming back up where sema is re-inited?
- if (ch->flags & XPC_C_SETUP) {
- (void) down(&ch->teardown_sema);
- }
+ ch = &part->channels[ch_number];
+ if (!(ch->flags & XPC_C_WDISCONNECT)) {
xpc_part_deref(part);
+ continue;
}
+
+ (void) down(&ch->wdisconnect_sema);
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+ wakeup_channel_mgr = 0;
+
+ if (ch->delayed_IPI_flags) {
+ if (part->act_state != XPC_P_DEACTIVATING) {
+ spin_lock(&part->IPI_lock);
+ XPC_SET_IPI_FLAGS(part->local_IPI_amo,
+ ch->number, ch->delayed_IPI_flags);
+ spin_unlock(&part->IPI_lock);
+ wakeup_channel_mgr = 1;
+ }
+ ch->delayed_IPI_flags = 0;
+ }
+
+ ch->flags &= ~XPC_C_WDISCONNECT;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ if (wakeup_channel_mgr) {
+ xpc_wakeup_channel_mgr(part);
+ }
+
+ xpc_part_deref(part);
}
}
static void
-xpc_do_exit(void)
+xpc_do_exit(enum xpc_retval reason)
{
partid_t partid;
int active_part_count;
struct xpc_partition *part;
+ unsigned long printmsg_time;
- /* now it's time to eliminate our heartbeat */
- del_timer_sync(&xpc_hb_timer);
- xpc_vars->heartbeating_to_mask = 0;
-
- /* indicate to others that our reserved page is uninitialized */
- xpc_rsvd_page->vars_pa = 0;
-
- /*
- * Ignore all incoming interrupts. Without interupts the heartbeat
- * checker won't activate any new partitions that may come up.
- */
- free_irq(SGI_XPC_ACTIVATE, NULL);
+ /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
+ DBUG_ON(xpc_exiting == 1);
/*
- * Cause the heartbeat checker and the discovery threads to exit.
- * We don't want them attempting to activate new partitions as we
- * try to deactivate the existing ones.
+ * Let the heartbeat checker thread and the discovery thread
+ * (if one is running) know that they should exit. Also wake up
+ * the heartbeat checker thread in case it's sleeping.
*/
xpc_exiting = 1;
wake_up_interruptible(&xpc_act_IRQ_wq);
- /* wait for the heartbeat checker thread to mark itself inactive */
- down(&xpc_hb_checker_exited);
+ /* ignore all incoming interrupts */
+ free_irq(SGI_XPC_ACTIVATE, NULL);
- /* wait for the discovery thread to mark itself inactive */
+ /* wait for the discovery thread to exit */
down(&xpc_discovery_exited);
+ /* wait for the heartbeat checker thread to exit */
+ down(&xpc_hb_checker_exited);
+
- msleep_interruptible(300);
+ /* sleep for a 1/3 of a second or so */
+ (void) msleep_interruptible(300);
/* wait for all partitions to become inactive */
+ printmsg_time = jiffies;
+
do {
active_part_count = 0;
for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
part = &xpc_partitions[partid];
- if (part->act_state != XPC_P_INACTIVE) {
- active_part_count++;
- XPC_DEACTIVATE_PARTITION(part, xpcUnloading);
+ if (xpc_partition_disengaged(part) &&
+ part->act_state == XPC_P_INACTIVE) {
+ continue;
}
+
+ active_part_count++;
+
+ XPC_DEACTIVATE_PARTITION(part, reason);
+ }
+
+ if (active_part_count == 0) {
+ break;
+ }
+
+ if (jiffies >= printmsg_time) {
+ dev_info(xpc_part, "waiting for partitions to "
+ "deactivate/disengage, active count=%d, remote "
+ "engaged=0x%lx\n", active_part_count,
+ xpc_partition_engaged(1UL << partid));
+
+ printmsg_time = jiffies +
+ (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
}
- if (active_part_count)
- msleep_interruptible(300);
- } while (active_part_count > 0);
+ /* sleep for a 1/3 of a second or so */
+ (void) msleep_interruptible(300);
+
+ } while (1);
+
+ DBUG_ON(xpc_partition_engaged(-1UL));
+ /* indicate to others that our reserved page is uninitialized */
+ xpc_rsvd_page->vars_pa = 0;
+
+ /* now it's time to eliminate our heartbeat */
+ del_timer_sync(&xpc_hb_timer);
+ DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
+
+ /* take ourselves off of the reboot_notifier_list */
+ (void) unregister_reboot_notifier(&xpc_reboot_notifier);
+
/* close down protections for IPI operations */
xpc_restrict_IPI_ops();
@@ -876,6 +1010,34 @@ xpc_do_exit(void)
}
+/*
+ * This function is called when the system is being rebooted.
+ */
+static int
+xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
+{
+ enum xpc_retval reason;
+
+
+ switch (event) {
+ case SYS_RESTART:
+ reason = xpcSystemReboot;
+ break;
+ case SYS_HALT:
+ reason = xpcSystemHalt;
+ break;
+ case SYS_POWER_OFF:
+ reason = xpcSystemPoweroff;
+ break;
+ default:
+ reason = xpcSystemGoingDown;
+ }
+
+ xpc_do_exit(reason);
+ return NOTIFY_DONE;
+}
+
+
int __init
xpc_init(void)
{
@@ -885,13 +1047,17 @@ xpc_init(void)
pid_t pid;
+ if (!ia64_platform_is("sn2")) {
+ return -ENODEV;
+ }
+
/*
* xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng
- * both a partition's reserved page and its XPC variables. Its size was
- * based on the size of a reserved page. So we need to ensure that the
- * XPC variables will fit as well.
+ * various portions of a partition's reserved page. Its size is based
+ * on the size of the reserved page header and part_nasids mask. So we
+ * need to ensure that the other items will fit as well.
*/
- if (XPC_VARS_ALIGNED_SIZE > XPC_RSVD_PAGE_ALIGNED_SIZE) {
+ if (XPC_RP_VARS_SIZE > XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES) {
dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n");
return -EPERM;
}
@@ -920,6 +1086,12 @@ xpc_init(void)
spin_lock_init(&part->act_lock);
part->act_state = XPC_P_INACTIVE;
XPC_SET_REASON(part, 0, 0);
+
+ init_timer(&part->disengage_request_timer);
+ part->disengage_request_timer.function =
+ xpc_timeout_partition_disengage_request;
+ part->disengage_request_timer.data = (unsigned long) part;
+
part->setup_state = XPC_P_UNSET;
init_waitqueue_head(&part->teardown_wq);
atomic_set(&part->references, 0);
@@ -976,6 +1148,13 @@ xpc_init(void)
}
+ /* add ourselves to the reboot_notifier_list */
+ ret = register_reboot_notifier(&xpc_reboot_notifier);
+ if (ret != 0) {
+ dev_warn(xpc_part, "can't register reboot notifier\n");
+ }
+
+
/*
* Set the beating to other partitions into motion. This is
* the last requirement for other partitions' discovery to
@@ -997,6 +1176,9 @@ xpc_init(void)
/* indicate to others that our reserved page is uninitialized */
xpc_rsvd_page->vars_pa = 0;
+ /* take ourselves off of the reboot_notifier_list */
+ (void) unregister_reboot_notifier(&xpc_reboot_notifier);
+
del_timer_sync(&xpc_hb_timer);
free_irq(SGI_XPC_ACTIVATE, NULL);
xpc_restrict_IPI_ops();
@@ -1020,7 +1202,7 @@ xpc_init(void)
/* mark this new thread as a non-starter */
up(&xpc_discovery_exited);
- xpc_do_exit();
+ xpc_do_exit(xpcUnloading);
return -EBUSY;
}
@@ -1039,7 +1221,7 @@ module_init(xpc_init);
void __exit
xpc_exit(void)
{
- xpc_do_exit();
+ xpc_do_exit(xpcUnloading);
}
module_exit(xpc_exit);
@@ -1056,3 +1238,7 @@ module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
"heartbeat checks.");
+module_param(xpc_disengage_request_timelimit, int, 0);
+MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
+ "for disengage request to complete.");
+
diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c
index 578265ea9e67..581e113d2d37 100644
--- a/arch/ia64/sn/kernel/xpc_partition.c
+++ b/arch/ia64/sn/kernel/xpc_partition.c
@@ -44,16 +44,19 @@ static u64 xpc_sh2_IPI_access3;
/* original protection values for each node */
-u64 xpc_prot_vec[MAX_COMPACT_NODES];
+u64 xpc_prot_vec[MAX_NUMNODES];
-/* this partition's reserved page */
+/* this partition's reserved page pointers */
struct xpc_rsvd_page *xpc_rsvd_page;
-
-/* this partition's XPC variables (within the reserved page) */
+static u64 *xpc_part_nasids;
+static u64 *xpc_mach_nasids;
struct xpc_vars *xpc_vars;
struct xpc_vars_part *xpc_vars_part;
+static int xp_nasid_mask_bytes; /* actual size in bytes of nasid mask */
+static int xp_nasid_mask_words; /* actual size in words of nasid mask */
+
/*
* For performance reasons, each entry of xpc_partitions[] is cacheline
@@ -65,20 +68,16 @@ struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
/*
- * Generic buffer used to store a local copy of the remote partitions
- * reserved page or XPC variables.
+ * Generic buffer used to store a local copy of portions of a remote
+ * partition's reserved page (either its header and part_nasids mask,
+ * or its vars).
*
* xpc_discovery runs only once and is a seperate thread that is
* very likely going to be processing in parallel with receiving
* interrupts.
*/
-char ____cacheline_aligned
- xpc_remote_copy_buffer[XPC_RSVD_PAGE_ALIGNED_SIZE];
-
-
-/* systune related variables */
-int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
-int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_TIMEOUT;
+char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE +
+ XP_NASID_MASK_BYTES];
/*
@@ -86,13 +85,16 @@ int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_TIMEOUT;
* for that nasid. This function returns 0 on any error.
*/
static u64
-xpc_get_rsvd_page_pa(int nasid, u64 buf, u64 buf_size)
+xpc_get_rsvd_page_pa(int nasid)
{
bte_result_t bte_res;
s64 status;
u64 cookie = 0;
u64 rp_pa = nasid; /* seed with nasid */
u64 len = 0;
+ u64 buf = buf;
+ u64 buf_len = 0;
+ void *buf_base = NULL;
while (1) {
@@ -108,13 +110,22 @@ xpc_get_rsvd_page_pa(int nasid, u64 buf, u64 buf_size)
break;
}
- if (len > buf_size) {
- dev_err(xpc_part, "len (=0x%016lx) > buf_size\n", len);
- status = SALRET_ERROR;
- break;
+ if (L1_CACHE_ALIGN(len) > buf_len) {
+ if (buf_base != NULL) {
+ kfree(buf_base);
+ }
+ buf_len = L1_CACHE_ALIGN(len);
+ buf = (u64) xpc_kmalloc_cacheline_aligned(buf_len,
+ GFP_KERNEL, &buf_base);
+ if (buf_base == NULL) {
+ dev_err(xpc_part, "unable to kmalloc "
+ "len=0x%016lx\n", buf_len);
+ status = SALRET_ERROR;
+ break;
+ }
}
- bte_res = xp_bte_copy(rp_pa, ia64_tpa(buf), buf_size,
+ bte_res = xp_bte_copy(rp_pa, ia64_tpa(buf), buf_len,
(BTE_NOTIFY | BTE_WACQUIRE), NULL);
if (bte_res != BTE_SUCCESS) {
dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res);
@@ -123,6 +134,10 @@ xpc_get_rsvd_page_pa(int nasid, u64 buf, u64 buf_size)
}
}
+ if (buf_base != NULL) {
+ kfree(buf_base);
+ }
+
if (status != SALRET_OK) {
rp_pa = 0;
}
@@ -141,15 +156,15 @@ xpc_rsvd_page_init(void)
{
struct xpc_rsvd_page *rp;
AMO_t *amos_page;
- u64 rp_pa, next_cl, nasid_array = 0;
+ u64 rp_pa, nasid_array = 0;
int i, ret;
/* get the local reserved page's address */
- rp_pa = xpc_get_rsvd_page_pa(cnodeid_to_nasid(0),
- (u64) xpc_remote_copy_buffer,
- XPC_RSVD_PAGE_ALIGNED_SIZE);
+ preempt_disable();
+ rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id()));
+ preempt_enable();
if (rp_pa == 0) {
dev_err(xpc_part, "SAL failed to locate the reserved page\n");
return NULL;
@@ -164,12 +179,19 @@ xpc_rsvd_page_init(void)
rp->version = XPC_RP_VERSION;
- /*
- * Place the XPC variables on the cache line following the
- * reserved page structure.
- */
- next_cl = (u64) rp + XPC_RSVD_PAGE_ALIGNED_SIZE;
- xpc_vars = (struct xpc_vars *) next_cl;
+ /* establish the actual sizes of the nasid masks */
+ if (rp->SAL_version == 1) {
+ /* SAL_version 1 didn't set the nasids_size field */
+ rp->nasids_size = 128;
+ }
+ xp_nasid_mask_bytes = rp->nasids_size;
+ xp_nasid_mask_words = xp_nasid_mask_bytes / 8;
+
+ /* setup the pointers to the various items in the reserved page */
+ xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
+ xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
+ xpc_vars = XPC_RP_VARS(rp);
+ xpc_vars_part = XPC_RP_VARS_PART(rp);
/*
* Before clearing xpc_vars, see if a page of AMOs had been previously
@@ -221,33 +243,32 @@ xpc_rsvd_page_init(void)
amos_page = (AMO_t *) TO_AMO((u64) amos_page);
}
+ /* clear xpc_vars */
memset(xpc_vars, 0, sizeof(struct xpc_vars));
- /*
- * Place the XPC per partition specific variables on the cache line
- * following the XPC variables structure.
- */
- next_cl += XPC_VARS_ALIGNED_SIZE;
- memset((u64 *) next_cl, 0, sizeof(struct xpc_vars_part) *
- XP_MAX_PARTITIONS);
- xpc_vars_part = (struct xpc_vars_part *) next_cl;
- xpc_vars->vars_part_pa = __pa(next_cl);
-
xpc_vars->version = XPC_V_VERSION;
xpc_vars->act_nasid = cpuid_to_nasid(0);
xpc_vars->act_phys_cpuid = cpu_physical_id(0);
+ xpc_vars->vars_part_pa = __pa(xpc_vars_part);
+ xpc_vars->amos_page_pa = ia64_tpa((u64) amos_page);
xpc_vars->amos_page = amos_page; /* save for next load of XPC */
- /*
- * Initialize the activation related AMO variables.
- */
- xpc_vars->act_amos = xpc_IPI_init(XP_MAX_PARTITIONS);
- for (i = 1; i < XP_NASID_MASK_WORDS; i++) {
- xpc_IPI_init(i + XP_MAX_PARTITIONS);
+ /* clear xpc_vars_part */
+ memset((u64 *) xpc_vars_part, 0, sizeof(struct xpc_vars_part) *
+ XP_MAX_PARTITIONS);
+
+ /* initialize the activate IRQ related AMO variables */
+ for (i = 0; i < xp_nasid_mask_words; i++) {
+ (void) xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i);
}
- /* export AMO page's physical address to other partitions */
- xpc_vars->amos_page_pa = ia64_tpa((u64) xpc_vars->amos_page);
+
+ /* initialize the engaged remote partitions related AMO variables */
+ (void) xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO);
+ (void) xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO);
+
+ /* timestamp of when reserved page was setup by XPC */
+ rp->stamp = CURRENT_TIME;
/*
* This signifies to the remote partition that our reserved
@@ -387,6 +408,11 @@ xpc_check_remote_hb(void)
remote_vars = (struct xpc_vars *) xpc_remote_copy_buffer;
for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+
+ if (xpc_exiting) {
+ break;
+ }
+
if (partid == sn_partition_id) {
continue;
}
@@ -401,7 +427,7 @@ xpc_check_remote_hb(void)
/* pull the remote_hb cache line */
bres = xp_bte_copy(part->remote_vars_pa,
ia64_tpa((u64) remote_vars),
- XPC_VARS_ALIGNED_SIZE,
+ XPC_RP_VARS_SIZE,
(BTE_NOTIFY | BTE_WACQUIRE), NULL);
if (bres != BTE_SUCCESS) {
XPC_DEACTIVATE_PARTITION(part,
@@ -417,7 +443,7 @@ xpc_check_remote_hb(void)
if (((remote_vars->heartbeat == part->last_heartbeat) &&
(remote_vars->kdb_status == 0)) ||
- !XPC_HB_ALLOWED(sn_partition_id, remote_vars)) {
+ !xpc_hb_allowed(sn_partition_id, remote_vars)) {
XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat);
continue;
@@ -429,31 +455,31 @@ xpc_check_remote_hb(void)
/*
- * Get a copy of the remote partition's rsvd page.
+ * Get a copy of a portion of the remote partition's rsvd page.
*
* remote_rp points to a buffer that is cacheline aligned for BTE copies and
- * assumed to be of size XPC_RSVD_PAGE_ALIGNED_SIZE.
+ * is large enough to contain a copy of their reserved page header and
+ * part_nasids mask.
*/
static enum xpc_retval
xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
- struct xpc_rsvd_page *remote_rp, u64 *remote_rsvd_page_pa)
+ struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa)
{
int bres, i;
/* get the reserved page's physical address */
- *remote_rsvd_page_pa = xpc_get_rsvd_page_pa(nasid, (u64) remote_rp,
- XPC_RSVD_PAGE_ALIGNED_SIZE);
- if (*remote_rsvd_page_pa == 0) {
+ *remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
+ if (*remote_rp_pa == 0) {
return xpcNoRsvdPageAddr;
}
- /* pull over the reserved page structure */
+ /* pull over the reserved page header and part_nasids mask */
- bres = xp_bte_copy(*remote_rsvd_page_pa, ia64_tpa((u64) remote_rp),
- XPC_RSVD_PAGE_ALIGNED_SIZE,
+ bres = xp_bte_copy(*remote_rp_pa, ia64_tpa((u64) remote_rp),
+ XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes,
(BTE_NOTIFY | BTE_WACQUIRE), NULL);
if (bres != BTE_SUCCESS) {
return xpc_map_bte_errors(bres);
@@ -461,8 +487,11 @@ xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
if (discovered_nasids != NULL) {
- for (i = 0; i < XP_NASID_MASK_WORDS; i++) {
- discovered_nasids[i] |= remote_rp->part_nasids[i];
+ u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp);
+
+
+ for (i = 0; i < xp_nasid_mask_words; i++) {
+ discovered_nasids[i] |= remote_part_nasids[i];
}
}
@@ -489,10 +518,10 @@ xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
/*
- * Get a copy of the remote partition's XPC variables.
+ * Get a copy of the remote partition's XPC variables from the reserved page.
*
* remote_vars points to a buffer that is cacheline aligned for BTE copies and
- * assumed to be of size XPC_VARS_ALIGNED_SIZE.
+ * assumed to be of size XPC_RP_VARS_SIZE.
*/
static enum xpc_retval
xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
@@ -508,7 +537,7 @@ xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
/* pull over the cross partition variables */
bres = xp_bte_copy(remote_vars_pa, ia64_tpa((u64) remote_vars),
- XPC_VARS_ALIGNED_SIZE,
+ XPC_RP_VARS_SIZE,
(BTE_NOTIFY | BTE_WACQUIRE), NULL);
if (bres != BTE_SUCCESS) {
return xpc_map_bte_errors(bres);
@@ -524,7 +553,56 @@ xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
/*
- * Prior code has determine the nasid which generated an IPI. Inspect
+ * Update the remote partition's info.
+ */
+static void
+xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version,
+ struct timespec *remote_rp_stamp, u64 remote_rp_pa,
+ u64 remote_vars_pa, struct xpc_vars *remote_vars)
+{
+ part->remote_rp_version = remote_rp_version;
+ dev_dbg(xpc_part, " remote_rp_version = 0x%016lx\n",
+ part->remote_rp_version);
+
+ part->remote_rp_stamp = *remote_rp_stamp;
+ dev_dbg(xpc_part, " remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n",
+ part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec);
+
+ part->remote_rp_pa = remote_rp_pa;
+ dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
+
+ part->remote_vars_pa = remote_vars_pa;
+ dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n",
+ part->remote_vars_pa);
+
+ part->last_heartbeat = remote_vars->heartbeat;
+ dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n",
+ part->last_heartbeat);
+
+ part->remote_vars_part_pa = remote_vars->vars_part_pa;
+ dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n",
+ part->remote_vars_part_pa);
+
+ part->remote_act_nasid = remote_vars->act_nasid;
+ dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n",
+ part->remote_act_nasid);
+
+ part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
+ dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n",
+ part->remote_act_phys_cpuid);
+
+ part->remote_amos_page_pa = remote_vars->amos_page_pa;
+ dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n",
+ part->remote_amos_page_pa);
+
+ part->remote_vars_version = remote_vars->version;
+ dev_dbg(xpc_part, " remote_vars_version = 0x%x\n",
+ part->remote_vars_version);
+}
+
+
+/*
+ * Prior code has determined the nasid which generated an IPI. Inspect
* that nasid to determine if its partition needs to be activated or
* deactivated.
*
@@ -542,8 +620,12 @@ xpc_identify_act_IRQ_req(int nasid)
{
struct xpc_rsvd_page *remote_rp;
struct xpc_vars *remote_vars;
- u64 remote_rsvd_page_pa;
+ u64 remote_rp_pa;
u64 remote_vars_pa;
+ int remote_rp_version;
+ int reactivate = 0;
+ int stamp_diff;
+ struct timespec remote_rp_stamp = { 0, 0 };
partid_t partid;
struct xpc_partition *part;
enum xpc_retval ret;
@@ -553,7 +635,7 @@ xpc_identify_act_IRQ_req(int nasid)
remote_rp = (struct xpc_rsvd_page *) xpc_remote_copy_buffer;
- ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rsvd_page_pa);
+ ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
if (ret != xpcSuccess) {
dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
"which sent interrupt, reason=%d\n", nasid, ret);
@@ -561,6 +643,10 @@ xpc_identify_act_IRQ_req(int nasid)
}
remote_vars_pa = remote_rp->vars_pa;
+ remote_rp_version = remote_rp->version;
+ if (XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
+ remote_rp_stamp = remote_rp->stamp;
+ }
partid = remote_rp->partid;
part = &xpc_partitions[partid];
@@ -586,44 +672,117 @@ xpc_identify_act_IRQ_req(int nasid)
"%ld:0x%lx\n", (int) nasid, (int) partid, part->act_IRQ_rcvd,
remote_vars->heartbeat, remote_vars->heartbeating_to_mask);
+ if (xpc_partition_disengaged(part) &&
+ part->act_state == XPC_P_INACTIVE) {
- if (part->act_state == XPC_P_INACTIVE) {
+ xpc_update_partition_info(part, remote_rp_version,
+ &remote_rp_stamp, remote_rp_pa,
+ remote_vars_pa, remote_vars);
- part->remote_rp_pa = remote_rsvd_page_pa;
- dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n",
- part->remote_rp_pa);
+ if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
+ if (xpc_partition_disengage_requested(1UL << partid)) {
+ /*
+ * Other side is waiting on us to disengage,
+ * even though we already have.
+ */
+ return;
+ }
+ } else {
+ /* other side doesn't support disengage requests */
+ xpc_clear_partition_disengage_request(1UL << partid);
+ }
- part->remote_vars_pa = remote_vars_pa;
- dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n",
- part->remote_vars_pa);
+ xpc_activate_partition(part);
+ return;
+ }
- part->last_heartbeat = remote_vars->heartbeat;
- dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n",
- part->last_heartbeat);
+ DBUG_ON(part->remote_rp_version == 0);
+ DBUG_ON(part->remote_vars_version == 0);
+
+ if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) {
+ DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part->
+ remote_vars_version));
+
+ if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
+ DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
+ version));
+ /* see if the other side rebooted */
+ if (part->remote_amos_page_pa ==
+ remote_vars->amos_page_pa &&
+ xpc_hb_allowed(sn_partition_id,
+ remote_vars)) {
+ /* doesn't look that way, so ignore the IPI */
+ return;
+ }
+ }
- part->remote_vars_part_pa = remote_vars->vars_part_pa;
- dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n",
- part->remote_vars_part_pa);
+ /*
+ * Other side rebooted and previous XPC didn't support the
+ * disengage request, so we don't need to do anything special.
+ */
- part->remote_act_nasid = remote_vars->act_nasid;
- dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n",
- part->remote_act_nasid);
+ xpc_update_partition_info(part, remote_rp_version,
+ &remote_rp_stamp, remote_rp_pa,
+ remote_vars_pa, remote_vars);
+ part->reactivate_nasid = nasid;
+ XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
+ return;
+ }
- part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
- dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n",
- part->remote_act_phys_cpuid);
+ DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version));
- part->remote_amos_page_pa = remote_vars->amos_page_pa;
- dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n",
- part->remote_amos_page_pa);
+ if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
+ DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
- xpc_activate_partition(part);
+ /*
+ * Other side rebooted and previous XPC did support the
+ * disengage request, but the new one doesn't.
+ */
+
+ xpc_clear_partition_engaged(1UL << partid);
+ xpc_clear_partition_disengage_request(1UL << partid);
- } else if (part->remote_amos_page_pa != remote_vars->amos_page_pa ||
- !XPC_HB_ALLOWED(sn_partition_id, remote_vars)) {
+ xpc_update_partition_info(part, remote_rp_version,
+ &remote_rp_stamp, remote_rp_pa,
+ remote_vars_pa, remote_vars);
+ reactivate = 1;
+
+ } else {
+ DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
+ stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp,
+ &remote_rp_stamp);
+ if (stamp_diff != 0) {
+ DBUG_ON(stamp_diff >= 0);
+
+ /*
+ * Other side rebooted and the previous XPC did support
+ * the disengage request, as does the new one.
+ */
+
+ DBUG_ON(xpc_partition_engaged(1UL << partid));
+ DBUG_ON(xpc_partition_disengage_requested(1UL <<
+ partid));
+
+ xpc_update_partition_info(part, remote_rp_version,
+ &remote_rp_stamp, remote_rp_pa,
+ remote_vars_pa, remote_vars);
+ reactivate = 1;
+ }
+ }
+
+ if (!xpc_partition_disengaged(part)) {
+ /* still waiting on other side to disengage from us */
+ return;
+ }
+
+ if (reactivate) {
part->reactivate_nasid = nasid;
XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
+
+ } else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) &&
+ xpc_partition_disengage_requested(1UL << partid)) {
+ XPC_DEACTIVATE_PARTITION(part, xpcOtherGoingDown);
}
}
@@ -643,14 +802,17 @@ xpc_identify_act_IRQ_sender(void)
u64 nasid; /* remote nasid */
int n_IRQs_detected = 0;
AMO_t *act_amos;
- struct xpc_rsvd_page *rp = (struct xpc_rsvd_page *) xpc_rsvd_page;
- act_amos = xpc_vars->act_amos;
+ act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS;
/* scan through act AMO variable looking for non-zero entries */
- for (word = 0; word < XP_NASID_MASK_WORDS; word++) {
+ for (word = 0; word < xp_nasid_mask_words; word++) {
+
+ if (xpc_exiting) {
+ break;
+ }
nasid_mask = xpc_IPI_receive(&act_amos[word]);
if (nasid_mask == 0) {
@@ -668,7 +830,7 @@ xpc_identify_act_IRQ_sender(void)
* remote nasid in our reserved pages machine mask.
* This is used in the event of module reload.
*/
- rp->mach_nasids[word] |= nasid_mask;
+ xpc_mach_nasids[word] |= nasid_mask;
/* locate the nasid(s) which sent interrupts */
@@ -688,6 +850,55 @@ xpc_identify_act_IRQ_sender(void)
/*
+ * See if the other side has responded to a partition disengage request
+ * from us.
+ */
+int
+xpc_partition_disengaged(struct xpc_partition *part)
+{
+ partid_t partid = XPC_PARTID(part);
+ int disengaged;
+
+
+ disengaged = (xpc_partition_engaged(1UL << partid) == 0);
+ if (part->disengage_request_timeout) {
+ if (!disengaged) {
+ if (jiffies < part->disengage_request_timeout) {
+ /* timelimit hasn't been reached yet */
+ return 0;
+ }
+
+ /*
+ * Other side hasn't responded to our disengage
+ * request in a timely fashion, so assume it's dead.
+ */
+
+ xpc_clear_partition_engaged(1UL << partid);
+ disengaged = 1;
+ }
+ part->disengage_request_timeout = 0;
+
+ /* cancel the timer function, provided it's not us */
+ if (!in_interrupt()) {
+ del_singleshot_timer_sync(&part->
+ disengage_request_timer);
+ }
+
+ DBUG_ON(part->act_state != XPC_P_DEACTIVATING &&
+ part->act_state != XPC_P_INACTIVE);
+ if (part->act_state != XPC_P_INACTIVE) {
+ xpc_wakeup_channel_mgr(part);
+ }
+
+ if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
+ xpc_cancel_partition_disengage_request(part);
+ }
+ }
+ return disengaged;
+}
+
+
+/*
* Mark specified partition as active.
*/
enum xpc_retval
@@ -721,7 +932,6 @@ xpc_deactivate_partition(const int line, struct xpc_partition *part,
enum xpc_retval reason)
{
unsigned long irq_flags;
- partid_t partid = XPC_PARTID(part);
spin_lock_irqsave(&part->act_lock, irq_flags);
@@ -749,17 +959,27 @@ xpc_deactivate_partition(const int line, struct xpc_partition *part,
spin_unlock_irqrestore(&part->act_lock, irq_flags);
- XPC_DISALLOW_HB(partid, xpc_vars);
+ if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
+ xpc_request_partition_disengage(part);
+ xpc_IPI_send_disengage(part);
- dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n", partid,
- reason);
+ /* set a timelimit on the disengage request */
+ part->disengage_request_timeout = jiffies +
+ (xpc_disengage_request_timelimit * HZ);
+ part->disengage_request_timer.expires =
+ part->disengage_request_timeout;
+ add_timer(&part->disengage_request_timer);
+ }
+
+ dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
+ XPC_PARTID(part), reason);
- xpc_partition_down(part, reason);
+ xpc_partition_going_down(part, reason);
}
/*
- * Mark specified partition as active.
+ * Mark specified partition as inactive.
*/
void
xpc_mark_partition_inactive(struct xpc_partition *part)
@@ -792,9 +1012,10 @@ xpc_discovery(void)
void *remote_rp_base;
struct xpc_rsvd_page *remote_rp;
struct xpc_vars *remote_vars;
- u64 remote_rsvd_page_pa;
+ u64 remote_rp_pa;
u64 remote_vars_pa;
int region;
+ int region_size;
int max_regions;
int nasid;
struct xpc_rsvd_page *rp;
@@ -804,7 +1025,8 @@ xpc_discovery(void)
enum xpc_retval ret;
- remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RSVD_PAGE_ALIGNED_SIZE,
+ remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
+ xp_nasid_mask_bytes,
GFP_KERNEL, &remote_rp_base);
if (remote_rp == NULL) {
return;
@@ -812,13 +1034,13 @@ xpc_discovery(void)
remote_vars = (struct xpc_vars *) remote_rp;
- discovered_nasids = kmalloc(sizeof(u64) * XP_NASID_MASK_WORDS,
+ discovered_nasids = kmalloc(sizeof(u64) * xp_nasid_mask_words,
GFP_KERNEL);
if (discovered_nasids == NULL) {
kfree(remote_rp_base);
return;
}
- memset(discovered_nasids, 0, sizeof(u64) * XP_NASID_MASK_WORDS);
+ memset(discovered_nasids, 0, sizeof(u64) * xp_nasid_mask_words);
rp = (struct xpc_rsvd_page *) xpc_rsvd_page;
@@ -827,11 +1049,19 @@ xpc_discovery(void)
* nodes that can comprise an access protection grouping. The access
* protection is in regards to memory, IOI and IPI.
*/
-//>>> move the next two #defines into either include/asm-ia64/sn/arch.h or
-//>>> include/asm-ia64/sn/addrs.h
-#define SH1_MAX_REGIONS 64
-#define SH2_MAX_REGIONS 256
- max_regions = is_shub2() ? SH2_MAX_REGIONS : SH1_MAX_REGIONS;
+ max_regions = 64;
+ region_size = sn_region_size;
+
+ switch (region_size) {
+ case 128:
+ max_regions *= 2;
+ case 64:
+ max_regions *= 2;
+ case 32:
+ max_regions *= 2;
+ region_size = 16;
+ DBUG_ON(!is_shub2());
+ }
for (region = 0; region < max_regions; region++) {
@@ -841,8 +1071,8 @@ xpc_discovery(void)
dev_dbg(xpc_part, "searching region %d\n", region);
- for (nasid = (region * sn_region_size * 2);
- nasid < ((region + 1) * sn_region_size * 2);
+ for (nasid = (region * region_size * 2);
+ nasid < ((region + 1) * region_size * 2);
nasid += 2) {
if ((volatile int) xpc_exiting) {
@@ -852,14 +1082,14 @@ xpc_discovery(void)
dev_dbg(xpc_part, "checking nasid %d\n", nasid);
- if (XPC_NASID_IN_ARRAY(nasid, rp->part_nasids)) {
+ if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) {
dev_dbg(xpc_part, "PROM indicates Nasid %d is "
"part of the local partition; skipping "
"region\n", nasid);
break;
}
- if (!(XPC_NASID_IN_ARRAY(nasid, rp->mach_nasids))) {
+ if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) {
dev_dbg(xpc_part, "PROM indicates Nasid %d was "
"not on Numa-Link network at reset\n",
nasid);
@@ -877,7 +1107,7 @@ xpc_discovery(void)
/* pull over the reserved page structure */
ret = xpc_get_remote_rp(nasid, discovered_nasids,
- remote_rp, &remote_rsvd_page_pa);
+ remote_rp, &remote_rp_pa);
if (ret != xpcSuccess) {
dev_dbg(xpc_part, "unable to get reserved page "
"from nasid %d, reason=%d\n", nasid,
@@ -948,6 +1178,13 @@ xpc_discovery(void)
remote_vars->act_nasid,
remote_vars->act_phys_cpuid);
+ if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
+ version)) {
+ part->remote_amos_page_pa =
+ remote_vars->amos_page_pa;
+ xpc_mark_partition_disengaged(part);
+ xpc_cancel_partition_disengage_request(part);
+ }
xpc_IPI_send_activate(remote_vars);
}
}
@@ -974,12 +1211,12 @@ xpc_initiate_partid_to_nasids(partid_t partid, void *nasid_mask)
return xpcPartitionDown;
}
- part_nasid_pa = part->remote_rp_pa +
- (u64) &((struct xpc_rsvd_page *) 0)->part_nasids;
+ memset(nasid_mask, 0, XP_NASID_MASK_BYTES);
+
+ part_nasid_pa = (u64) XPC_RP_PART_NASIDS(part->remote_rp_pa);
bte_res = xp_bte_copy(part_nasid_pa, ia64_tpa((u64) nasid_mask),
- L1_CACHE_ALIGN(XP_NASID_MASK_BYTES),
- (BTE_NOTIFY | BTE_WACQUIRE), NULL);
+ xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
return xpc_map_bte_errors(bte_res);
}
diff --git a/arch/ia64/sn/kernel/xpnet.c b/arch/ia64/sn/kernel/xpnet.c
index 78c13d676fa6..e5c6d3c0a8e9 100644
--- a/arch/ia64/sn/kernel/xpnet.c
+++ b/arch/ia64/sn/kernel/xpnet.c
@@ -130,7 +130,7 @@ struct net_device *xpnet_device;
*/
static u64 xpnet_broadcast_partitions;
/* protect above */
-static spinlock_t xpnet_broadcast_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(xpnet_broadcast_lock);
/*
* Since the Block Transfer Engine (BTE) is being used for the transfer
@@ -636,6 +636,10 @@ xpnet_init(void)
int result = -ENOMEM;
+ if (!ia64_platform_is("sn2")) {
+ return -ENODEV;
+ }
+
dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
/*