Diffstat (limited to 'arch/x86')
634 files changed, 64242 insertions, 17341 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 09521332636b..f4ed47db79ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,25 +18,28 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_IOREMAP_PROT select HAVE_KPROBES + select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER + select HAVE_ARCH_TRACEHOOK + select HAVE_GENERIC_DMA_COHERENT if X86_32 + select HAVE_EFFICIENT_UNALIGNED_ACCESS config ARCH_DEFCONFIG string default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 - -config GENERIC_LOCKBREAK - def_bool n - config GENERIC_TIME def_bool y @@ -89,7 +92,7 @@ config GENERIC_HWEIGHT def_bool y config GENERIC_GPIO - def_bool n + bool config ARCH_MAY_HAVE_PC_FDC def_bool y @@ -100,12 +103,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD -config ARCH_HAS_ILOG2_U32 - def_bool n - -config ARCH_HAS_ILOG2_U64 - def_bool n - config ARCH_HAS_CPU_IDLE_WAIT def_bool y @@ -119,6 +116,9 @@ config GENERIC_TIME_VSYSCALL config ARCH_HAS_CPU_RELAX def_bool y +config ARCH_HAS_DEFAULT_IDLE + def_bool y + config ARCH_HAS_CACHE_LINE_SIZE def_bool y @@ -147,9 +147,6 @@ config AUDIT_ARCH bool default X86_64 -config ARCH_SUPPORTS_AOUT - def_bool y - config ARCH_SUPPORTS_OPTIMIZED_INLINING def_bool y @@ -200,6 +197,7 @@ config X86_TRAMPOLINE config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" @@ -329,20 +327,6 @@ config X86_BIGSMP endif -config X86_RDC321X - bool "RDC R-321x SoC" - depends on X86_32 - select M486 - select X86_REBOOTFIXUPS - select GENERIC_GPIO - select LEDS_CLASS - select LEDS_GPIO - select NEW_LEDS - help - This option is needed for RDC R-321x system-on-chip, also known - as R-8610-(G). - If you don't have one of these chips, you should say N here. - config X86_VSMP bool "Support for ScaleMP vSMP" select PARAVIRT @@ -366,6 +350,16 @@ config X86_VISWS A kernel compiled for the Visual Workstation will run on general PCs as well. See <file:Documentation/sgi-visws.txt> for details. +config X86_RDC321X + bool "RDC R-321x SoC" + depends on X86_32 + select M486 + select X86_REBOOTFIXUPS + help + This option is needed for RDC R-321x system-on-chip, also known + as R-8610-(G). + If you don't have one of these chips, you should say N here. + config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" @@ -447,7 +441,6 @@ config PARAVIRT_DEBUG config MEMTEST bool "Memtest" - depends on X86_64 help This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -554,6 +547,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT config AMD_IOMMU bool "AMD IOMMU support" select SWIOTLB + select PCI_MSI depends on X86_64 && PCI && ACPI help With this option you can enable support for AMD IOMMU hardware in @@ -578,35 +572,29 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) + config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP + depends on X86_64 && SMP && BROKEN default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. 
-if MAXSMP -config NR_CPUS - int - default "4096" -endif - -if !MAXSMP config NR_CPUS - int "Maximum number of CPUs (2-4096)" - range 2 4096 + int "Maximum number of CPUs (2-512)" if !MAXSMP + range 2 512 depends on SMP + default "4096" if MAXSMP default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 4096 and the + kernel will support. The maximum supported value is 512 and the minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -endif config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" @@ -789,9 +777,8 @@ config I8K Say N otherwise. config X86_REBOOTFIXUPS - def_bool n - prompt "Enable X86 board specific fixups for reboot" - depends on X86_32 && X86 + bool "Enable X86 board specific fixups for reboot" + depends on X86_32 ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -807,23 +794,45 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + tristate "/dev/cpu/microcode - microcode support" select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on - Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, - Pentium III, Pentium 4, Xeon etc. You will obviously need the - actual microcode binary data itself which is not shipped with the - Linux kernel. + certain Intel and AMD processors. The Intel support is for the + IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, + Pentium 4, Xeon etc. The AMD support is for family 0x10 and + 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra. + You will obviously need the actual microcode binary data itself + which is not shipped with the Linux kernel. - For latest news and information on obtaining all the required - ingredients for this driver, check: - <http://www.urbanmyth.org/microcode/>. + This option selects the general module only; you need to select + at least one vendor specific module as well. To compile this driver as a module, choose M here: the module will be called microcode. -config MICROCODE_OLD_INTERFACE +config MICROCODE_INTEL + bool "Intel microcode patch loading support" + depends on MICROCODE + default MICROCODE + select FW_LOADER + ---help--- + This option enables microcode patch loading support for Intel + processors. + + For latest news and information on obtaining all the required + Intel ingredients for this driver, check: + <http://www.urbanmyth.org/microcode/>. + +config MICROCODE_AMD + bool "AMD microcode patch loading support" + depends on MICROCODE + select FW_LOADER + ---help--- + If you select this option, microcode patch loading support for AMD + processors will be enabled. + + config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE @@ -953,16 +962,17 @@ config HIGHMEM depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) config X86_PAE - def_bool n - prompt "PAE (Physical Address Extension) Support" + bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G - select RESOURCES_64BIT help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process.
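The split into MICROCODE_INTEL and MICROCODE_AMD above leaves MICROCODE_OLD_INTERFACE providing the legacy /dev/cpu/microcode character device, to which userspace writes a raw microcode image. A rough sketch of such a loader (illustrative only; error handling and the driver's exact write semantics are simplified):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct stat st;
	char *buf;
	int in, dev;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <microcode-image>\n", argv[0]);
		return 1;
	}
	in = open(argv[1], O_RDONLY);
	if (in < 0 || fstat(in, &st) < 0) {
		perror(argv[1]);
		return 1;
	}
	buf = malloc(st.st_size);
	if (!buf || read(in, buf, st.st_size) != st.st_size) {
		perror("read");
		return 1;
	}
	/* Hand the whole image to the kernel via the "old interface". */
	dev = open("/dev/cpu/microcode", O_WRONLY);
	if (dev < 0 || write(dev, buf, st.st_size) != st.st_size) {
		perror("/dev/cpu/microcode");
		return 1;
	}
	close(dev);
	close(in);
	return 0;
}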
+config ARCH_PHYS_ADDR_T_64BIT + def_bool X86_64 || X86_PAE + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" @@ -976,9 +986,9 @@ config NUMA local memory controller of the CPU and add some more NUMA awareness to the kernel. - For i386 this is currently highly experimental and should be only + For 32-bit this is currently highly experimental and should be only used for kernel development. It might also cause boot failures. - For x86_64 this is recommended on all multiprocessor Opteron systems. + For 64-bit this is recommended on all multiprocessor Opteron systems. If the system is EM64T, you should say N unless your system is EM64T NUMA. @@ -1021,17 +1031,10 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. -if MAXSMP - config NODES_SHIFT - int - default "9" -endif - -if !MAXSMP -config NODES_SHIFT - int "Maximum NUMA Nodes (as a power of 2)" + int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP range 1 9 if X86_64 + default "9" if MAXSMP default "6" if X86_64 default "4" if X86_NUMAQ default "3" @@ -1039,7 +1042,6 @@ config NODES_SHIFT help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -endif config HAVE_ARCH_BOOTMEM_NODE def_bool y @@ -1059,7 +1061,7 @@ config HAVE_ARCH_ALLOC_REMAP config ARCH_FLATMEM_ENABLE def_bool y - depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA + depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y @@ -1075,7 +1077,7 @@ config ARCH_SPARSEMEM_DEFAULT config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) + depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 @@ -1098,6 +1100,56 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config X86_CHECK_BIOS_CORRUPTION + bool "Check for low memory corruption" + help + Periodically check for memory corruption in low memory, which + is suspected to be caused by the BIOS. Even when enabled in the + configuration, it is disabled at runtime. Enable it by + setting "memory_corruption_check=1" on the kernel command + line. By default it scans the low 64k of memory every 60 + seconds; see the memory_corruption_check_size and + memory_corruption_check_period parameters in + Documentation/kernel-parameters.txt to adjust this. + + When enabled with the default parameters, this option has + almost no overhead, as it reserves a relatively small amount + of memory and scans it infrequently. It both detects corruption + and prevents it from affecting the running system. + + It is, however, intended as a diagnostic tool; if repeatable + BIOS-originated corruption always affects the same memory, + you can use memmap= to prevent the kernel from using that + memory. + +config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + bool "Set the default setting of memory_corruption_check" + depends on X86_CHECK_BIOS_CORRUPTION + default y + help + Set whether the default state of memory_corruption_check is + on or off. + +config X86_RESERVE_LOW_64K + bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" + default y + help + Reserve the first 64K of physical RAM on BIOSes that are known + to potentially corrupt that memory range.
A number of BIOSes are + known to utilize this area during suspend/resume, so it must not + be used by the kernel. + + Set this to N if you are absolutely sure that you trust the BIOS + to get all its memory reservations and usages right. + + If you have doubts about the BIOS (e.g. suspend/resume does not + work or there are kernel crashes after certain hardware hotplug + events) and it's not AMI or Phoenix, then you might want to enable + X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical + corruption patterns. + + Say Y if unsure. + config MATH_EMULATION bool prompt "Math emulation" if X86_32 @@ -1156,10 +1208,10 @@ config MTRR You can safely say Y even if your machine doesn't have MTRRs, you'll just add about 9 KB to your kernel. - See <file:Documentation/mtrr.txt> for more information. + See <file:Documentation/x86/mtrr.txt> for more information. config MTRR_SANITIZER - bool + def_bool y prompt "MTRR cleanup support" depends on MTRR help @@ -1170,7 +1222,7 @@ config MTRR_SANITIZER The largest mtrr entry size for a continuous block can be set with mtrr_chunk_size. - If unsure, say N. + If unsure, say Y. config MTRR_SANITIZER_ENABLE_DEFAULT int "MTRR cleanup enable value (0-1)" @@ -1205,8 +1257,7 @@ config X86_PAT If unsure, say Y. config EFI - def_bool n - prompt "EFI runtime service support" + bool "EFI runtime service support" depends on ACPI ---help--- This enables the kernel to use EFI runtime services that are @@ -1219,18 +1270,9 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms. -config IRQBALANCE - def_bool y - prompt "Enable kernel irq balancing" - depends on X86_32 && SMP && X86_IO_APIC - help - The default yes will allow the kernel to do irq load balancing. - Saying no will keep the kernel from doing irq load balancing. - config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS help This kernel feature is useful for number crunching applications that may need to compute untrusted bytecode during their @@ -1238,7 +1280,7 @@ config SECCOMP the process as file descriptors supporting the read/write syscalls, it's possible to isolate those applications in their own address space using seccomp. Once seccomp is - enabled via /proc/<pid>/seccomp, it cannot be disabled + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled and the task is only allowed to execute a few safe syscalls defined by each seccomp mode. @@ -1288,7 +1330,7 @@ config KEXEC strongly in flux, so no good recommendation can be made. config CRASH_DUMP - bool "kernel crash dumps (EXPERIMENTAL)" + bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) help Generate crash dump after being started by kexec. @@ -1301,6 +1343,14 @@ config CRASH_DUMP (CONFIG_RELOCATABLE=y). For more details see Documentation/kdump/kdump.txt +config KEXEC_JUMP + bool "kexec jump (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on KEXEC && HIBERNATION && X86_32 + help + Jump between the original kernel and the kexeced kernel and invoke + code in physical address mode via KEXEC + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) default "0x1000000" if X86_NUMAQ @@ -1387,14 +1437,14 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing.
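The SECCOMP help text above now points at prctl(PR_SET_SECCOMP) instead of the removed /proc/<pid>/seccomp file. A minimal userspace sketch of the call (strict mode; the constant fallback is only for older headers):

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef SECCOMP_MODE_STRICT
#define SECCOMP_MODE_STRICT 1	/* from linux/seccomp.h */
#endif

int main(void)
{
	/* One-way transition: once strict mode is on, only read(),
	 * write(), _exit() and sigreturn() are allowed; any other
	 * syscall terminates the task. */
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0) != 0) {
		perror("prctl(PR_SET_SECCOMP)");
		return 1;
	}
	write(STDOUT_FILENO, "sandboxed\n", 10);
	_exit(0);
}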
config HOTPLUG_CPU - bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER + bool "Support for hot-pluggable CPUs" + depends on SMP && HOTPLUG && !X86_VOYAGER ---help--- - Say Y here to experiment with turning CPUs off and on, and to - enable suspend on SMP systems. CPUs can be controlled through - /sys/devices/system/cpu. - Say N if you want to disable CPU hotplug and don't need to - suspend. + Say Y here to allow turning CPUs off and on. CPUs can be + controlled through /sys/devices/system/cpu. + ( Note: power management support will enable this option + automatically on SMP systems. ) + Say N if you want to disable CPU hotplug. config COMPAT_VDSO def_bool y @@ -1409,6 +1459,51 @@ config COMPAT_VDSO If unsure, say Y. +config CMDLINE_BOOL + bool "Built-in kernel command line" + default n + help + Allow for specifying boot arguments to the kernel at + build time. On some systems (e.g. embedded ones), it is + necessary or convenient to provide some or all of the + kernel boot arguments with the kernel itself (that is, + to not rely on the boot loader to provide them.) + + To compile command line arguments into the kernel, + set this option to 'Y', then fill in the + boot arguments in CONFIG_CMDLINE. + + Systems with fully functional boot loaders (i.e. non-embedded) + should leave this option set to 'N'. + +config CMDLINE + string "Built-in kernel command string" + depends on CMDLINE_BOOL + default "" + help + Enter arguments here that should be compiled into the kernel + image and used at boot time. If the boot loader provides a + command line at boot time, it is appended to this string to + form the full kernel command line, when the system boots. + + However, you can use the CONFIG_CMDLINE_OVERRIDE option to + change this behavior. + + In most cases, the command line (whether built-in or provided + by the boot loader) should specify the device for the root + file system. + +config CMDLINE_OVERRIDE + bool "Built-in command line overrides boot loader arguments" + default n + depends on CMDLINE_BOOL + help + Set this option to 'Y' to have the kernel ignore the boot loader + command line, and use ONLY the built-in command line. + + This is used to work around broken boot loaders. This should + be set to 'N' under normal conditions. + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG @@ -1567,6 +1662,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig" source "drivers/cpuidle/Kconfig" +source "drivers/idle/Kconfig" + endmenu @@ -1674,6 +1771,14 @@ config DMAR_FLOPPY_WA workaround will set up a 1:1 mapping for the first 16M to make floppy (an ISA device) work. +config INTR_REMAP + bool "Support for Interrupt Remapping (EXPERIMENTAL)" + depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL + help + Supports Interrupt remapping for IO-APIC and MSI devices. + To use x2apic mode in the CPUs which support x2APIC enhancements or + to support platforms with CPUs having > 8 bit APIC ID, say Y. + source "drivers/pci/pcie/Kconfig" source "drivers/pci/Kconfig" @@ -1790,7 +1895,7 @@ config IA32_EMULATION config IA32_AOUT tristate "IA32 a.out support" - depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT + depends on IA32_EMULATION help Support old a.out binaries in the 32bit emulation.
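The three CMDLINE_* options above interact as the help text describes: unless CMDLINE_OVERRIDE is set, the boot loader's arguments are appended to the built-in CONFIG_CMDLINE string. A stand-alone sketch of that merge (names and sample strings here are illustrative, not the kernel's actual setup code):

#include <stdio.h>
#include <string.h>

#define COMMAND_LINE_SIZE 2048

/* Stand-ins for CONFIG_CMDLINE and the string passed by the boot loader. */
static const char builtin_cmdline[] = "root=/dev/sda1 quiet";
static char boot_command_line[COMMAND_LINE_SIZE] = "console=ttyS0,115200";

static void merge_command_line(int cmdline_override)
{
	if (cmdline_override) {
		/* CMDLINE_OVERRIDE: the boot loader's arguments are ignored. */
		strncpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE - 1);
	} else if (builtin_cmdline[0]) {
		/* Default: boot loader arguments are appended after the
		 * built-in ones, forming the full command line. */
		char merged[COMMAND_LINE_SIZE];
		snprintf(merged, sizeof(merged), "%s %s",
			 builtin_cmdline, boot_command_line);
		strncpy(boot_command_line, merged, COMMAND_LINE_SIZE - 1);
	}
}

int main(void)
{
	merge_command_line(0);
	printf("full command line: %s\n", boot_command_line);
	return 0;
}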
@@ -1804,7 +1909,7 @@ config COMPAT_FOR_U64_ALIGNMENT config SYSVIPC_COMPAT def_bool y - depends on X86_64 && COMPAT && SYSVIPC + depends on COMPAT && SYSVIPC endmenu diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index abff1b84ed5b..0b7c4a3f0651 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -38,8 +38,7 @@ config M386 - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. - - "Winchip-2" for IDT Winchip 2. - - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. + - "Winchip-2" for IDT Winchips with 3dNow! capabilities. - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. @@ -194,19 +193,11 @@ config MWINCHIPC6 treat this chip as a 586TSC with some extended instructions and alignment requirements. -config MWINCHIP2 - bool "Winchip-2" - depends on X86_32 - help - Select this for an IDT Winchip-2. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. - config MWINCHIP3D - bool "Winchip-2A/Winchip-3" + bool "Winchip-2/Winchip-2A/Winchip-3" depends on X86_32 help - Select this for an IDT Winchip-2A or 3. Linux and GCC + Select this for an IDT Winchip-2, 2A or 3. Linux and GCC treat this chip as a 586TSC with some extended instructions and alignment requirements. Also enable out of order memory stores for this CPU, which can increase performance of some @@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 config X86_XADD @@ -360,11 +351,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 - -config X86_GOOD_APIC - def_bool y - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y @@ -372,7 +359,7 @@ config X86_INTEL_USERCOPY config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 config X86_USE_3DNOW def_bool y @@ -380,24 +367,27 @@ config X86_USE_3DNOW config X86_OOSTORE def_bool y - depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR + depends on (MWINCHIP3D || MWINCHIPC6) && MTRR # # P6_NOPs are a relatively minor optimization that require a family >= # 6 processor, except that it is broken on certain VIA chips. 
# Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). As a result, disallow these if we're -# compiling X86_GENERIC but not X86_64 (these NOPs do work on all -# x86-64 capable chips); the list of processors in the right-hand clause -# are the cores that benefit from this optimization. +# (which work on all CPUs). In addition, it looks like Virtual PC +# does not understand them. +# +# As a result, disallow these if we're not compiling for X86_64 (these +# NOPs do work on all x86-64 capable chips); the list of processors in +# the right-hand clause are the cores that benefit from this optimization. # config X86_P6_NOP def_bool y - depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) + depends on X86_64 + depends on (MCORE2 || MPENTIUM4 || MPSC) config X86_TSC def_bool y - depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 + depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 config X86_CMPXCHG64 def_bool y @@ -407,7 +397,7 @@ config X86_CMPXCHG64 # generates cmov. config X86_CMOV def_bool y - depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) config X86_MINIMUM_CPU_FAMILY int @@ -418,4 +408,124 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y - depends on !(M586MMX || M586TSC || M586 || M486 || M386) + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) + +menuconfig PROCESSOR_SELECT + bool "Supported processor vendors" if EMBEDDED + help + This lets you choose what x86 vendor support code your kernel + will include. + +config CPU_SUP_INTEL + default y + bool "Support Intel processors" if PROCESSOR_SELECT + help + This enables detection, tunings and quirks for Intel processors + + You need this enabled if you want your kernel to run on an + Intel CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on an Intel + CPU might render the kernel unbootable. + + If unsure, say N. + +config CPU_SUP_CYRIX_32 + default y + bool "Support Cyrix processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables detection, tunings and quirks for Cyrix processors + + You need this enabled if you want your kernel to run on a + Cyrix CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Cyrix + CPU might render the kernel unbootable. + + If unsure, say N. + +config CPU_SUP_AMD + default y + bool "Support AMD processors" if PROCESSOR_SELECT + help + This enables detection, tunings and quirks for AMD processors + + You need this enabled if you want your kernel to run on an + AMD CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on an AMD + CPU might render the kernel unbootable. + + If unsure, say N. 
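For context on the X86_P6_NOP change above: the P6 NOPs are the multi-byte 0F 1F instruction forms, as opposed to the 66-prefixed and mov-based padding that works on every CPU. A few representative encodings, transcribed here for illustration (assumed to match the nops.h definitions of this era):

/* Padding sequences usable on any x86 CPU: */
static const unsigned char generic_nop2[] = { 0x89, 0xf6 };	/* movl %esi,%esi */
static const unsigned char k8_nop2[] = { 0x66, 0x90 };		/* osp nop */

/* "P6" NOPs: need family >= 6, broken on some VIA parts and,
 * per the comment above, not understood by Virtual PC: */
static const unsigned char p6_nop3[] = { 0x0f, 0x1f, 0x00 };	/* nopl (%eax) */
static const unsigned char p6_nop5[] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };	/* nopl 0(%eax,%eax,1) */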
+ +config CPU_SUP_CENTAUR_32 + default y + bool "Support Centaur processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables detection, tunings and quirks for Centaur processors + + You need this enabled if you want your kernel to run on a + Centaur CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Centaur + CPU might render the kernel unbootable. + + If unsure, say N. + +config CPU_SUP_CENTAUR_64 + default y + bool "Support Centaur processors" if PROCESSOR_SELECT + depends on 64BIT + help + This enables detection, tunings and quirks for Centaur processors + + You need this enabled if you want your kernel to run on a + Centaur CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Centaur + CPU might render the kernel unbootable. + + If unsure, say N. + +config CPU_SUP_TRANSMETA_32 + default y + bool "Support Transmeta processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables detection, tunings and quirks for Transmeta processors + + You need this enabled if you want your kernel to run on a + Transmeta CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Transmeta + CPU might render the kernel unbootable. + + If unsure, say N. + +config CPU_SUP_UMC_32 + default y + bool "Support UMC processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables detection, tunings and quirks for UMC processors + + You need this enabled if you want your kernel to run on a + UMC CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a UMC + CPU might render the kernel unbootable. + + If unsure, say N. + +config X86_DS + bool "Debug Store support" + default y + help + Add support for Debug Store. + This allows the kernel to provide a memory buffer to the hardware + to store various profiling and tracing events. + +config X86_PTRACE_BTS + bool "ptrace interface to Branch Trace Store" + default y + depends on (X86_DS && X86_DEBUGCTLMSR) + help + Add a ptrace interface to allow collecting an execution trace + of the traced task. + This collects control flow changes in a (cyclic) buffer and allows + debuggers to fill in the gaps and show an execution trace of the debuggee. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ae36bfa814e5..2a3dfbd5e677 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config NONPROMISC_DEVMEM +config STRICT_DEVMEM bool "Filter access to /dev/mem" help - If this option is left off, you allow userspace access to all + If this option is disabled, you allow userspace (root) access to all of memory, including kernel and userspace memory. Accidental access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. + be used by people debugging the kernel. Note that with PAT support + enabled, even in this case there are restrictions on /dev/mem + use due to the cache aliasing requirements. If this option is switched on, the /dev/mem file only allows userspace access to PCI space and the BIOS code and data regions. @@ -41,6 +43,19 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally say N here, unless you want to debug such a crash.
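STRICT_DEVMEM (renamed above from NONPROMISC_DEVMEM) still leaves PCI space and the BIOS code/data regions accessible. A small userspace sketch that stays inside those limits by mapping the 64K BIOS area at physical 0xF0000 through /dev/mem:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/mem", O_RDONLY);
	if (fd < 0) {
		perror("/dev/mem");
		return 1;
	}
	/* The 64K BIOS code/data region remains readable even with
	 * STRICT_DEVMEM=y; mapping ordinary RAM here would be refused. */
	unsigned char *bios = mmap(NULL, 0x10000, PROT_READ, MAP_SHARED,
				   fd, 0xF0000);
	if (bios == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	printf("first bytes of the BIOS area: %02x %02x %02x %02x\n",
	       bios[0], bios[1], bios[2], bios[3]);
	munmap(bios, 0x10000);
	close(fd);
	return 0;
}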
+config EARLY_PRINTK_DBGP + bool "Early printk via EHCI debug port" + default n + depends on EARLY_PRINTK && PCI + help + Write kernel log output directly into the EHCI debug port. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server. You should normally say N here, + unless you want to debug such a crash. You need a USB debug device. + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL @@ -287,7 +302,6 @@ config CPA_DEBUG config OPTIMIZE_INLINING bool "Allow gcc to uninline functions marked 'inline'" - depends on BROKEN help This option determines if the kernel forces gcc to inline the functions developers have marked 'inline'. Doing so takes away freedom from gcc to @@ -298,5 +312,7 @@ config OPTIMIZE_INLINING become the default in the future; until then this option is there to test gcc for this. + If unsure, say N. + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 919ce21ea654..d1a47adb5aec 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -110,21 +110,16 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) mcore-y := arch/x86/mach-default/ # Voyager subarch support -mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager +mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ # generic subarchitecture -mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic +mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ -# RDC R-321x subarch support -mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x -mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/ -core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/ - # default subarch .h files -mflags-y += -Iinclude/asm-x86/mach-default +mflags-y += -Iarch/x86/include/asm/mach-default # 64 bit does not support subarch support - clear sub arch variables fcore-$(CONFIG_X86_64) := diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index e372b584e919..80177ec052f0 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) -cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) @@ -45,3 +44,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Bug fix for binutils: this option is required in order to keep +# binutils from generating NOPL instructions against our will.
+ifneq ($(CONFIG_X86_P6_NOP),y) +cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,) +endif diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 7ee102f9c4f8..cd48c7210016 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -$(obj)/zImage: IMAGE_OFFSET := 0x1000 $(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -$(obj)/bzImage: IMAGE_OFFSET := 0x100000 $(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ $(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b @@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE $(call if_changed,objcopy) $(obj)/compressed/vmlinux: FORCE - $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ + $(Q)$(MAKE) $(build)=$(obj)/compressed $@ # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel FDARGS = @@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE) mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ -no-emul-boot -boot-load-size 4 -boot-info-table \ $(obj)/isoimage + isohybrid $(obj)/image.iso 2>/dev/null || true rm -rf $(obj)/isoimage zlilo: $(BOOTIMAGE) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index a34b9982c7cb..cc0ef13fba7a 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -24,10 +24,14 @@ #include <linux/edd.h> #include <asm/boot.h> #include <asm/setup.h> +#include "bitops.h" +#include <asm/cpufeature.h> /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + extern struct setup_header hdr; extern struct boot_params boot_params; @@ -242,6 +246,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); /* cpu.c, cpucheck.c */ +struct cpu_features { + int level; /* Family, or 64 for x86-64 */ + int model; + u32 flags[NCAPINTS]; +}; +extern struct cpu_features cpu; int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); int validate_cpu(void); diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 92fdd35bd93e..1771c804e02f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -ifeq ($(CONFIG_X86_32),y) -targets += vmlinux.bin.all vmlinux.relocs -hostprogs-y := relocs +targets += vmlinux.bin.all vmlinux.relocs relocs +hostprogs-$(CONFIG_X86_32) += relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< @@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@ $(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE $(call if_changed,relocbin) +ifeq ($(CONFIG_X86_32),y) + ifdef CONFIG_RELOCATABLE $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE $(call if_changed,gzip) @@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T endif - $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index ba7736cf2ec7..29c5fbf08392 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -137,14 +137,15 @@ relocated: */ movl output_len(%ebx), %eax pushl %eax + # push 
arguments for decompress_kernel: pushl %ebp # output address movl input_len(%ebx), %eax pushl %eax # input_len leal input_data(%ebx), %eax pushl %eax # input_data leal boot_heap(%ebx), %eax - pushl %eax # heap area as third argument - pushl %esi # real mode pointer as second arg + pushl %eax # heap area + pushl %esi # real mode pointer call decompress_kernel addl $20, %esp popl %ecx diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index bc5553b496f7..da062216948a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -16,7 +16,7 @@ */ #undef CONFIG_PARAVIRT #ifdef CONFIG_X86_32 -#define _ASM_DESC_H_ 1 +#define _ASM_X86_DESC_H 1 #endif #ifdef CONFIG_X86_64 @@ -27,7 +27,7 @@ #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> @@ -182,8 +182,6 @@ static unsigned outcnt; static int fill_inbuf(void); static void flush_window(void); static void error(char *m); -static void gzip_mark(void **); -static void gzip_release(void **); /* * This is set up by the setup-routine at boot-time @@ -196,9 +194,6 @@ extern int input_len; static long bytes_out; -static void *malloc(int size); -static void free(void *where); - static void *memset(void *s, int c, unsigned n); static void *memcpy(void *dest, const void *src, unsigned n); @@ -220,40 +215,6 @@ static int lines, cols; #include "../../../../lib/inflate.c" -static void *malloc(int size) -{ - void *p; - - if (size < 0) - error("Malloc error"); - if (free_mem_ptr <= 0) - error("Memory error"); - - free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ - - p = (void *)free_mem_ptr; - free_mem_ptr += size; - - if (free_mem_ptr >= free_mem_end_ptr) - error("Out of memory"); - - return p; -} - -static void free(void *where) -{ /* Don't care */ -} - -static void gzip_mark(void **ptr) -{ - *ptr = (void *) free_mem_ptr; -} - -static void gzip_release(void **ptr) -{ - free_mem_ptr = (memptr) *ptr; -} - static void scroll(void) { int i; @@ -290,7 +251,7 @@ static void __putstr(int error, const char *s) y--; } } else { - vidmem [(x + cols * y) * 2] = c; + vidmem[(x + cols * y) * 2] = c; if (++x >= cols) { x = 0; if (++y >= lines) { @@ -316,7 +277,8 @@ static void *memset(void *s, int c, unsigned n) int i; char *ss = s; - for (i = 0; i < n; i++) ss[i] = c; + for (i = 0; i < n; i++) + ss[i] = c; return s; } @@ -326,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n) const char *s = src; char *d = dest; - for (i = 0; i < n; i++) d[i] = s[i]; + for (i = 0; i < n; i++) + d[i] = s[i]; return dest; } diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index a1310c52fc0c..857e492c571e 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) continue; } sh_symtab = sec_symtab->symtab; - sym_strtab = sec->link->strtab; + sym_strtab = sec_symtab->link->strtab; for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { Elf32_Rel *rel; Elf32_Sym *sym; diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 92d6fd73dc7d..6ec6bb6e9957 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -16,9 +16,6 @@ */ #include "boot.h" -#include "bitops.h" -#include <asm/cpufeature.h> - #include "cpustr.h" static char *cpu_name(int level) @@ -62,17 +59,18 @@ int validate_cpu(void) u32 e = err_flags[i]; for (j = 
0; j < 32; j++) { - int n = (i << 5)+j; - if (*msg_strs < n) { + if (msg_strs[0] < i || + (msg_strs[0] == i && msg_strs[1] < j)) { /* Skip to the next string */ - do { - msg_strs++; - } while (*msg_strs); - msg_strs++; + msg_strs += 2; + while (*msg_strs++) + ; } if (e & 1) { - if (*msg_strs == n && msg_strs[1]) - printf("%s ", msg_strs+1); + if (msg_strs[0] == i && + msg_strs[1] == j && + msg_strs[2]) + printf("%s ", msg_strs+2); else printf("%d:%d ", i, j); } diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 7804389ee005..4d3ff037201f 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -22,21 +22,13 @@ #ifdef _SETUP # include "boot.h" -# include "bitops.h" #endif #include <linux/types.h> -#include <asm/cpufeature.h> #include <asm/processor-flags.h> #include <asm/required-features.h> #include <asm/msr-index.h> -struct cpu_features { - int level; /* Family, or 64 for x86-64 */ - int model; - u32 flags[NCAPINTS]; -}; - -static struct cpu_features cpu; +struct cpu_features cpu; static u32 cpu_vendor[3]; static u32 err_flags[NCAPINTS]; @@ -46,12 +38,12 @@ static const u32 req_flags[NCAPINTS] = { REQUIRED_MASK0, REQUIRED_MASK1, - REQUIRED_MASK2, - REQUIRED_MASK3, + 0, /* REQUIRED_MASK2 not implemented in this file */ + 0, /* REQUIRED_MASK3 not implemented in this file */ REQUIRED_MASK4, - REQUIRED_MASK5, + 0, /* REQUIRED_MASK5 not implemented in this file */ REQUIRED_MASK6, - REQUIRED_MASK7, + 0, /* REQUIRED_MASK7 not implemented in this file */ }; #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 03399d64013b..1aae8f3e5ca1 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) char *mbrbuf_ptr, *mbrbuf_end; u32 buf_base, mbr_base; extern char _end[]; + u16 mbr_magic; sector_size = ei->params.bytes_per_sector; if (!sector_size) @@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) return -1; + memset(mbrbuf_ptr, 0, sector_size); if (read_mbr(devno, mbrbuf_ptr)) return -1; *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; - return 0; + mbr_magic = *(u16 *)&mbrbuf_ptr[510]; + + /* check for valid MBR magic */ + return mbr_magic == 0xAA55 ? 0 : -1; } static int get_edd_info(u8 devno, struct edd_info *ei) @@ -167,9 +172,8 @@ void query_edd(void) * Scan the BIOS-supported hard disks and query EDD * information... 
*/ - get_edd_info(devno, &ei); - - if (boot_params.eddbuf_entries < EDDMAXNR) { + if (!get_edd_info(devno, &ei) + && boot_params.eddbuf_entries < EDDMAXNR) { memcpy(edp, &ei, sizeof ei); edp++; boot_params.eddbuf_entries++; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index af86e431acfa..b993062e9a5f 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ /* to be loaded */ ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ -SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ #ifndef SVGA_MODE #define SVGA_MODE ASK_VGA diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 2296164b54d2..197421db1af1 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -73,6 +73,11 @@ static void keyboard_set_repeat(void) */ static void query_ist(void) { + /* Some older BIOSes apparently crash on this call, so filter + it from machines too old to have SpeedStep at all. */ + if (cpu.level < 6) + return; + asm("int $0x15" : "=a" (boot_params.ist_info.signature), "=b" (boot_params.ist_info.command), diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 53165c97336b..8c3c25f35578 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -13,7 +13,6 @@ */ #include "boot.h" -#include <linux/kernel.h> #define SMAP 0x534d4150 /* ASCII "SMAP" */ diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index bbe76953bae9..8ef60f20b371 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -15,33 +15,33 @@ #include <stdio.h> -#include "../kernel/cpu/feature_names.c" - -#if NCAPFLAGS > 8 -# error "Need to adjust the boot code handling of CPUID strings" -#endif +#include "../kernel/cpu/capflags.c" int main(void) { - int i; + int i, j; const char *str; printf("static const char x86_cap_strs[] = \n"); - for (i = 0; i < NCAPINTS*32; i++) { - str = x86_cap_flags[i]; - - if (i == NCAPINTS*32-1) { - /* The last entry must be unconditional; this - also consumes the compiler-added null character */ - if (!str) - str = ""; - printf("\t\"\\x%02x\"\"%s\"\n", i, str); - } else if (str) { - printf("#if REQUIRED_MASK%d & (1 << %d)\n" - "\t\"\\x%02x\"\"%s\\0\"\n" - "#endif\n", - i >> 5, i & 31, i, str); + for (i = 0; i < NCAPINTS; i++) { + for (j = 0; j < 32; j++) { + str = x86_cap_flags[i*32+j]; + + if (i == NCAPINTS-1 && j == 31) { + /* The last entry must be unconditional; this + also consumes the compiler-added null + character */ + if (!str) + str = ""; + printf("\t\"\\x%02x\\x%02x\"\"%s\"\n", + i, j, str); + } else if (str) { + printf("#if REQUIRED_MASK%d & (1 << %d)\n" + "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n" + "#endif\n", + i, j, i, j, str); + } } } printf("\t;\n"); diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 328956fdb59e..85a1cd8a8ff8 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -98,12 +98,6 @@ static void reset_coprocessor(void) /* * Set up the GDT */ -#define GDT_ENTRY(flags, base, limit) \ - (((u64)(base & 0xff000000) << 32) | \ - ((u64)flags << 40) | \ - ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffffff) << 16) | \ - ((u64)(limit & 0x0000ffff))) struct gdt_ptr { u16 len; diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 49f26aaaebc8..3fa979c9c363 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -17,7 +17,7 @@ #include "boot.h" #include "video.h" -__videocard video_bios; +static __videocard video_bios; 
/* Set a conventional BIOS mode */ static int set_bios_mode(u8 mode); @@ -119,7 +119,7 @@ static int bios_probe(void) return nmodes; } -__videocard video_bios = +static __videocard video_bios = { .card_name = "BIOS", .probe = bios_probe, diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 401ad998ad08..75115849af33 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -20,7 +20,7 @@ static struct vesa_general_info vginfo; static struct vesa_mode_info vminfo; -__videocard video_vesa; +static __videocard video_vesa; #ifndef _WAKEUP static void vesa_store_mode_params_graphics(void); @@ -88,14 +88,11 @@ static int vesa_probe(void) (vminfo.memory_layout == 4 || vminfo.memory_layout == 6) && vminfo.memory_planes == 1) { -#ifdef CONFIG_FB +#ifdef CONFIG_FB_BOOT_VESA_SUPPORT /* Graphics mode, color, linear frame buffer supported. Only register the mode if if framebuffer is configured, however, - otherwise the user will be left without a screen. - We don't require CONFIG_FB_VESA, however, since - some of the other framebuffer drivers can use - this mode-setting, too. */ + otherwise the user will be left without a screen. */ mi = GET_HEAP(struct mode_info, 1); mi->mode = mode + VIDEO_FIRST_VESA; mi->depth = vminfo.bpp; @@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode) if ((vminfo.mode_attr & 0x15) == 0x05) { /* It's a supported text mode */ is_graphic = 0; +#ifdef CONFIG_FB_BOOT_VESA_SUPPORT } else if ((vminfo.mode_attr & 0x99) == 0x99) { /* It's a graphics mode with linear frame buffer */ is_graphic = 1; vesa_mode |= 0x4000; /* Request linear frame buffer */ +#endif } else { return -1; /* Invalid mode */ } @@ -224,7 +223,7 @@ static void vesa_store_pm_info(void) static void vesa_store_mode_params_graphics(void) { /* Tell the kernel we're in VESA graphics mode */ - boot_params.screen_info.orig_video_isVGA = 0x23; + boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; /* Mode parameters */ boot_params.screen_info.vesa_attributes = vminfo.mode_attr; @@ -294,7 +293,7 @@ void vesa_store_edid(void) #endif /* not _WAKEUP */ -__videocard video_vesa = +static __videocard video_vesa = { .card_name = "VESA", .probe = vesa_probe, diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9bc34e2033ec..13b8c86ae985 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,13 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.26-rc1 -# Sun May 4 19:59:02 2008 +# Linux kernel version: 2.6.27-rc5 +# Wed Sep 3 17:23:09 2008 # # CONFIG_64BIT is not set CONFIG_X86_32=y # CONFIG_X86_64 is not set CONFIG_X86=y -CONFIG_DEFCONFIG_LIST="arch/x86/configs/i386_defconfig" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" # CONFIG_GENERIC_LOCKBREAK is not set CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -53,6 +53,7 @@ CONFIG_X86_HT=y CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y CONFIG_KTIME_SCALAR=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # General setup @@ -82,6 +83,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_NS=y # CONFIG_CGROUP_DEVICE is not set CONFIG_CPUSETS=y +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y CONFIG_GROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y # CONFIG_RT_GROUP_SCHED is not set @@ -105,7 +107,6 @@ CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y @@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y 
CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y # CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y @@ -132,27 +134,35 @@ CONFIG_MARKERS=y # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y CONFIG_KPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y CONFIG_KRETPROBES=y +CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y +# CONFIG_HAVE_ARCH_TRACEHOOK is not set # CONFIG_HAVE_DMA_ATTRS is not set +CONFIG_USE_GENERIC_SMP_HELPERS=y +# CONFIG_HAVE_CLK is not set CONFIG_PROC_PAGE_MONITOR=y +CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 CONFIG_MODULES=y +# CONFIG_MODULE_FORCE_LOAD is not set CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set -# CONFIG_KMOD is not set +CONFIG_KMOD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_LBD is not set CONFIG_BLK_DEV_IO_TRACE=y # CONFIG_LSF is not set CONFIG_BLK_DEV_BSG=y +# CONFIG_BLK_DEV_INTEGRITY is not set # # IO Schedulers @@ -176,25 +186,23 @@ CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y CONFIG_X86_PC=y # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set -# CONFIG_X86_NUMAQ is not set -# CONFIG_X86_SUMMIT is not set -# CONFIG_X86_BIGSMP is not set -# CONFIG_X86_VISWS is not set # CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_ES7000 is not set -# CONFIG_X86_RDC321X is not set # CONFIG_X86_VSMP is not set +# CONFIG_X86_RDC321X is not set CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set +# CONFIG_MEMTEST is not set # CONFIG_M386 is not set # CONFIG_M486 is not set # CONFIG_M586 is not set # CONFIG_M586TSC is not set # CONFIG_M586MMX is not set -# CONFIG_M686 is not set +CONFIG_M686=y # CONFIG_MPENTIUMII is not set # CONFIG_MPENTIUMIII is not set # CONFIG_MPENTIUMM is not set @@ -205,7 +213,6 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y # CONFIG_MCRUSOE is not set # CONFIG_MEFFICEON is not set # CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set # CONFIG_MGEODEGX1 is not set # CONFIG_MGEODE_LX is not set @@ -213,30 +220,30 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y # CONFIG_MVIAC3_2 is not set # CONFIG_MVIAC7 is not set # CONFIG_MPSC is not set -CONFIG_MCORE2=y +# CONFIG_MCORE2 is not set # CONFIG_GENERIC_CPU is not set -# CONFIG_X86_GENERIC is not set +CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_X86_XADD=y +# CONFIG_X86_PPRO_FENCE is not set CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y -CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_P6_NOP=y CONFIG_X86_TSC=y -CONFIG_X86_MINIMUM_CPU_FAMILY=6 +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=4 CONFIG_X86_DEBUGCTLMSR=y CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y CONFIG_DMI=y # CONFIG_IOMMU_HELPER is not set -CONFIG_NR_CPUS=4 -# CONFIG_SCHED_SMT is not set +CONFIG_NR_CPUS=64 +CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y @@ -247,8 +254,9 @@ CONFIG_X86_IO_APIC=y CONFIG_VM86=y # CONFIG_TOSHIBA is not set # CONFIG_I8K is not set -# CONFIG_X86_REBOOTFIXUPS is not set -# CONFIG_MICROCODE is not set +CONFIG_X86_REBOOTFIXUPS=y +CONFIG_MICROCODE=y +CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y # 
CONFIG_NOHIGHMEM is not set @@ -256,34 +264,29 @@ CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set CONFIG_PAGE_OFFSET=0xC0000000 CONFIG_HIGHMEM=y -CONFIG_NEED_NODE_MEMMAP_SIZE=y CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y CONFIG_SELECT_MEMORY_MODEL=y -# CONFIG_FLATMEM_MANUAL is not set +CONFIG_FLATMEM_MANUAL=y # CONFIG_DISCONTIGMEM_MANUAL is not set -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_HAVE_MEMORY_PRESENT=y +# CONFIG_SPARSEMEM_MANUAL is not set +CONFIG_FLATMEM=y +CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_SPARSEMEM_STATIC=y # CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set - -# -# Memory hotplug is currently incompatible with Software Suspend -# CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y -# CONFIG_HIGHPTE is not set +CONFIG_HIGHPTE=y # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y -# CONFIG_X86_PAT is not set +# CONFIG_MTRR_SANITIZER is not set +CONFIG_X86_PAT=y CONFIG_EFI=y -# CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set @@ -293,6 +296,7 @@ CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +# CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 CONFIG_RELOCATABLE=y CONFIG_PHYSICAL_ALIGN=0x200000 @@ -312,6 +316,7 @@ CONFIG_PM_TRACE_RTC=y CONFIG_PM_SLEEP_SMP=y CONFIG_PM_SLEEP=y CONFIG_SUSPEND=y +# CONFIG_PM_TEST_SUSPEND is not set CONFIG_SUSPEND_FREEZER=y CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" @@ -337,6 +342,7 @@ CONFIG_ACPI_THERMAL=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_EC=y +# CONFIG_ACPI_PCI_SLOT is not set CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y @@ -395,8 +401,8 @@ CONFIG_PCI=y # CONFIG_PCI_GOBIOS is not set # CONFIG_PCI_GOMMCONFIG is not set # CONFIG_PCI_GODIRECT is not set -CONFIG_PCI_GOANY=y # CONFIG_PCI_GOOLPC is not set +CONFIG_PCI_GOANY=y CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y @@ -448,10 +454,6 @@ CONFIG_HOTPLUG_PCI=y CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_AOUT is not set CONFIG_BINFMT_MISC=y - -# -# Networking -# CONFIG_NET=y # @@ -475,7 +477,10 @@ CONFIG_IP_FIB_HASH=y CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_ROUTE_MULTIPATH=y CONFIG_IP_ROUTE_VERBOSE=y -# CONFIG_IP_PNP is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set CONFIG_IP_MROUTE=y @@ -618,7 +623,6 @@ CONFIG_NET_SCHED=y # CONFIG_NET_SCH_HTB is not set # CONFIG_NET_SCH_HFSC is not set # CONFIG_NET_SCH_PRIO is not set -# CONFIG_NET_SCH_RR is not set # CONFIG_NET_SCH_RED is not set # CONFIG_NET_SCH_SFQ is not set # CONFIG_NET_SCH_TEQL is not set @@ -680,28 +684,19 @@ CONFIG_FIB_RULES=y CONFIG_CFG80211=y CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y +CONFIG_WIRELESS_EXT_SYSFS=y CONFIG_MAC80211=y # # Rate control algorithm selection # +CONFIG_MAC80211_RC_PID=y CONFIG_MAC80211_RC_DEFAULT_PID=y -# CONFIG_MAC80211_RC_DEFAULT_NONE is not set - -# -# Selecting 'y' for an algorithm will -# - -# -# build the algorithm into mac80211. 
-# CONFIG_MAC80211_RC_DEFAULT="pid" -CONFIG_MAC80211_RC_PID=y # CONFIG_MAC80211_MESH is not set CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set -# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set -# CONFIG_MAC80211_DEBUG is not set +# CONFIG_MAC80211_DEBUG_MENU is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set @@ -717,6 +712,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="" # CONFIG_DEBUG_DRIVER is not set CONFIG_DEBUG_DEVRES=y # CONFIG_SYS_HYPERVISOR is not set @@ -749,6 +746,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 # CONFIG_BLK_DEV_XIP is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set +# CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set @@ -760,10 +758,12 @@ CONFIG_MISC_DEVICES=y # CONFIG_FUJITSU_LAPTOP is not set # CONFIG_TC1100_WMI is not set # CONFIG_MSI_LAPTOP is not set +# CONFIG_COMPAL_LAPTOP is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set # CONFIG_INTEL_MENLOW is not set # CONFIG_ENCLOSURE_SERVICES is not set +# CONFIG_HP_ILO is not set CONFIG_HAVE_IDE=y # CONFIG_IDE is not set @@ -802,12 +802,13 @@ CONFIG_SCSI_WAIT_SCAN=m # CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_FC_ATTRS is not set -# CONFIG_SCSI_ISCSI_ATTRS is not set +CONFIG_SCSI_ISCSI_ATTRS=y # CONFIG_SCSI_SAS_ATTRS is not set # CONFIG_SCSI_SAS_LIBSAS is not set # CONFIG_SCSI_SRP_ATTRS is not set # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set +# CONFIG_SCSI_DH is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -842,7 +843,7 @@ CONFIG_PATA_AMD=y # CONFIG_PATA_CS5536 is not set # CONFIG_PATA_CYPRESS is not set # CONFIG_PATA_EFAR is not set -# CONFIG_ATA_GENERIC is not set +CONFIG_ATA_GENERIC=y # CONFIG_PATA_HPT366 is not set # CONFIG_PATA_HPT37X is not set # CONFIG_PATA_HPT3X2N is not set @@ -852,7 +853,7 @@ CONFIG_PATA_AMD=y # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set -# CONFIG_PATA_MPIIX is not set +CONFIG_PATA_MPIIX=y CONFIG_PATA_OLDPIIX=y # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NINJA32 is not set @@ -871,6 +872,7 @@ CONFIG_PATA_OLDPIIX=y # CONFIG_PATA_SIS is not set # CONFIG_PATA_VIA is not set # CONFIG_PATA_WINBOND is not set +CONFIG_PATA_SCH=y CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_LINEAR is not set @@ -894,13 +896,16 @@ CONFIG_DM_ZERO=y # # IEEE 1394 (FireWire) support # + +# +# Enable only one of the two stacks, unless you know what you are doing +# # CONFIG_FIREWIRE is not set # CONFIG_IEEE1394 is not set # CONFIG_I2O is not set CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -# CONFIG_NETDEVICES_MULTIQUEUE is not set # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -910,7 +915,23 @@ CONFIG_NETDEVICES=y # CONFIG_VETH is not set # CONFIG_NET_SB1000 is not set # CONFIG_ARCNET is not set -# CONFIG_PHYLIB is not set +CONFIG_PHYLIB=y + +# +# MII PHY device drivers +# +# CONFIG_MARVELL_PHY is not set +# CONFIG_DAVICOM_PHY is not set +# CONFIG_QSEMI_PHY is not set +# CONFIG_LXT_PHY is not set +# CONFIG_CICADA_PHY is not set +# CONFIG_VITESSE_PHY is not set +# CONFIG_SMSC_PHY is not set +# CONFIG_BROADCOM_PHY is not set +# CONFIG_ICPLUS_PHY is not set +# CONFIG_REALTEK_PHY is not set +# CONFIG_FIXED_PHY is not set +# CONFIG_MDIO_BITBANG is not set 
CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set @@ -943,10 +964,10 @@ CONFIG_FORCEDETH=y CONFIG_E100=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set -# CONFIG_NE2K_PCI is not set +CONFIG_NE2K_PCI=y # CONFIG_8139CP is not set CONFIG_8139TOO=y -CONFIG_8139TOO_PIO=y +# CONFIG_8139TOO_PIO is not set # CONFIG_8139TOO_TUNE_TWISTER is not set # CONFIG_8139TOO_8129 is not set # CONFIG_8139_OLD_RX_RESET is not set @@ -961,25 +982,24 @@ CONFIG_NETDEV_1000=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set -# CONFIG_E1000E is not set -# CONFIG_E1000E_ENABLED is not set +CONFIG_E1000E=y # CONFIG_IP1000 is not set # CONFIG_IGB is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set -# CONFIG_R8169 is not set +CONFIG_R8169=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set CONFIG_SKY2=y # CONFIG_SKY2_DEBUG is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y -# CONFIG_BNX2 is not set +CONFIG_BNX2=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set +# CONFIG_ATL1E is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set @@ -1019,13 +1039,14 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8180 is not set # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set +# CONFIG_MAC80211_HWSIM is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set -# CONFIG_IWLWIFI is not set +# CONFIG_ATH9K is not set # CONFIG_IWLCORE is not set # CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWL4965 is not set +# CONFIG_IWLAGN is not set # CONFIG_IWL3945 is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set @@ -1105,6 +1126,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y # CONFIG_MOUSE_PS2_TOUCHKIT is not set # CONFIG_MOUSE_SERIAL is not set # CONFIG_MOUSE_APPLETOUCH is not set +# CONFIG_MOUSE_BCM5974 is not set # CONFIG_MOUSE_VSXXXAA is not set CONFIG_INPUT_JOYSTICK=y # CONFIG_JOYSTICK_ANALOG is not set @@ -1139,12 +1161,14 @@ CONFIG_INPUT_TOUCHSCREEN=y # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set # CONFIG_TOUCHSCREEN_MTOUCH is not set +# CONFIG_TOUCHSCREEN_INEXIO is not set # CONFIG_TOUCHSCREEN_MK712 is not set # CONFIG_TOUCHSCREEN_PENMOUNT is not set # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set # CONFIG_TOUCHSCREEN_TOUCHWIN is not set # CONFIG_TOUCHSCREEN_UCB1400 is not set # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set +# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set CONFIG_INPUT_MISC=y # CONFIG_INPUT_PCSPKR is not set # CONFIG_INPUT_APANEL is not set @@ -1173,6 +1197,7 @@ CONFIG_SERIO_LIBPS2=y # Character devices # CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y CONFIG_VT_HW_CONSOLE_BINDING=y @@ -1223,8 +1248,8 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y -# CONFIG_HW_RANDOM_INTEL is not set -# CONFIG_HW_RANDOM_AMD is not set +CONFIG_HW_RANDOM_INTEL=y +CONFIG_HW_RANDOM_AMD=y CONFIG_HW_RANDOM_GEODE=y CONFIG_HW_RANDOM_VIA=y CONFIG_NVRAM=y @@ -1245,7 +1270,6 @@ CONFIG_NVRAM=y # CONFIG_CS5535_GPIO is not set # CONFIG_RAW_DRIVER is not set CONFIG_HPET=y -# CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set # CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set @@ -1254,43 +1278,64 @@ CONFIG_DEVPORT=y CONFIG_I2C=y CONFIG_I2C_BOARDINFO=y # CONFIG_I2C_CHARDEV is not set +CONFIG_I2C_HELPER_AUTO=y # # I2C Hardware Bus support # + +# +# PC SMBus host controller drivers +# # 
CONFIG_I2C_ALI1535 is not set # CONFIG_I2C_ALI1563 is not set # CONFIG_I2C_ALI15X3 is not set # CONFIG_I2C_AMD756 is not set # CONFIG_I2C_AMD8111 is not set CONFIG_I2C_I801=y -# CONFIG_I2C_I810 is not set +# CONFIG_I2C_ISCH is not set # CONFIG_I2C_PIIX4 is not set # CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIMTEC is not set -# CONFIG_SCx200_ACB is not set # CONFIG_I2C_SIS5595 is not set # CONFIG_I2C_SIS630 is not set # CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_TAOS_EVM is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_TINY_USB is not set # CONFIG_I2C_VIA is not set # CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_TINY_USB is not set + +# +# Graphics adapter I2C/DDC channel drivers +# # CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# # CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_STUB is not set +# CONFIG_SCx200_ACB is not set # # Miscellaneous I2C Chip support # # CONFIG_DS1682 is not set +# CONFIG_AT24 is not set # CONFIG_SENSORS_EEPROM is not set # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set # CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set @@ -1299,6 +1344,8 @@ CONFIG_I2C_I801=y # CONFIG_I2C_DEBUG_BUS is not set # CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set +CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y +# CONFIG_GPIOLIB is not set # CONFIG_W1 is not set CONFIG_POWER_SUPPLY=y # CONFIG_POWER_SUPPLY_DEBUG is not set @@ -1360,8 +1407,10 @@ CONFIG_SSB_POSSIBLE=y # # Multifunction device drivers # +# CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_MFD_TMIO is not set # # Multimedia devices @@ -1372,6 +1421,7 @@ CONFIG_SSB_POSSIBLE=y # # CONFIG_VIDEO_DEV is not set # CONFIG_DVB_CORE is not set +# CONFIG_VIDEO_MEDIA is not set # # Multimedia drivers @@ -1418,7 +1468,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y # CONFIG_FB_SYS_IMAGEBLIT is not set # CONFIG_FB_FOREIGN_ENDIAN is not set # CONFIG_FB_SYS_FOPS is not set -CONFIG_FB_DEFERRED_IO=y # CONFIG_FB_SVGALIB is not set # CONFIG_FB_MACMODES is not set # CONFIG_FB_BACKLIGHT is not set @@ -1463,6 +1512,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set # CONFIG_FB_GEODE is not set # CONFIG_FB_VIRTUAL is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y @@ -1470,6 +1520,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_BACKLIGHT_CORGI is not set # CONFIG_BACKLIGHT_PROGEAR is not set +# CONFIG_BACKLIGHT_MBP_NVIDIA is not set # # Display device support @@ -1482,22 +1533,13 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # CONFIG_FRAMEBUFFER_CONSOLE is not set CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_LOGO_LINUX_CLUT224=y - -# -# Sound -# CONFIG_SOUND=y - -# -# Advanced Linux Sound Architecture -# CONFIG_SND=y CONFIG_SND_TIMER=y CONFIG_SND_PCM=y @@ -1515,20 +1557,14 @@ CONFIG_SND_VERBOSE_PROCFS=y # 
CONFIG_SND_VERBOSE_PRINTK is not set # CONFIG_SND_DEBUG is not set CONFIG_SND_VMASTER=y - -# -# Generic devices -# +CONFIG_SND_DRIVERS=y # CONFIG_SND_PCSP is not set # CONFIG_SND_DUMMY is not set # CONFIG_SND_VIRMIDI is not set # CONFIG_SND_MTPAV is not set # CONFIG_SND_SERIAL_U16550 is not set # CONFIG_SND_MPU401 is not set - -# -# PCI devices -# +CONFIG_SND_PCI=y # CONFIG_SND_AD1889 is not set # CONFIG_SND_ALS300 is not set # CONFIG_SND_ALS4000 is not set @@ -1603,36 +1639,14 @@ CONFIG_SND_HDA_GENERIC=y # CONFIG_SND_VIRTUOSO is not set # CONFIG_SND_VX222 is not set # CONFIG_SND_YMFPCI is not set - -# -# USB devices -# +CONFIG_SND_USB=y # CONFIG_SND_USB_AUDIO is not set # CONFIG_SND_USB_USX2Y is not set # CONFIG_SND_USB_CAIAQ is not set - -# -# PCMCIA devices -# +CONFIG_SND_PCMCIA=y # CONFIG_SND_VXPOCKET is not set # CONFIG_SND_PDAUDIOCF is not set - -# -# System on Chip audio support -# # CONFIG_SND_SOC is not set - -# -# ALSA SoC audio for Freescale SOCs -# - -# -# SoC Audio for the Texas Instruments OMAP -# - -# -# Open Sound System -# # CONFIG_SOUND_PRIME is not set CONFIG_HID_SUPPORT=y CONFIG_HID=y @@ -1668,6 +1682,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DYNAMIC_MINORS is not set CONFIG_USB_SUSPEND=y # CONFIG_USB_OTG is not set +CONFIG_USB_MON=y # # USB Host Controller Drivers @@ -1691,6 +1706,7 @@ CONFIG_USB_UHCI_HCD=y # # CONFIG_USB_ACM is not set CONFIG_USB_PRINTER=y +# CONFIG_USB_WDM is not set # # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' @@ -1712,6 +1728,7 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_ALAUDA is not set # CONFIG_USB_STORAGE_ONETOUCH is not set # CONFIG_USB_STORAGE_KARMA is not set +# CONFIG_USB_STORAGE_SIERRA is not set # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set CONFIG_USB_LIBUSUAL=y @@ -1720,7 +1737,6 @@ CONFIG_USB_LIBUSUAL=y # # CONFIG_USB_MDC800 is not set # CONFIG_USB_MICROTEK is not set -CONFIG_USB_MON=y # # USB port drivers @@ -1733,7 +1749,6 @@ CONFIG_USB_MON=y # CONFIG_USB_EMI62 is not set # CONFIG_USB_EMI26 is not set # CONFIG_USB_ADUTUX is not set -# CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set @@ -1750,6 +1765,7 @@ CONFIG_USB_MON=y # CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set +# CONFIG_USB_ISIGHTFW is not set # CONFIG_USB_GADGET is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1759,7 +1775,9 @@ CONFIG_LEDS_CLASS=y # # LED drivers # +# CONFIG_LEDS_PCA9532 is not set # CONFIG_LEDS_CLEVO_MAIL is not set +# CONFIG_LEDS_PCA955X is not set # # LED Triggers @@ -1805,6 +1823,7 @@ CONFIG_RTC_INTF_DEV=y # CONFIG_RTC_DRV_PCF8583 is not set # CONFIG_RTC_DRV_M41T80 is not set # CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set # # SPI RTC drivers @@ -1837,11 +1856,13 @@ CONFIG_DMADEVICES=y # Firmware Drivers # # CONFIG_EDD is not set +CONFIG_FIRMWARE_MEMMAP=y CONFIG_EFI_VARS=y # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set CONFIG_DMIID=y -# CONFIG_ISCSI_IBFT_FIND is not set +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=y # # File systems @@ -1920,14 +1941,27 @@ CONFIG_HUGETLB_PAGE=y # CONFIG_CRAMFS is not set # CONFIG_VXFS_FS is not set # CONFIG_MINIX_FS is not set +# CONFIG_OMFS_FS is not set # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set CONFIG_NETWORK_FILESYSTEMS=y -# CONFIG_NFS_FS is not set +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y 
+CONFIG_ROOT_NFS=y # CONFIG_NFSD is not set +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=y +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +# CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set # CONFIG_CIFS is not set # CONFIG_NCP_FS is not set @@ -2001,9 +2035,9 @@ CONFIG_NLS_UTF8=y # Kernel hacking # CONFIG_TRACE_IRQFLAGS_SUPPORT=y -# CONFIG_PRINTK_TIME is not set -# CONFIG_ENABLE_WARN_DEPRECATED is not set -# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_PRINTK_TIME=y +CONFIG_ENABLE_WARN_DEPRECATED=y +CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_WARN=2048 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set @@ -2033,6 +2067,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_WRITECOUNT is not set +CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set CONFIG_FRAME_POINTER=y @@ -2043,23 +2078,32 @@ CONFIG_FRAME_POINTER=y # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set +CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_HAVE_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +# CONFIG_FTRACE is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SYSPROF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y # CONFIG_SAMPLES is not set -# CONFIG_KGDB is not set CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_NONPROMISC_DEVMEM is not set +# CONFIG_KGDB is not set +# CONFIG_STRICT_DEVMEM is not set +CONFIG_X86_VERBOSE_BOOTUP=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y # CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m # CONFIG_4KSTACKS is not set -CONFIG_X86_FIND_SMP_CONFIG=y -CONFIG_X86_MPPARSE=y CONFIG_DOUBLEFAULT=y +# CONFIG_MMIOTRACE is not set CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2071,6 +2115,7 @@ CONFIG_IO_DELAY_0X80=y CONFIG_DEFAULT_IO_DELAY_TYPE=0 CONFIG_DEBUG_BOOT_PARAMS=y # CONFIG_CPA_DEBUG is not set +CONFIG_OPTIMIZE_INLINING=y # # Security options @@ -2080,7 +2125,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y # CONFIG_SECURITY_NETWORK_XFRM is not set -CONFIG_SECURITY_CAPABILITIES=y CONFIG_SECURITY_FILE_CAPABILITIES=y # CONFIG_SECURITY_ROOTPLUG is not set CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 @@ -2141,6 +2185,10 @@ CONFIG_CRYPTO_HMAC=y # CONFIG_CRYPTO_MD4 is not set CONFIG_CRYPTO_MD5=y # CONFIG_CRYPTO_MICHAEL_MIC is not set +# CONFIG_CRYPTO_RMD128 is not set +# CONFIG_CRYPTO_RMD160 is not set +# CONFIG_CRYPTO_RMD256 is not set +# CONFIG_CRYPTO_RMD320 is not set CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_SHA256 is not set # CONFIG_CRYPTO_SHA512 is not set @@ -2151,7 +2199,7 @@ CONFIG_CRYPTO_SHA1=y # Ciphers # CONFIG_CRYPTO_AES=y -# CONFIG_CRYPTO_AES_586 is not set +CONFIG_CRYPTO_AES_586=y # CONFIG_CRYPTO_ANUBIS is not set CONFIG_CRYPTO_ARC4=y # CONFIG_CRYPTO_BLOWFISH is not set @@ -2193,6 +2241,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set +CONFIG_CRC_T10DIF=y # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y # CONFIG_CRC7 is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index ae5124e064d4..f0a03d7a7d63 100644 --- a/arch/x86/configs/x86_64_defconfig +++ 
b/arch/x86/configs/x86_64_defconfig @@ -1,13 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.26-rc1 -# Sun May 4 19:59:57 2008 +# Linux kernel version: 2.6.27-rc5 +# Wed Sep 3 17:13:39 2008 # CONFIG_64BIT=y # CONFIG_X86_32 is not set CONFIG_X86_64=y CONFIG_X86=y -CONFIG_DEFCONFIG_LIST="arch/x86/configs/x86_64_defconfig" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" # CONFIG_GENERIC_LOCKBREAK is not set CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -53,6 +53,7 @@ CONFIG_X86_HT=y CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y # CONFIG_KTIME_SCALAR is not set +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # General setup @@ -82,6 +83,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_NS=y # CONFIG_CGROUP_DEVICE is not set CONFIG_CPUSETS=y +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y CONFIG_GROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y # CONFIG_RT_GROUP_SCHED is not set @@ -105,7 +107,6 @@ CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y @@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y # CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y @@ -132,25 +134,33 @@ CONFIG_MARKERS=y # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y CONFIG_KPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y CONFIG_KRETPROBES=y +CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y +# CONFIG_HAVE_ARCH_TRACEHOOK is not set # CONFIG_HAVE_DMA_ATTRS is not set +CONFIG_USE_GENERIC_SMP_HELPERS=y +# CONFIG_HAVE_CLK is not set CONFIG_PROC_PAGE_MONITOR=y +# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 CONFIG_MODULES=y +# CONFIG_MODULE_FORCE_LOAD is not set CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set -# CONFIG_KMOD is not set +CONFIG_KMOD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y +# CONFIG_BLK_DEV_INTEGRITY is not set CONFIG_BLOCK_COMPAT=y # @@ -175,20 +185,15 @@ CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y CONFIG_X86_PC=y # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set -# CONFIG_X86_NUMAQ is not set -# CONFIG_X86_SUMMIT is not set -# CONFIG_X86_BIGSMP is not set -# CONFIG_X86_VISWS is not set # CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_ES7000 is not set -# CONFIG_X86_RDC321X is not set # CONFIG_X86_VSMP is not set # CONFIG_PARAVIRT_GUEST is not set -CONFIG_MEMTEST_BOOTPARAM=y -CONFIG_MEMTEST_BOOTPARAM_VALUE=0 +# CONFIG_MEMTEST is not set # CONFIG_M386 is not set # CONFIG_M486 is not set # CONFIG_M586 is not set @@ -205,7 +210,6 @@ CONFIG_MEMTEST_BOOTPARAM_VALUE=0 # CONFIG_MCRUSOE is not set # CONFIG_MEFFICEON is not set # CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set # CONFIG_MGEODEGX1 is not set # CONFIG_MGEODE_LX is not set @@ -213,18 +217,16 @@ CONFIG_MEMTEST_BOOTPARAM_VALUE=0 # CONFIG_MVIAC3_2 is not set # CONFIG_MVIAC7 is not set # CONFIG_MPSC is not set -CONFIG_MCORE2=y -# CONFIG_GENERIC_CPU is not set +# CONFIG_MCORE2 is not set +CONFIG_GENERIC_CPU=y CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=64 -CONFIG_X86_INTERNODE_CACHE_BYTES=64 +CONFIG_X86_L1_CACHE_BYTES=128 
+CONFIG_X86_INTERNODE_CACHE_BYTES=128 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_GOOD_APIC=y -CONFIG_X86_INTEL_USERCOPY=y -CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_P6_NOP=y +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y CONFIG_X86_CMOV=y CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y @@ -234,10 +236,11 @@ CONFIG_DMI=y CONFIG_GART_IOMMU=y CONFIG_CALGARY_IOMMU=y CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y +CONFIG_AMD_IOMMU=y CONFIG_SWIOTLB=y CONFIG_IOMMU_HELPER=y -CONFIG_NR_CPUS=4 -# CONFIG_SCHED_SMT is not set +CONFIG_NR_CPUS=64 +CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y @@ -246,7 +249,8 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y # CONFIG_X86_MCE is not set # CONFIG_I8K is not set -# CONFIG_MICROCODE is not set +CONFIG_MICROCODE=y +CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y CONFIG_NUMA=y @@ -281,7 +285,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_MTRR=y -# CONFIG_X86_PAT is not set +# CONFIG_MTRR_SANITIZER is not set +CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set @@ -313,6 +318,7 @@ CONFIG_PM_TRACE_RTC=y CONFIG_PM_SLEEP_SMP=y CONFIG_PM_SLEEP=y CONFIG_SUSPEND=y +# CONFIG_PM_TEST_SUSPEND is not set CONFIG_SUSPEND_FREEZER=y CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" @@ -339,6 +345,7 @@ CONFIG_ACPI_NUMA=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_EC=y +# CONFIG_ACPI_PCI_SLOT is not set CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y @@ -437,10 +444,6 @@ CONFIG_IA32_EMULATION=y CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_SYSVIPC_COMPAT=y - -# -# Networking -# CONFIG_NET=y # @@ -464,7 +467,10 @@ CONFIG_IP_FIB_HASH=y CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_ROUTE_MULTIPATH=y CONFIG_IP_ROUTE_VERBOSE=y -# CONFIG_IP_PNP is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set CONFIG_IP_MROUTE=y @@ -607,7 +613,6 @@ CONFIG_NET_SCHED=y # CONFIG_NET_SCH_HTB is not set # CONFIG_NET_SCH_HFSC is not set # CONFIG_NET_SCH_PRIO is not set -# CONFIG_NET_SCH_RR is not set # CONFIG_NET_SCH_RED is not set # CONFIG_NET_SCH_SFQ is not set # CONFIG_NET_SCH_TEQL is not set @@ -669,28 +674,19 @@ CONFIG_FIB_RULES=y CONFIG_CFG80211=y CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y +CONFIG_WIRELESS_EXT_SYSFS=y CONFIG_MAC80211=y # # Rate control algorithm selection # +CONFIG_MAC80211_RC_PID=y CONFIG_MAC80211_RC_DEFAULT_PID=y -# CONFIG_MAC80211_RC_DEFAULT_NONE is not set - -# -# Selecting 'y' for an algorithm will -# - -# -# build the algorithm into mac80211. 
-# CONFIG_MAC80211_RC_DEFAULT="pid" -CONFIG_MAC80211_RC_PID=y # CONFIG_MAC80211_MESH is not set CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set -# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set -# CONFIG_MAC80211_DEBUG is not set +# CONFIG_MAC80211_DEBUG_MENU is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set @@ -706,6 +702,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="" # CONFIG_DEBUG_DRIVER is not set CONFIG_DEBUG_DEVRES=y # CONFIG_SYS_HYPERVISOR is not set @@ -738,6 +736,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 # CONFIG_BLK_DEV_XIP is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set +# CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set @@ -748,10 +747,14 @@ CONFIG_MISC_DEVICES=y # CONFIG_ASUS_LAPTOP is not set # CONFIG_FUJITSU_LAPTOP is not set # CONFIG_MSI_LAPTOP is not set +# CONFIG_COMPAL_LAPTOP is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set # CONFIG_INTEL_MENLOW is not set # CONFIG_ENCLOSURE_SERVICES is not set +# CONFIG_SGI_XP is not set +# CONFIG_HP_ILO is not set +# CONFIG_SGI_GRU is not set CONFIG_HAVE_IDE=y # CONFIG_IDE is not set @@ -790,12 +793,13 @@ CONFIG_SCSI_WAIT_SCAN=m # CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_FC_ATTRS is not set -# CONFIG_SCSI_ISCSI_ATTRS is not set +CONFIG_SCSI_ISCSI_ATTRS=y # CONFIG_SCSI_SAS_ATTRS is not set # CONFIG_SCSI_SAS_LIBSAS is not set # CONFIG_SCSI_SRP_ATTRS is not set # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set +# CONFIG_SCSI_DH is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -857,6 +861,7 @@ CONFIG_PATA_OLDPIIX=y # CONFIG_PATA_SIS is not set # CONFIG_PATA_VIA is not set # CONFIG_PATA_WINBOND is not set +CONFIG_PATA_SCH=y CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_LINEAR is not set @@ -880,13 +885,16 @@ CONFIG_DM_ZERO=y # # IEEE 1394 (FireWire) support # + +# +# Enable only one of the two stacks, unless you know what you are doing +# # CONFIG_FIREWIRE is not set # CONFIG_IEEE1394 is not set # CONFIG_I2O is not set CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -# CONFIG_NETDEVICES_MULTIQUEUE is not set # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -896,7 +904,23 @@ CONFIG_NETDEVICES=y # CONFIG_VETH is not set # CONFIG_NET_SB1000 is not set # CONFIG_ARCNET is not set -# CONFIG_PHYLIB is not set +CONFIG_PHYLIB=y + +# +# MII PHY device drivers +# +# CONFIG_MARVELL_PHY is not set +# CONFIG_DAVICOM_PHY is not set +# CONFIG_QSEMI_PHY is not set +# CONFIG_LXT_PHY is not set +# CONFIG_CICADA_PHY is not set +# CONFIG_VITESSE_PHY is not set +# CONFIG_SMSC_PHY is not set +# CONFIG_BROADCOM_PHY is not set +# CONFIG_ICPLUS_PHY is not set +# CONFIG_REALTEK_PHY is not set +# CONFIG_FIXED_PHY is not set +# CONFIG_MDIO_BITBANG is not set CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set @@ -940,16 +964,15 @@ CONFIG_8139TOO_PIO=y # CONFIG_SIS900 is not set # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set +# CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set # CONFIG_SC92031 is not set CONFIG_NETDEV_1000=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set # CONFIG_E1000E is not set -# CONFIG_E1000E_ENABLED is not set # CONFIG_IP1000 is 
not set # CONFIG_IGB is not set # CONFIG_NS83820 is not set @@ -965,6 +988,7 @@ CONFIG_TIGON3=y # CONFIG_BNX2 is not set # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set +# CONFIG_ATL1E is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set @@ -1003,13 +1027,14 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8180 is not set # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set +# CONFIG_MAC80211_HWSIM is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set -# CONFIG_IWLWIFI is not set +# CONFIG_ATH9K is not set # CONFIG_IWLCORE is not set # CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWL4965 is not set +# CONFIG_IWLAGN is not set # CONFIG_IWL3945 is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set @@ -1088,6 +1113,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y # CONFIG_MOUSE_PS2_TOUCHKIT is not set # CONFIG_MOUSE_SERIAL is not set # CONFIG_MOUSE_APPLETOUCH is not set +# CONFIG_MOUSE_BCM5974 is not set # CONFIG_MOUSE_VSXXXAA is not set CONFIG_INPUT_JOYSTICK=y # CONFIG_JOYSTICK_ANALOG is not set @@ -1122,12 +1148,14 @@ CONFIG_INPUT_TOUCHSCREEN=y # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set # CONFIG_TOUCHSCREEN_MTOUCH is not set +# CONFIG_TOUCHSCREEN_INEXIO is not set # CONFIG_TOUCHSCREEN_MK712 is not set # CONFIG_TOUCHSCREEN_PENMOUNT is not set # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set # CONFIG_TOUCHSCREEN_TOUCHWIN is not set # CONFIG_TOUCHSCREEN_UCB1400 is not set # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set +# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set CONFIG_INPUT_MISC=y # CONFIG_INPUT_PCSPKR is not set # CONFIG_INPUT_APANEL is not set @@ -1155,6 +1183,7 @@ CONFIG_SERIO_LIBPS2=y # Character devices # CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y CONFIG_VT_HW_CONSOLE_BINDING=y @@ -1222,7 +1251,6 @@ CONFIG_NVRAM=y # CONFIG_PC8736x_GPIO is not set # CONFIG_RAW_DRIVER is not set CONFIG_HPET=y -# CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set # CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set @@ -1231,42 +1259,63 @@ CONFIG_DEVPORT=y CONFIG_I2C=y CONFIG_I2C_BOARDINFO=y # CONFIG_I2C_CHARDEV is not set +CONFIG_I2C_HELPER_AUTO=y # # I2C Hardware Bus support # + +# +# PC SMBus host controller drivers +# # CONFIG_I2C_ALI1535 is not set # CONFIG_I2C_ALI1563 is not set # CONFIG_I2C_ALI15X3 is not set # CONFIG_I2C_AMD756 is not set # CONFIG_I2C_AMD8111 is not set CONFIG_I2C_I801=y -# CONFIG_I2C_I810 is not set +# CONFIG_I2C_ISCH is not set # CONFIG_I2C_PIIX4 is not set # CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIMTEC is not set # CONFIG_I2C_SIS5595 is not set # CONFIG_I2C_SIS630 is not set # CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_TAOS_EVM is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_TINY_USB is not set # CONFIG_I2C_VIA is not set # CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_TINY_USB is not set + +# +# Graphics adapter I2C/DDC channel drivers +# # CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# # CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_STUB is not set # # Miscellaneous I2C Chip support # # CONFIG_DS1682 is not set +# 
CONFIG_AT24 is not set # CONFIG_SENSORS_EEPROM is not set # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set # CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set @@ -1275,6 +1324,8 @@ CONFIG_I2C_I801=y # CONFIG_I2C_DEBUG_BUS is not set # CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set +CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y +# CONFIG_GPIOLIB is not set # CONFIG_W1 is not set CONFIG_POWER_SUPPLY=y # CONFIG_POWER_SUPPLY_DEBUG is not set @@ -1335,8 +1386,10 @@ CONFIG_SSB_POSSIBLE=y # # Multifunction device drivers # +# CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_MFD_TMIO is not set # # Multimedia devices @@ -1347,6 +1400,7 @@ CONFIG_SSB_POSSIBLE=y # # CONFIG_VIDEO_DEV is not set # CONFIG_DVB_CORE is not set +# CONFIG_VIDEO_MEDIA is not set # # Multimedia drivers @@ -1387,7 +1441,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y # CONFIG_FB_SYS_IMAGEBLIT is not set # CONFIG_FB_FOREIGN_ENDIAN is not set # CONFIG_FB_SYS_FOPS is not set -CONFIG_FB_DEFERRED_IO=y # CONFIG_FB_SVGALIB is not set # CONFIG_FB_MACMODES is not set # CONFIG_FB_BACKLIGHT is not set @@ -1430,6 +1483,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set # CONFIG_FB_GEODE is not set # CONFIG_FB_VIRTUAL is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y @@ -1437,6 +1491,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_BACKLIGHT_CORGI is not set # CONFIG_BACKLIGHT_PROGEAR is not set +# CONFIG_BACKLIGHT_MBP_NVIDIA is not set # # Display device support @@ -1449,22 +1504,13 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # CONFIG_FRAMEBUFFER_CONSOLE is not set CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_LOGO_LINUX_CLUT224=y - -# -# Sound -# CONFIG_SOUND=y - -# -# Advanced Linux Sound Architecture -# CONFIG_SND=y CONFIG_SND_TIMER=y CONFIG_SND_PCM=y @@ -1482,20 +1528,14 @@ CONFIG_SND_VERBOSE_PROCFS=y # CONFIG_SND_VERBOSE_PRINTK is not set # CONFIG_SND_DEBUG is not set CONFIG_SND_VMASTER=y - -# -# Generic devices -# +CONFIG_SND_DRIVERS=y # CONFIG_SND_PCSP is not set # CONFIG_SND_DUMMY is not set # CONFIG_SND_VIRMIDI is not set # CONFIG_SND_MTPAV is not set # CONFIG_SND_SERIAL_U16550 is not set # CONFIG_SND_MPU401 is not set - -# -# PCI devices -# +CONFIG_SND_PCI=y # CONFIG_SND_AD1889 is not set # CONFIG_SND_ALS300 is not set # CONFIG_SND_ALS4000 is not set @@ -1568,36 +1608,14 @@ CONFIG_SND_HDA_GENERIC=y # CONFIG_SND_VIRTUOSO is not set # CONFIG_SND_VX222 is not set # CONFIG_SND_YMFPCI is not set - -# -# USB devices -# +CONFIG_SND_USB=y # CONFIG_SND_USB_AUDIO is not set # CONFIG_SND_USB_USX2Y is not set # CONFIG_SND_USB_CAIAQ is not set - -# -# PCMCIA devices -# +CONFIG_SND_PCMCIA=y # CONFIG_SND_VXPOCKET is not set # CONFIG_SND_PDAUDIOCF is not set - -# -# System on Chip audio support -# # CONFIG_SND_SOC is not set - -# -# ALSA SoC audio for Freescale SOCs -# - -# -# SoC Audio for the Texas Instruments OMAP -# - -# -# Open Sound System -# # CONFIG_SOUND_PRIME is not set CONFIG_HID_SUPPORT=y CONFIG_HID=y @@ -1633,6 +1651,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DYNAMIC_MINORS is not set CONFIG_USB_SUSPEND=y # CONFIG_USB_OTG is not set +CONFIG_USB_MON=y # # USB Host Controller Drivers @@ -1656,6 +1675,7 @@ 
CONFIG_USB_UHCI_HCD=y # # CONFIG_USB_ACM is not set CONFIG_USB_PRINTER=y +# CONFIG_USB_WDM is not set # # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' @@ -1677,6 +1697,7 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_ALAUDA is not set # CONFIG_USB_STORAGE_ONETOUCH is not set # CONFIG_USB_STORAGE_KARMA is not set +# CONFIG_USB_STORAGE_SIERRA is not set # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set CONFIG_USB_LIBUSUAL=y @@ -1685,7 +1706,6 @@ CONFIG_USB_LIBUSUAL=y # # CONFIG_USB_MDC800 is not set # CONFIG_USB_MICROTEK is not set -CONFIG_USB_MON=y # # USB port drivers @@ -1698,7 +1718,6 @@ CONFIG_USB_MON=y # CONFIG_USB_EMI62 is not set # CONFIG_USB_EMI26 is not set # CONFIG_USB_ADUTUX is not set -# CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set @@ -1715,6 +1734,7 @@ CONFIG_USB_MON=y # CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set +# CONFIG_USB_ISIGHTFW is not set # CONFIG_USB_GADGET is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1724,7 +1744,9 @@ CONFIG_LEDS_CLASS=y # # LED drivers # +# CONFIG_LEDS_PCA9532 is not set # CONFIG_LEDS_CLEVO_MAIL is not set +# CONFIG_LEDS_PCA955X is not set # # LED Triggers @@ -1770,6 +1792,7 @@ CONFIG_RTC_INTF_DEV=y # CONFIG_RTC_DRV_PCF8583 is not set # CONFIG_RTC_DRV_M41T80 is not set # CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set # # SPI RTC drivers @@ -1802,11 +1825,13 @@ CONFIG_DMADEVICES=y # Firmware Drivers # # CONFIG_EDD is not set +CONFIG_FIRMWARE_MEMMAP=y CONFIG_EFI_VARS=y # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set CONFIG_DMIID=y -# CONFIG_ISCSI_IBFT_FIND is not set +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=y # # File systems @@ -1886,14 +1911,27 @@ CONFIG_HUGETLB_PAGE=y # CONFIG_CRAMFS is not set # CONFIG_VXFS_FS is not set # CONFIG_MINIX_FS is not set +# CONFIG_OMFS_FS is not set # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set CONFIG_NETWORK_FILESYSTEMS=y -# CONFIG_NFS_FS is not set +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y # CONFIG_NFSD is not set +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=y +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +# CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set # CONFIG_CIFS is not set # CONFIG_NCP_FS is not set @@ -1967,9 +2005,9 @@ CONFIG_NLS_UTF8=y # Kernel hacking # CONFIG_TRACE_IRQFLAGS_SUPPORT=y -# CONFIG_PRINTK_TIME is not set -# CONFIG_ENABLE_WARN_DEPRECATED is not set -# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_PRINTK_TIME=y +CONFIG_ENABLE_WARN_DEPRECATED=y +CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_WARN=2048 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set @@ -1998,6 +2036,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_WRITECOUNT is not set +CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set CONFIG_FRAME_POINTER=y @@ -2008,11 +2047,20 @@ CONFIG_FRAME_POINTER=y # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set +CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_HAVE_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +# CONFIG_FTRACE is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SYSPROF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set 
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y # CONFIG_SAMPLES is not set -# CONFIG_KGDB is not set CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_NONPROMISC_DEVMEM is not set +# CONFIG_KGDB is not set +# CONFIG_STRICT_DEVMEM is not set +CONFIG_X86_VERBOSE_BOOTUP=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y @@ -2023,8 +2071,8 @@ CONFIG_DEBUG_RODATA=y # CONFIG_DIRECT_GBPAGES is not set # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m -CONFIG_X86_MPPARSE=y # CONFIG_IOMMU_DEBUG is not set +# CONFIG_MMIOTRACE is not set CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2036,6 +2084,7 @@ CONFIG_IO_DELAY_0X80=y CONFIG_DEFAULT_IO_DELAY_TYPE=0 CONFIG_DEBUG_BOOT_PARAMS=y # CONFIG_CPA_DEBUG is not set +CONFIG_OPTIMIZE_INLINING=y # # Security options @@ -2045,7 +2094,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y # CONFIG_SECURITY_NETWORK_XFRM is not set -CONFIG_SECURITY_CAPABILITIES=y CONFIG_SECURITY_FILE_CAPABILITIES=y # CONFIG_SECURITY_ROOTPLUG is not set CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 @@ -2106,6 +2154,10 @@ CONFIG_CRYPTO_HMAC=y # CONFIG_CRYPTO_MD4 is not set CONFIG_CRYPTO_MD5=y # CONFIG_CRYPTO_MICHAEL_MIC is not set +# CONFIG_CRYPTO_RMD128 is not set +# CONFIG_CRYPTO_RMD160 is not set +# CONFIG_CRYPTO_RMD256 is not set +# CONFIG_CRYPTO_RMD320 is not set CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_SHA256 is not set # CONFIG_CRYPTO_SHA512 is not set @@ -2155,6 +2207,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set +CONFIG_CRC_T10DIF=y # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y # CONFIG_CRC7 is not set diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3874c2de5403..903de4aa5094 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o + aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c new file mode 100644 index 000000000000..070afc5b6c94 --- /dev/null +++ b/arch/x86/crypto/crc32c-intel.c @@ -0,0 +1,197 @@ +/* + * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal. + * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) + * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2A: Instruction Set Reference, A-M + * + * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com> + * Copyright (c) 2008 Kent Liu <kent.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <crypto/internal/hash.h> + +#include <asm/cpufeature.h> + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define SCALE_F sizeof(unsigned long) + +#ifdef CONFIG_X86_64 +#define REX_PRE "0x48, " +#else +#define REX_PRE +#endif + +static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) +{ + while (length--) { + __asm__ __volatile__( + ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" + :"=S"(crc) + :"0"(crc), "c"(*data) + ); + data++; + } + + return crc; +} + +static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient = len / SCALE_F; + unsigned int iremainder = len % SCALE_F; + unsigned long *ptmp = (unsigned long *)p; + + while (iquotient--) { + __asm__ __volatile__( + ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" + :"=S"(crc) + :"0"(crc), "c"(*ptmp) + ); + ptmp++; + } + + if (iremainder) + crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, + iremainder); + + return crc; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_ahash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_intel_init(struct ahash_request *req) +{ + u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); + u32 *crcp = ahash_request_ctx(req); + + *crcp = *mctx; + + return 0; +} + +static int crc32c_intel_update(struct ahash_request *req) +{ + struct crypto_hash_walk walk; + u32 *crcp = ahash_request_ctx(req); + u32 crc = *crcp; + int nbytes; + + for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; + nbytes = crypto_hash_walk_done(&walk, 0)) + crc = crc32c_intel_le_hw(crc, walk.data, nbytes); + + *crcp = crc; + return 0; +} + +static int crc32c_intel_final(struct ahash_request *req) +{ + u32 *crcp = ahash_request_ctx(req); + + *(__le32 *)req->result = ~cpu_to_le32p(crcp); + return 0; +} + +static int crc32c_intel_digest(struct ahash_request *req) +{ + struct crypto_hash_walk walk; + u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); + u32 crc = *mctx; + int nbytes; + + for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; + nbytes = crypto_hash_walk_done(&walk, 0)) + crc = crc32c_intel_le_hw(crc, walk.data, nbytes); + + *(__le32 *)req->result = ~cpu_to_le32(crc); + return 0; +} + +static int crc32c_intel_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + + tfm->crt_ahash.reqsize = sizeof(u32); + + return 0; +} + +static struct crypto_alg alg = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-intel", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_AHASH, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_alignmask = 3, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_init = crc32c_intel_cra_init, + .cra_type = &crypto_ahash_type, + .cra_u = { + .ahash = { + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = crc32c_intel_setkey, + .init = crc32c_intel_init, + .update = crc32c_intel_update, + .final = crc32c_intel_final, + .digest = crc32c_intel_digest, + } + } +}; + + +static int __init crc32c_intel_mod_init(void) +{ 
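+	/*
+	 * cpu_has_xmm4_2 tests the SSE4.2 feature bit reported by CPUID;
+	 * the CRC32 instruction emitted by the .byte sequences above only
+	 * exists on SSE4.2-capable processors, so registration fails with
+	 * -ENODEV everywhere else.
+	 */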
+ if (cpu_has_xmm4_2) + return crypto_register_alg(&alg); + else + return -ENODEV; +} + +static void __exit crc32c_intel_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(crc32c_intel_mod_init); +module_exit(crc32c_intel_mod_fini); + +MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>"); +MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crc32c"); +MODULE_ALIAS("crc32c-intel"); + diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 58cccb6483b0..127ec3f07214 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump) dump->regs.ax = regs->ax; dump->regs.ds = current->thread.ds; dump->regs.es = current->thread.es; - asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; - asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; + savesegment(fs, fs); + dump->regs.fs = fs; + savesegment(gs, gs); + dump->regs.gs = gs; dump->regs.orig_ax = regs->orig_ax; dump->regs.ip = regs->ip; dump->regs.cs = regs->cs; @@ -430,8 +432,9 @@ beyond_if: current->mm->start_stack = (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); /* start thread */ - asm volatile("movl %0,%%fs" :: "r" (0)); \ - asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); + loadsegment(fs, 0); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); load_gs_index(0); (regs)->ip = ex.a_entry; (regs)->sp = current->mm->start_stack; @@ -441,12 +444,6 @@ beyond_if: regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; set_fs(USER_DS); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } return 0; } diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index cb3856a18c85..4bc02b23674b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -36,6 +36,11 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) +#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ + X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ + X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ + X86_EFLAGS_CF) + asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); @@ -174,9 +179,10 @@ struct sigframe u32 pretcode; int sig; struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate; + struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */ unsigned int extramask[_COMPAT_NSIG_WORDS-1]; char retcode[8]; + /* fp state follows here */ }; struct rt_sigframe @@ -187,8 +193,8 @@ struct rt_sigframe u32 puc; compat_siginfo_t info; struct ucontext_ia32 uc; - struct _fpstate_ia32 fpstate; char retcode[8]; + /* fp state follows here */ }; #define COPY(x) { \ @@ -201,7 +207,7 @@ struct rt_sigframe { unsigned int cur; \ unsigned short pre; \ err |= __get_user(pre, &sc->seg); \ - asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ + savesegment(seg, cur); \ pre |= mask; \ if (pre != cur) loadsegment(seg, pre); } @@ -210,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, unsigned int *peax) { unsigned int tmpflags, gs, oldgs, err = 0; - struct _fpstate_ia32 __user *buf; + void __user *buf; u32 tmp; /* Always make any pending restarted system calls return -EINTR */ @@ -230,7 +236,7 @@ static int 
ia32_restore_sigcontext(struct pt_regs *regs, */ err |= __get_user(gs, &sc->gs); gs |= 3; - asm("movl %%gs,%0" : "=r" (oldgs)); + savesegment(gs, oldgs); if (gs != oldgs) load_gs_index(gs); @@ -248,32 +254,18 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, regs->ss |= 3; err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); /* disable syscall checks */ regs->orig_ax = -1; err |= __get_user(tmp, &sc->fpstate); buf = compat_ptr(tmp); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387_ia32(buf); - } else { - struct task_struct *me = current; - - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } + err |= restore_i387_xstate_ia32(buf); err |= __get_user(tmp, &sc->ax); *peax = tmp; return err; - -badframe: - return 1; } asmlinkage long sys32_sigreturn(struct pt_regs *regs) @@ -345,46 +337,42 @@ badframe: */ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, - struct _fpstate_ia32 __user *fpstate, + void __user *fpstate, struct pt_regs *regs, unsigned int mask) { int tmp, err = 0; - tmp = 0; - __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(fs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->fs); - __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); + savesegment(ds, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->ds); - __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); + savesegment(es, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->es); - err |= __put_user((u32)regs->di, &sc->di); - err |= __put_user((u32)regs->si, &sc->si); - err |= __put_user((u32)regs->bp, &sc->bp); - err |= __put_user((u32)regs->sp, &sc->sp); - err |= __put_user((u32)regs->bx, &sc->bx); - err |= __put_user((u32)regs->dx, &sc->dx); - err |= __put_user((u32)regs->cx, &sc->cx); - err |= __put_user((u32)regs->ax, &sc->ax); - err |= __put_user((u32)regs->cs, &sc->cs); - err |= __put_user((u32)regs->ss, &sc->ss); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->ip, &sc->ip); - err |= __put_user((u32)regs->flags, &sc->flags); - err |= __put_user((u32)regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); - tmp = save_i387_ia32(fpstate); + tmp = save_i387_xstate_ia32(fpstate); if (tmp < 0) err = -EFAULT; - else { - clear_used_math(); - stts(); + else err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), &sc->fpstate); - } /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); @@ -397,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, * Determine which stack to use.. 
*/ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, - size_t frame_size) + size_t frame_size, + void **fpstate) { unsigned long sp; @@ -416,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, ka->sa.sa_restorer) sp = (unsigned long) ka->sa.sa_restorer; + if (used_math()) { + sp = sp - sig_xstate_ia32_size; + *fpstate = (struct _fpstate_ia32 *) sp; + } + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ @@ -429,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, struct sigframe __user *frame; void __user *restorer; int err = 0; + void __user *fpstate = NULL; /* copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -443,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, 0, }; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - err |= __put_user(sig, &frame->sig); - if (err) - goto give_sigsegv; + if (__put_user(sig, &frame->sig)) + return -EFAULT; - err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, - set->sig[0]); - if (err) - goto give_sigsegv; + if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; if (_COMPAT_NSIG_WORDS > 1) { - err |= __copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) - goto give_sigsegv; + if (__copy_to_user(frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; } if (ka->sa.sa_flags & SA_RESTORER) { @@ -482,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -493,8 +484,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->dx = 0; regs->cx = 0; - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); regs->cs = __USER32_CS; regs->ss = __USER32_DS; @@ -505,19 +496,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, compat_sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - struct exec_domain *ed = current_thread_info()->exec_domain; void __user *restorer; int err = 0; + void __user *fpstate = NULL; /* __copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -533,31 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 0, }; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - err |= __put_user((ed && ed->signal_invmap && sig < 32 - ? ed->signal_invmap[sig] : sig), &frame->sig); + err |= __put_user(sig, &frame->sig); err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); err |= copy_siginfo_to_user32(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. 
*/ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -572,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -588,8 +577,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->dx = (unsigned long) &frame->info; regs->cx = (unsigned long) &frame->uc; - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); regs->cs = __USER32_CS; regs->ss = __USER32_DS; @@ -600,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 20371d0635e4..256b00b61892 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -15,6 +15,16 @@ #include <asm/irqflags.h> #include <linux/linkage.h> +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysexit_audit int_ret_from_sys_call +#define sysretl_audit int_ret_from_sys_call +#endif + #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) .macro IA32_ARG_FIXUP noebp=0 @@ -29,24 +39,27 @@ .endm /* clobbers %eax */ - .macro CLEAR_RREGS + .macro CLEAR_RREGS _r9=rax xorl %eax,%eax movq %rax,R11(%rsp) movq %rax,R10(%rsp) - movq %rax,R9(%rsp) + movq %\_r9,R9(%rsp) movq %rax,R8(%rsp) .endm - .macro LOAD_ARGS32 offset - movl \offset(%rsp),%r11d - movl \offset+8(%rsp),%r10d + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. 
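+	 * %r9 is reloaded only when the caller passes _r9=1: on the cstar
+	 * path the sixth syscall argument lives in %r9, while the sysenter
+	 * path re-fetches it from user %ebp via IA32_ARG_FIXUP instead.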
+ */ + .macro LOAD_ARGS32 offset, _r9=0 + .if \_r9 movl \offset+16(%rsp),%r9d - movl \offset+24(%rsp),%r8d + .endif movl \offset+40(%rsp),%ecx movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi movl \offset+64(%rsp),%edi - movl \offset+72(%rsp),%eax .endm .macro CFI_STARTPROC32 simple @@ -131,27 +144,28 @@ ENTRY(ia32_sysenter_target) SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ -1: movl (%rbp),%r9d +1: movl (%rbp),%ebp .section __ex_table,"a" .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) orl $TS_COMPAT,TI_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys -sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys - IA32_ARG_FIXUP 1 +sysenter_do_call: + IA32_ARG_FIXUP +sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,TI_flags(%r10) - jnz int_ret_from_sys_call + jnz sysexit_audit +sysexit_from_sys_call: andl $~TS_COMPAT,TI_status(%r10) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) @@ -167,18 +181,69 @@ sysenter_do_call: TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT32 -sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r9d /* 6th arg: 4th syscall arg */ + movl %edx,%r8d /* 5th arg: 3rd syscall arg */ + /* (already in %ecx) 4th arg: 2nd syscall arg */ + movl %ebx,%edx /* 3rd arg: 1st syscall arg */ + movl %eax,%esi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit,ebpsave=RBP + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jnz int_ret_from_sys_call + TRACE_IRQS_ON + sti + movl %eax,%esi /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? 
*/ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + GET_THREAD_INFO(%r10) + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + cli + TRACE_IRQS_OFF + testl %edi,TI_flags(%r10) + jnz int_with_check + jmp \exit + .endm + +sysenter_auditsys: CFI_RESTORE_STATE - xchgl %r9d,%ebp + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jz sysenter_auditsys +#endif SAVE_REST CLEAR_RREGS - movq %r9,R9(%rsp) movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - xchgl %ebp,%r9d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call @@ -242,21 +307,22 @@ ENTRY(ia32_cstar_target) .previous GET_THREAD_INFO(%r10) orl $TS_COMPAT,TI_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys -cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys +cstar_do_call: IA32_ARG_FIXUP 1 +cstar_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,TI_flags(%r10) - jnz int_ret_from_sys_call + jnz sysretl_audit +sysretl_from_sys_call: andl $~TS_COMPAT,TI_status(%r10) RESTORE_ARGS 1,-ARG_SKIP,1,1,1 movl RIP-ARGOFFSET(%rsp),%ecx @@ -268,19 +334,32 @@ cstar_do_call: CFI_RESTORE rsp USERGS_SYSRET32 -cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: CFI_RESTORE_STATE + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */ +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jz cstar_auditsys +#endif xchgl %r9d,%ebp SAVE_REST - CLEAR_RREGS - movq %r9,R9(%rsp) + CLEAR_RREGS r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - movl RSP-ARGOFFSET(%rsp), %r8d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call @@ -321,6 +400,7 @@ ENTRY(ia32_syscall) /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP + PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS /* * No need to follow this irqs on/off section: the syscall @@ -336,8 +416,7 @@ ENTRY(ia32_syscall) SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) orl $TS_COMPAT,TI_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%r10) + testl 
$_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax @@ -492,8 +571,8 @@ ia32_sys_call_table: .quad compat_sys_setrlimit /* 75 */ .quad compat_sys_old_getrlimit /* old_getrlimit */ .quad compat_sys_getrusage - .quad sys32_gettimeofday - .quad sys32_settimeofday + .quad compat_sys_gettimeofday + .quad compat_sys_settimeofday .quad sys_getgroups16 /* 80 */ .quad sys_setgroups16 .quad sys32_old_select @@ -741,4 +820,10 @@ ia32_sys_call_table: .quad sys32_fallocate .quad compat_sys_timerfd_settime /* 325 */ .quad compat_sys_timerfd_gettime + .quad compat_sys_signalfd4 + .quad sys_eventfd2 + .quad sys_epoll_create1 + .quad sys_dup3 /* 330 */ + .quad sys_pipe2 + .quad sys_inotify_init1 ia32_syscall_end: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index f00afdf61e67..2e09dcd3c0a6 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -49,41 +49,6 @@ #define AA(__x) ((unsigned long)(__x)) -int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) -{ - compat_ino_t ino; - - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - SET_UID(uid, kbuf->uid); - SET_GID(gid, kbuf->gid); - if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) - return -EOVERFLOW; - if (kbuf->size >= 0x7fffffff) - return -EOVERFLOW; - ino = kbuf->ino; - if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) - return -EOVERFLOW; - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || - __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || - __put_user(ino, &ubuf->st_ino) || - __put_user(kbuf->mode, &ubuf->st_mode) || - __put_user(kbuf->nlink, &ubuf->st_nlink) || - __put_user(uid, &ubuf->st_uid) || - __put_user(gid, &ubuf->st_gid) || - __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || - __put_user(kbuf->size, &ubuf->st_size) || - __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || - __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || - __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || - __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user(kbuf->blksize, &ubuf->st_blksize) || - __put_user(kbuf->blocks, &ubuf->st_blocks)) - return -EFAULT; - return 0; -} asmlinkage long sys32_truncate64(char __user *filename, unsigned long offset_low, @@ -238,7 +203,7 @@ asmlinkage long sys32_pipe(int __user *fd) int retval; int fds[2]; - retval = do_pipe(fds); + retval = do_pipe_flags(fds, 0); if (retval) goto out; if (copy_to_user(fd, fds, sizeof(fds))) @@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, return 0; } -static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) -{ - int err = -EFAULT; - - if (access_ok(VERIFY_READ, i, sizeof(*i))) { - err = __get_user(o->tv_sec, &i->tv_sec); - err |= __get_user(o->tv_usec, &i->tv_usec); - } - return err; -} - -static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - int err = -EFAULT; - - if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { - err = __put_user(i->tv_sec, &o->tv_sec); - err |= __put_user(i->tv_usec, &o->tv_usec); - } - return err; -} - asmlinkage long sys32_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } -/* - * Translations due to time_t size differences. Which affects all - * sorts of things, like timeval and itimerval. 
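
The removed sys32_gettimeofday()/sys32_settimeofday() below are replaced by the generic compat_sys_* implementations, which perform the same widening that the deleted get_tv32()/put_tv32() helpers did. A stand-alone sketch of that translation, with illustrative struct names (not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Illustrative layouts: a 32-bit task's timeval vs. the kernel's native one. */
struct timeval32 {
	int32_t tv_sec;
	int32_t tv_usec;
};

struct timeval64 {
	int64_t tv_sec;
	int64_t tv_usec;
};

/* The whole "translation" is a sign-extending field-by-field copy. */
static void tv32_to_tv64(struct timeval64 *out, const struct timeval32 *in)
{
	out->tv_sec  = in->tv_sec;
	out->tv_usec = in->tv_usec;
}

int main(void)
{
	struct timeval32 tv32 = { .tv_sec = 1222222222, .tv_usec = 500000 };
	struct timeval64 tv64;

	tv32_to_tv64(&tv64, &tv32);
	printf("%lld.%06lld\n", (long long)tv64.tv_sec, (long long)tv64.tv_usec);
	return 0;
}
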
- */ -asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - struct timeval ktv; - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_tv32(&ktv, tv)) - return -EFAULT; - kts.tv_sec = ktv.tv_sec; - kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - struct sel_arg_struct { unsigned int n; unsigned int inp; @@ -556,15 +457,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig, return ret; } -/* These are here just in case some old ia32 binary calls it. */ -asmlinkage long sys32_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return -ERESTARTNOHAND; -} - - #ifdef CONFIG_SYSCTL_SYSCALL struct sysctl_ia32 { unsigned int name; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild new file mode 100644 index 000000000000..4a8e80cdcfa5 --- /dev/null +++ b/arch/x86/include/asm/Kbuild @@ -0,0 +1,24 @@ +include include/asm-generic/Kbuild.asm + +header-y += boot.h +header-y += bootparam.h +header-y += debugreg.h +header-y += ldt.h +header-y += msr-index.h +header-y += prctl.h +header-y += ptrace-abi.h +header-y += sigcontext32.h +header-y += ucontext.h +header-y += processor-flags.h + +unifdef-y += e820.h +unifdef-y += ist.h +unifdef-y += mce.h +unifdef-y += msr.h +unifdef-y += mtrr.h +unifdef-y += posix_types_32.h +unifdef-y += posix_types_64.h +unifdef-y += unistd_32.h +unifdef-y += unistd_64.h +unifdef-y += vm86.h +unifdef-y += vsyscall.h diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h new file mode 100644 index 000000000000..37822206083e --- /dev/null +++ b/arch/x86/include/asm/a.out-core.h @@ -0,0 +1,73 @@ +/* a.out coredump register dumper + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _ASM_X86_A_OUT_CORE_H +#define _ASM_X86_A_OUT_CORE_H + +#ifdef __KERNEL__ +#ifdef CONFIG_X86_32 + +#include <linux/user.h> +#include <linux/elfcore.h> + +/* + * fill in the user structure for an a.out core dump + */ +static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) +{ + u16 gs; + +/* changed the size calculations - should hopefully work better. 
lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1))) + >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_ssize = 0; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; + + if (dump->start_stack < TASK_SIZE) + dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) + >> PAGE_SHIFT; + + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; + dump->regs.ds = (u16)regs->ds; + dump->regs.es = (u16)regs->es; + dump->regs.fs = (u16)regs->fs; + savesegment(gs, gs); + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; + dump->regs.cs = (u16)regs->cs; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; + dump->regs.ss = (u16)regs->ss; + + dump->u_fpvalid = dump_fpu(regs, &dump->i387); +} + +#endif /* CONFIG_X86_32 */ +#endif /* __KERNEL__ */ +#endif /* _ASM_X86_A_OUT_CORE_H */ diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/asm/a.out.h new file mode 100644 index 000000000000..4684f97a5bbd --- /dev/null +++ b/arch/x86/include/asm/a.out.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_A_OUT_H +#define _ASM_X86_A_OUT_H + +struct exec +{ + unsigned int a_info; /* Use macros N_MAGIC, etc for access */ + unsigned a_text; /* length of text, in bytes */ + unsigned a_data; /* length of data, in bytes */ + unsigned a_bss; /* length of uninitialized data area for file, in bytes */ + unsigned a_syms; /* length of symbol table data in file, in bytes */ + unsigned a_entry; /* start address */ + unsigned a_trsize; /* length of relocation info for text, in bytes */ + unsigned a_drsize; /* length of relocation info for data, in bytes */ +}; + +#define N_TRSIZE(a) ((a).a_trsize) +#define N_DRSIZE(a) ((a).a_drsize) +#define N_SYMSIZE(a) ((a).a_syms) + +#endif /* _ASM_X86_A_OUT_H */ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h new file mode 100644 index 000000000000..8d676d8ecde9 --- /dev/null +++ b/arch/x86/include/asm/acpi.h @@ -0,0 +1,178 @@ +#ifndef _ASM_X86_ACPI_H +#define _ASM_X86_ACPI_H + +/* + * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#include <acpi/pdc_intel.h> + +#include <asm/numa.h> +#include <asm/processor.h> +#include <asm/mmu.h> +#include <asm/mpspec.h> + +#define COMPILER_DEPENDENT_INT64 long long +#define COMPILER_DEPENDENT_UINT64 unsigned long long + +/* + * Calling conventions: + * + * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) + * ACPI_EXTERNAL_XFACE - External ACPI interfaces + * ACPI_INTERNAL_XFACE - Internal ACPI interfaces + * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces + */ +#define ACPI_SYSTEM_XFACE +#define ACPI_EXTERNAL_XFACE +#define ACPI_INTERNAL_XFACE +#define ACPI_INTERNAL_VAR_XFACE + +/* Asm macros */ + +#define ACPI_ASM_MACROS +#define BREAKPOINT3 +#define ACPI_DISABLE_IRQS() local_irq_disable() +#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_FLUSH_CPU_CACHE() wbinvd() + +int __acpi_acquire_global_lock(unsigned int *lock); +int __acpi_release_global_lock(unsigned int *lock); + +#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_acquire_global_lock(&facs->global_lock)) + +#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_release_global_lock(&facs->global_lock)) + +/* + * Math helper asm macros + */ +#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \ + asm("divl %2;" \ + : "=a"(q32), "=d"(r32) \ + : "r"(d32), \ + "0"(n_lo), "1"(n_hi)) + + +#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \ + asm("shrl $1,%2 ;" \ + "rcrl $1,%3;" \ + : "=r"(n_hi), "=r"(n_lo) \ + : "0"(n_hi), "1"(n_lo)) + +#ifdef CONFIG_ACPI +extern int acpi_lapic; +extern int acpi_ioapic; +extern int acpi_noirq; +extern int acpi_strict; +extern int acpi_disabled; +extern int acpi_ht; +extern int acpi_pci_disabled; +extern int acpi_skip_timer_override; +extern int acpi_use_timer_override; + +extern u8 acpi_sci_flags; +extern int acpi_sci_override_gsi; +void acpi_pic_sci_set_trigger(unsigned int, u16); + +static inline void disable_acpi(void) +{ + acpi_disabled = 1; + acpi_ht = 0; + acpi_pci_disabled = 1; + acpi_noirq = 1; +} + +/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */ +#define FIX_ACPI_PAGES 4 + +extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); + +static inline void acpi_noirq_set(void) { acpi_noirq = 1; } +static inline void acpi_disable_pci(void) +{ + acpi_pci_disabled = 1; + acpi_noirq_set(); +} +extern int acpi_irq_balance_set(char *str); + +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern void acpi_restore_state_mem(void); + +extern unsigned long acpi_wakeup_address; + +/* early initialization routine */ +extern void acpi_reserve_bootmem(void); + +/* + * Check if the CPU can handle C2 and deeper + */ +static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) +{ + /* + * Early models (<=5) of AMD Opterons are not supposed to go into + * C2 state. 
+ * + * Steppings 0x0A and later are good + */ + if (boot_cpu_data.x86 == 0x0F && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86_model <= 0x05 && + boot_cpu_data.x86_mask < 0x0A) + return 1; + else if (boot_cpu_has(X86_FEATURE_AMDC1E)) + return 1; + else + return max_cstate; +} + +#else /* !CONFIG_ACPI */ + +#define acpi_lapic 0 +#define acpi_ioapic 0 +static inline void acpi_noirq_set(void) { } +static inline void acpi_disable_pci(void) { } +static inline void disable_acpi(void) { } + +#endif /* !CONFIG_ACPI */ + +#define ARCH_HAS_POWER_INIT 1 + +struct bootnode; + +#ifdef CONFIG_ACPI_NUMA +extern int acpi_numa; +extern int acpi_scan_nodes(unsigned long start, unsigned long end); +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +extern void acpi_fake_nodes(const struct bootnode *fake_nodes, + int num_nodes); +#else +static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, + int num_nodes) +{ +} +#endif + +#define acpi_unlazy_tlb(x) leave_mm(x) + +#endif /* _ASM_X86_ACPI_H */ diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h new file mode 100644 index 000000000000..9825cd64c9b6 --- /dev/null +++ b/arch/x86/include/asm/agp.h @@ -0,0 +1,35 @@ +#ifndef _ASM_X86_AGP_H +#define _ASM_X86_AGP_H + +#include <asm/pgtable.h> +#include <asm/cacheflush.h> + +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. + */ + +#define map_page_into_agp(page) set_pages_uc(page, 1) +#define unmap_page_from_agp(page) set_pages_wb(page, 1) + +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ +#define flush_agp_cache() wbinvd() + +/* Convert a physical address to an address suitable for the GART. */ +#define phys_to_gart(x) (x) +#define gart_to_phys(x) (x) + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) \ + ((char *)__get_free_pages(GFP_KERNEL, (order))) +#define free_gatt_pages(table, order) \ + free_pages((unsigned long)(table), (order)) + +#endif /* _ASM_X86_AGP_H */ diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h new file mode 100644 index 000000000000..e2077d343c33 --- /dev/null +++ b/arch/x86/include/asm/alternative-asm.h @@ -0,0 +1,22 @@ +#ifdef __ASSEMBLY__ + +#ifdef CONFIG_X86_32 +# define X86_ALIGN .long +#else +# define X86_ALIGN .quad +#endif + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +1: lock + .section .smp_locks,"a" + .align 4 + X86_ALIGN 1b + .previous + .endm +#else + .macro LOCK_PREFIX + .endm +#endif + +#endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h new file mode 100644 index 000000000000..f6aa18eadf71 --- /dev/null +++ b/arch/x86/include/asm/alternative.h @@ -0,0 +1,183 @@ +#ifndef _ASM_X86_ALTERNATIVE_H +#define _ASM_X86_ALTERNATIVE_H + +#include <linux/types.h> +#include <linux/stddef.h> +#include <asm/asm.h> + +/* + * Alternative inline assembly for SMP. + * + * The LOCK_PREFIX macro defined here replaces the LOCK and + * LOCK_PREFIX macros used everywhere in the source tree. 
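
As a usage sketch of the mechanism this header provides (the macro name my_mb is hypothetical, though the pattern mirrors how the 32-bit kernel defines its memory barrier): CPUs advertising SSE2 get mfence patched in by apply_alternatives() at boot, while older CPUs keep the serializing locked add.

/* Hypothetical example: X86_FEATURE_XMM2 (SSE2) selects "mfence";
 * older CPUs keep the serializing locked add as the fallback. */
#define my_mb()	alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
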
+ * + * SMP alternatives use the same data structures as the other + * alternatives and the X86_FEATURE_UP flag to indicate the case of a + * UP system running an SMP kernel. The existing apply_alternatives() + * works fine for patching an SMP kernel for UP. + * + * The SMP alternative tables can be kept after boot and contain both + * UP and SMP versions of the instructions to allow switching back to + * SMP at runtime, when hotplugging in a new CPU, which is especially + * useful in virtualized environments. + * + * The very common lock prefix is handled as a special case in a + * separate table which is a pure address list without replacement ptr + * and size information. That keeps the table sizes small. + */ + +#ifdef CONFIG_SMP +#define LOCK_PREFIX \ + ".section .smp_locks,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661f\n" /* address */ \ + ".previous\n" \ + "661:\n\tlock; " + +#else /* ! CONFIG_SMP */ +#define LOCK_PREFIX "" +#endif + +/* This must be included *after* the definition of LOCK_PREFIX */ +#include <asm/cpufeature.h> + +struct alt_instr { + u8 *instr; /* original instruction */ + u8 *replacement; + u8 cpuid; /* cpuid bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction, <= instrlen */ + u8 pad1; +#ifdef CONFIG_X86_64 + u32 pad2; +#endif +}; + +extern void alternative_instructions(void); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); + +struct module; + +#ifdef CONFIG_SMP +extern void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end); +extern void alternatives_smp_module_del(struct module *mod); +extern void alternatives_smp_switch(int smp); +#else +static inline void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end) {} +static inline void alternatives_smp_module_del(struct module *mod) {} +static inline void alternatives_smp_switch(int smp) {} +#endif /* CONFIG_SMP */ + +const unsigned char *const *find_nop_table(void); + +/* + * Alternative instructions for different CPU types or capabilities. + * + * This allows the use of optimized instructions even on generic binary + * kernels. + * + * The length of oldinstr must be greater than or equal to the length of + * newinstr; newinstr can be padded with nops as needed. + * + * For non-barrier-like inlines please define new variants + * without volatile and memory clobber. + */ +#define alternative(oldinstr, newinstr, feature) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c0\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" :: "i" (feature) : "memory") + +/* + * Alternative inline assembly with input. + * + * Peculiarities: + * No memory clobber here. + * Argument numbers start with 1. + * It is best to use constraints that are fixed size (like (%1) ... "r"). + * If you use variable-sized constraints like "m" or "g" in the + * replacement make sure to pad to the worst case length. + */ +#define alternative_input(oldinstr, newinstr, feature, input...)
\ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c0\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" :: "i" (feature), ##input) + +/* Like alternative_input, but with a single output argument */ +#define alternative_io(oldinstr, newinstr, feature, output, input...) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c[feat]\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" : output : [feat] "i" (feature), ##input) + +/* + * Use this macro if you need more than one output parameter + * in alternative_io. + */ +#define ASM_OUTPUT2(a, b) a, b + +struct paravirt_patch_site; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end); +#else +static inline void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{} +#define __parainstructions NULL +#define __parainstructions_end NULL +#endif + +extern void add_nops(void *insns, unsigned int len); + +/* + * Clear and restore the kernel write-protection flag on the local CPU. + * Allows the kernel to edit read-only pages. + * Side-effect: any interrupt handler running between save and restore will have + * the ability to write to read-only pages. + * + * Warning: + * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and + * no thread can be preempted in the instructions being modified (no iret to an + * invalid instruction possible) or if the instructions are changed from a + * consistent state to another consistent state atomically. + * More care must be taken when modifying code in the SMP case because of + * Intel's errata. + * On the local CPU you need to be protected against NMI or MCE handlers seeing an + * inconsistent instruction while you patch. + * The _early version expects the memory to already be RW. + */ + +extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + +#endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h new file mode 100644 index 000000000000..f712344329bc --- /dev/null +++ b/arch/x86/include/asm/amd_iommu.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_H +#define _ASM_X86_AMD_IOMMU_H + +#include <linux/irqreturn.h> + +#ifdef CONFIG_AMD_IOMMU +extern int amd_iommu_init(void); +extern int amd_iommu_init_dma_ops(void); +extern void amd_iommu_detect(void); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +#else +static inline int amd_iommu_init(void) { return -ENODEV; } +static inline void amd_iommu_detect(void) { } +#endif + +#endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h new file mode 100644 index 000000000000..1a30c0440c6b --- /dev/null +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -0,0 +1,404 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_TYPES_H +#define _ASM_X86_AMD_IOMMU_TYPES_H + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +/* + * some size calculation constants + */ +#define DEV_TABLE_ENTRY_SIZE 32 +#define ALIAS_TABLE_ENTRY_SIZE 2 +#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *)) + +/* Length of the MMIO region for the AMD IOMMU */ +#define MMIO_REGION_LENGTH 0x4000 + +/* Capability offsets used by the driver */ +#define MMIO_CAP_HDR_OFFSET 0x00 +#define MMIO_RANGE_OFFSET 0x0c +#define MMIO_MISC_OFFSET 0x10 + +/* Masks, shifts and macros to parse the device range capability */ +#define MMIO_RANGE_LD_MASK 0xff000000 +#define MMIO_RANGE_FD_MASK 0x00ff0000 +#define MMIO_RANGE_BUS_MASK 0x0000ff00 +#define MMIO_RANGE_LD_SHIFT 24 +#define MMIO_RANGE_FD_SHIFT 16 +#define MMIO_RANGE_BUS_SHIFT 8 +#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT) +#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT) +#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT) +#define MMIO_MSI_NUM(x) ((x) & 0x1f) + +/* Flag masks for the AMD IOMMU exclusion range */ +#define MMIO_EXCL_ENABLE_MASK 0x01ULL +#define MMIO_EXCL_ALLOW_MASK 0x02ULL + +/* Used offsets into the MMIO space */ +#define MMIO_DEV_TABLE_OFFSET 0x0000 +#define MMIO_CMD_BUF_OFFSET 0x0008 +#define MMIO_EVT_BUF_OFFSET 0x0010 +#define MMIO_CONTROL_OFFSET 0x0018 +#define MMIO_EXCL_BASE_OFFSET 0x0020 +#define MMIO_EXCL_LIMIT_OFFSET 0x0028 +#define MMIO_CMD_HEAD_OFFSET 0x2000 +#define MMIO_CMD_TAIL_OFFSET 0x2008 +#define MMIO_EVT_HEAD_OFFSET 0x2010 +#define MMIO_EVT_TAIL_OFFSET 0x2018 +#define MMIO_STATUS_OFFSET 0x2020 + +/* MMIO status bits */ +#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 + +/* event logging constants */ +#define EVENT_ENTRY_SIZE 0x10 +#define EVENT_TYPE_SHIFT 
28 +#define EVENT_TYPE_MASK 0xf +#define EVENT_TYPE_ILL_DEV 0x1 +#define EVENT_TYPE_IO_FAULT 0x2 +#define EVENT_TYPE_DEV_TAB_ERR 0x3 +#define EVENT_TYPE_PAGE_TAB_ERR 0x4 +#define EVENT_TYPE_ILL_CMD 0x5 +#define EVENT_TYPE_CMD_HARD_ERR 0x6 +#define EVENT_TYPE_IOTLB_INV_TO 0x7 +#define EVENT_TYPE_INV_DEV_REQ 0x8 +#define EVENT_DEVID_MASK 0xffff +#define EVENT_DEVID_SHIFT 0 +#define EVENT_DOMID_MASK 0xffff +#define EVENT_DOMID_SHIFT 0 +#define EVENT_FLAGS_MASK 0xfff +#define EVENT_FLAGS_SHIFT 0x10 + +/* feature control bits */ +#define CONTROL_IOMMU_EN 0x00ULL +#define CONTROL_HT_TUN_EN 0x01ULL +#define CONTROL_EVT_LOG_EN 0x02ULL +#define CONTROL_EVT_INT_EN 0x03ULL +#define CONTROL_COMWAIT_EN 0x04ULL +#define CONTROL_PASSPW_EN 0x08ULL +#define CONTROL_RESPASSPW_EN 0x09ULL +#define CONTROL_COHERENT_EN 0x0aULL +#define CONTROL_ISOC_EN 0x0bULL +#define CONTROL_CMDBUF_EN 0x0cULL +#define CONTROL_PPFLOG_EN 0x0dULL +#define CONTROL_PPFINT_EN 0x0eULL + +/* command specific defines */ +#define CMD_COMPL_WAIT 0x01 +#define CMD_INV_DEV_ENTRY 0x02 +#define CMD_INV_IOMMU_PAGES 0x03 + +#define CMD_COMPL_WAIT_STORE_MASK 0x01 +#define CMD_COMPL_WAIT_INT_MASK 0x02 +#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 +#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 + +#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL + +/* macros and definitions for device table entries */ +#define DEV_ENTRY_VALID 0x00 +#define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_IR 0x3d +#define DEV_ENTRY_IW 0x3e +#define DEV_ENTRY_NO_PAGE_FAULT 0x62 +#define DEV_ENTRY_EX 0x67 +#define DEV_ENTRY_SYSMGT1 0x68 +#define DEV_ENTRY_SYSMGT2 0x69 +#define DEV_ENTRY_INIT_PASS 0xb8 +#define DEV_ENTRY_EINT_PASS 0xb9 +#define DEV_ENTRY_NMI_PASS 0xba +#define DEV_ENTRY_LINT0_PASS 0xbe +#define DEV_ENTRY_LINT1_PASS 0xbf +#define DEV_ENTRY_MODE_MASK 0x07 +#define DEV_ENTRY_MODE_SHIFT 0x09 + +/* constants to configure the command buffer */ +#define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_ENTRIES 512 +#define MMIO_CMD_SIZE_SHIFT 56 +#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) + +/* constants for event buffer handling */ +#define EVT_BUFFER_SIZE 8192 /* 512 entries */ +#define EVT_LEN_MASK (0x9ULL << 56) + +#define PAGE_MODE_1_LEVEL 0x01 +#define PAGE_MODE_2_LEVEL 0x02 +#define PAGE_MODE_3_LEVEL 0x03 + +#define IOMMU_PDE_NL_0 0x000ULL +#define IOMMU_PDE_NL_1 0x200ULL +#define IOMMU_PDE_NL_2 0x400ULL +#define IOMMU_PDE_NL_3 0x600ULL + +#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) +#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) +#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) + +#define IOMMU_MAP_SIZE_L1 (1ULL << 21) +#define IOMMU_MAP_SIZE_L2 (1ULL << 30) +#define IOMMU_MAP_SIZE_L3 (1ULL << 39) + +#define IOMMU_PTE_P (1ULL << 0) +#define IOMMU_PTE_TV (1ULL << 1) +#define IOMMU_PTE_U (1ULL << 59) +#define IOMMU_PTE_FC (1ULL << 60) +#define IOMMU_PTE_IR (1ULL << 61) +#define IOMMU_PTE_IW (1ULL << 62) + +#define IOMMU_L1_PDE(address) \ + ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) +#define IOMMU_L2_PDE(address) \ + ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) + +#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) +#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) +#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) +#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) + +#define IOMMU_PROT_MASK 0x03 +#define IOMMU_PROT_IR 0x01 +#define IOMMU_PROT_IW 0x02 + +/* IOMMU capabilities */ +#define 
IOMMU_CAP_IOTLB 24 +#define IOMMU_CAP_NPCACHE 26 + +#define MAX_DOMAIN_ID 65536 + +/* FIXME: move this macro to <linux/pci.h> */ +#define PCI_BUS(x) (((x) >> 8) & 0xff) + +/* + * This structure contains generic data for IOMMU protection domains + * independent of their use. + */ +struct protection_domain { + spinlock_t lock; /* mostly used to lock the page table*/ + u16 id; /* the domain id written to the device table */ + int mode; /* paging mode (0-6 levels) */ + u64 *pt_root; /* page table root pointer */ + void *priv; /* private data */ +}; + +/* + * Data container for a dma_ops specific protection domain + */ +struct dma_ops_domain { + struct list_head list; + + /* generic protection domain information */ + struct protection_domain domain; + + /* size of the aperture for the mappings */ + unsigned long aperture_size; + + /* address we start to search for free addresses */ + unsigned long next_bit; + + /* address allocation bitmap */ + unsigned long *bitmap; + + /* + * Array of PTE pages for the aperture. In this array we save all the + * leaf pages of the domain page table used for the aperture. This way + * we don't need to walk the page table to find a specific PTE. We can + * just calculate its address in constant time. + */ + u64 **pte_pages; + + /* This will be set to true when TLB needs to be flushed */ + bool need_flush; + + /* + * if this is a preallocated domain, keep the device for which it was + * preallocated in this variable + */ + u16 target_dev; +}; + +/* + * Structure where we save information about one hardware AMD IOMMU in the + * system. + */ +struct amd_iommu { + struct list_head list; + + /* locks the accesses to the hardware */ + spinlock_t lock; + + /* Pointer to PCI device of this IOMMU */ + struct pci_dev *dev; + + /* + * Capability pointer. There could be more than one IOMMU per PCI + * device function if there are more than one AMD IOMMU capability + * pointers. + */ + u16 cap_ptr; + + /* physical address of MMIO space */ + u64 mmio_phys; + /* virtual address of MMIO space */ + u8 *mmio_base; + + /* capabilities of that IOMMU read from ACPI */ + u32 cap; + + /* pci domain of this IOMMU */ + u16 pci_seg; + + /* first device this IOMMU handles. read from PCI */ + u16 first_device; + /* last device this IOMMU handles. read from PCI */ + u16 last_device; + + /* start of exclusion range of that IOMMU */ + u64 exclusion_start; + /* length of exclusion range of that IOMMU */ + u64 exclusion_length; + + /* command buffer virtual address */ + u8 *cmd_buf; + /* size of command buffer */ + u32 cmd_buf_size; + + /* event buffer virtual address */ + u8 *evt_buf; + /* size of event buffer */ + u32 evt_buf_size; + /* MSI number for event interrupt */ + u16 evt_msi_num; + + /* if one, we need to send a completion wait command */ + int need_sync; + + /* true if interrupts for this IOMMU are already enabled */ + bool int_enabled; + + /* default dma_ops domain for that IOMMU */ + struct dma_ops_domain *default_dom; +}; + +/* + * List with all IOMMUs in the system. This list is not locked because it is + * only written and read at driver initialization or suspend time + */ +extern struct list_head amd_iommu_list; + +/* + * Structure defining one entry in the device table + */ +struct dev_table_entry { + u32 data[8]; +}; + +/* + * One entry for unity mappings parsed out of the ACPI table. 
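
The pte_pages array above is what makes aperture lookups constant-time; a hypothetical helper in the spirit of the dma_ops code, reusing the IOMMU_PTE_L*_INDEX() macros defined earlier, would look roughly like this:

/* Hypothetical helper: fetch the PTE for an aperture address in constant
 * time by indexing the saved leaf page directly, instead of walking the
 * page table from pt_root. */
static u64 *aperture_fetch_pte(struct dma_ops_domain *dom, unsigned long address)
{
	u64 *pte_page = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];

	if (!pte_page)
		return NULL;                           /* leaf page not allocated */
	return &pte_page[IOMMU_PTE_L0_INDEX(address)];
}
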
+ */ +struct unity_map_entry { + struct list_head list; + + /* starting device id this entry is used for (including) */ + u16 devid_start; + /* end device id this entry is used for (including) */ + u16 devid_end; + + /* start address to unity map (including) */ + u64 address_start; + /* end address to unity map (including) */ + u64 address_end; + + /* required protection */ + int prot; +}; + +/* + * List of all unity mappings. It is not locked because at runtime it is only + * read. It is created at ACPI table parsing time. + */ +extern struct list_head amd_iommu_unity_map; + +/* + * Data structures for device handling + */ + +/* + * Device table used by hardware. Read and write accesses by software are + * locked with the amd_iommu_pd_table lock. + */ +extern struct dev_table_entry *amd_iommu_dev_table; + +/* + * Alias table mapping requestor ids to device ids. Not locked because it is + * only read at runtime. + */ +extern u16 *amd_iommu_alias_table; + +/* + * Reverse lookup table to find the IOMMU which translates a specific device. + */ +extern struct amd_iommu **amd_iommu_rlookup_table; + +/* size of the dma_ops aperture as a power of 2 */ +extern unsigned amd_iommu_aperture_order; + +/* largest PCI device id we expect translation requests for */ +extern u16 amd_iommu_last_bdf; + +/* data structures for protection domain handling */ +extern struct protection_domain **amd_iommu_pd_table; + +/* allocation bitmap for domain ids */ +extern unsigned long *amd_iommu_pd_alloc_bitmap; + +/* will be 1 if device isolation is enabled */ +extern int amd_iommu_isolate; + +/* + * If true, the addresses will be flushed at unmap time, not when + * they are reused + */ +extern bool amd_iommu_unmap_flush; + +/* takes a PCI device id and prints it out in a readable form */ +static inline void print_devid(u16 devid, int nl) +{ + int bus = devid >> 8; + int dev = devid >> 3 & 0x1f; + int fn = devid & 0x07; + + printk("%02x:%02x.%x", bus, dev, fn); + if (nl) + printk("\n"); +} + +/* takes bus and device/function and returns the device id + * FIXME: should that be in generic PCI code? */ +static inline u16 calc_devid(u8 bus, u8 devfn) +{ + return (((u16)bus) << 8) | devfn; +} + +#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h new file mode 100644 index 000000000000..3b1510b4fc57 --- /dev/null +++ b/arch/x86/include/asm/apic.h @@ -0,0 +1,199 @@ +#ifndef _ASM_X86_APIC_H +#define _ASM_X86_APIC_H + +#include <linux/pm.h> +#include <linux/delay.h> + +#include <asm/alternative.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/cpufeature.h> +#include <asm/msr.h> + +#define ARCH_APICTIMER_STOPS_ON_C3 1 + +/* + * Debugging macros + */ +#define APIC_QUIET 0 +#define APIC_VERBOSE 1 +#define APIC_DEBUG 2 + +/* + * Define the default level of output to be very little. + * This can be turned up by using apic=verbose for more + * information and apic=debug for _lots_ of information. + * apic_verbosity is defined in apic.c + */ +#define apic_printk(v, s, a...) do { \ + if ((v) <= apic_verbosity) \ + printk(s, ##a); \ + } while (0) + + +extern void generic_apic_probe(void); + +#ifdef CONFIG_X86_LOCAL_APIC + +extern unsigned int apic_verbosity; +extern int local_apic_timer_c2_ok; + +extern int disable_apic; +/* + * Basic functions accessing APICs.
+ */ +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define setup_boot_clock setup_boot_APIC_clock +#define setup_secondary_clock setup_secondary_APIC_clock +#endif + +extern int is_vsmp_box(void); +extern void xapic_wait_icr_idle(void); +extern u32 safe_xapic_wait_icr_idle(void); +extern u64 xapic_icr_read(void); +extern void xapic_icr_write(u32, u32); +extern int setup_profiling_timer(unsigned int); + +static inline void native_apic_mem_write(u32 reg, u32 v) +{ + volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); + + alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP, + ASM_OUTPUT2("=r" (v), "=m" (*addr)), + ASM_OUTPUT2("0" (v), "m" (*addr))); +} + +static inline u32 native_apic_mem_read(u32 reg) +{ + return *((volatile u32 *)(APIC_BASE + reg)); +} + +static inline void native_apic_msr_write(u32 reg, u32 v) +{ + if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || + reg == APIC_LVR) + return; + + wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); +} + +static inline u32 native_apic_msr_read(u32 reg) +{ + u32 low, high; + + if (reg == APIC_DFR) + return -1; + + rdmsr(APIC_BASE_MSR + (reg >> 4), low, high); + return low; +} + +#ifndef CONFIG_X86_32 +extern int x2apic, x2apic_preenabled; +extern void check_x2apic(void); +extern void enable_x2apic(void); +extern void enable_IR_x2apic(void); +extern void x2apic_icr_write(u32 low, u32 id); +static inline int x2apic_enabled(void) +{ + int msr, msr2; + + if (!cpu_has_x2apic) + return 0; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (msr & X2APIC_ENABLE) + return 1; + return 0; +} +#else +#define x2apic_enabled() 0 +#endif + +struct apic_ops { + u32 (*read)(u32 reg); + void (*write)(u32 reg, u32 v); + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); + void (*wait_icr_idle)(void); + u32 (*safe_wait_icr_idle)(void); +}; + +extern struct apic_ops *apic_ops; + +#define apic_read (apic_ops->read) +#define apic_write (apic_ops->write) +#define apic_icr_read (apic_ops->icr_read) +#define apic_icr_write (apic_ops->icr_write) +#define apic_wait_icr_idle (apic_ops->wait_icr_idle) +#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle) + +extern int get_physical_broadcast(void); + +#ifdef CONFIG_X86_64 +static inline void ack_x2APIC_irq(void) +{ + /* Docs say use 0 for future compatibility */ + native_apic_msr_write(APIC_EOI, 0); +} +#endif + + +static inline void ack_APIC_irq(void) +{ + /* + * ack_APIC_irq() actually gets compiled as a single instruction + * ... yummie. 
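
A sketch of how the apic_ops indirection above is meant to be populated, wiring it to the memory-mapped xAPIC accessors; the kernel keeps its own instance in apic.c, so this initializer is purely illustrative:

/* Illustrative wiring of the ops table to the xAPIC MMIO accessors. */
static struct apic_ops xapic_ops = {
	.read			= native_apic_mem_read,
	.write			= native_apic_mem_write,
	.icr_read		= xapic_icr_read,
	.icr_write		= xapic_icr_write,
	.wait_icr_idle		= xapic_wait_icr_idle,
	.safe_wait_icr_idle	= safe_xapic_wait_icr_idle,
};

struct apic_ops *apic_ops = &xapic_ops;

With the table in place, every apic_read()/apic_write() call site picks up xAPIC or x2APIC behaviour without further #ifdefs.
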
+ */ + + /* Docs say use 0 for future compatibility */ + apic_write(APIC_EOI, 0); +} + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void connect_bsp_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern int verify_local_APIC(void); +extern void cache_APIC_registers(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void end_local_APIC_setup(void); +extern void init_apic_mappings(void); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); +extern void enable_NMI_through_LVT0(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern void early_init_lapic_mapping(void); +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); +extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); + + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } + +#endif /* !CONFIG_X86_LOCAL_APIC */ + +#endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h new file mode 100644 index 000000000000..63134e31e8b9 --- /dev/null +++ b/arch/x86/include/asm/apicdef.h @@ -0,0 +1,417 @@ +#ifndef _ASM_X86_APICDEF_H +#define _ASM_X86_APICDEF_H + +/* + * Constants for various Intel APICs. (local APIC, IOAPIC, etc.) + * + * Alan Cox <Alan.Cox@linux.org>, 1995. + * Ingo Molnar <mingo@redhat.com>, 1999, 2000 + */ + +#define APIC_DEFAULT_PHYS_BASE 0xfee00000 + +#define APIC_ID 0x20 + +#define APIC_LVR 0x30 +#define APIC_LVR_MASK 0xFF00FF +#define GET_APIC_VERSION(x) ((x) & 0xFFu) +#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) +#ifdef CONFIG_X86_32 +# define APIC_INTEGRATED(x) ((x) & 0xF0u) +#else +# define APIC_INTEGRATED(x) (1) +#endif +#define APIC_XAPIC(x) ((x) >= 0x14) +#define APIC_TASKPRI 0x80 +#define APIC_TPRI_MASK 0xFFu +#define APIC_ARBPRI 0x90 +#define APIC_ARBPRI_MASK 0xFFu +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_EIO_ACK 0x0 +#define APIC_RRR 0xC0 +#define APIC_LDR 0xD0 +#define APIC_LDR_MASK (0xFFu << 24) +#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu) +#define SET_APIC_LOGICAL_ID(x) (((x) << 24)) +#define APIC_ALL_CPUS 0xFFu +#define APIC_DFR 0xE0 +#define APIC_DFR_CLUSTER 0x0FFFFFFFul +#define APIC_DFR_FLAT 0xFFFFFFFFul +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ISR 0x100 +#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. 
*/ +#define APIC_TMR 0x180 +#define APIC_IRR 0x200 +#define APIC_ESR 0x280 +#define APIC_ESR_SEND_CS 0x00001 +#define APIC_ESR_RECV_CS 0x00002 +#define APIC_ESR_SEND_ACC 0x00004 +#define APIC_ESR_RECV_ACC 0x00008 +#define APIC_ESR_SENDILL 0x00020 +#define APIC_ESR_RECVILL 0x00040 +#define APIC_ESR_ILLREGA 0x00080 +#define APIC_ICR 0x300 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define APIC_LVTT 0x320 +#define APIC_LVTTHMR 0x330 +#define APIC_LVTPC 0x340 +#define APIC_LVT0 0x350 +#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) +#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) +#define SET_APIC_TIMER_BASE(x) (((x) << 18)) +#define APIC_TIMER_BASE_CLKIN 0x0 +#define APIC_TIMER_BASE_TMBASE 0x1 +#define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_MASKED (1 << 16) +#define APIC_LVT_LEVEL_TRIGGER (1 << 15) +#define APIC_LVT_REMOTE_IRR (1 << 14) +#define APIC_INPUT_POLARITY (1 << 13) +#define APIC_SEND_PENDING (1 << 12) +#define APIC_MODE_MASK 0x700 +#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7) +#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8)) +#define APIC_MODE_FIXED 0x0 +#define APIC_MODE_NMI 0x4 +#define APIC_MODE_EXTINT 0x7 +#define APIC_LVT1 0x360 +#define APIC_LVTERR 0x370 +#define APIC_TMICT 0x380 +#define APIC_TMCCT 0x390 +#define APIC_TDCR 0x3E0 +#define APIC_SELF_IPI 0x3F0 +#define APIC_TDR_DIV_TMBASE (1 << 2) +#define APIC_TDR_DIV_1 0xB +#define APIC_TDR_DIV_2 0x0 +#define APIC_TDR_DIV_4 0x1 +#define APIC_TDR_DIV_8 0x2 +#define APIC_TDR_DIV_16 0x3 +#define APIC_TDR_DIV_32 0x8 +#define APIC_TDR_DIV_64 0x9 +#define APIC_TDR_DIV_128 0xA +#define APIC_EILVT0 0x500 +#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ +#define APIC_EILVT_NR_AMD_10H 4 +#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) +#define APIC_EILVT_MSG_FIX 0x0 +#define APIC_EILVT_MSG_SMI 0x2 +#define APIC_EILVT_MSG_NMI 0x4 +#define APIC_EILVT_MSG_EXT 0x7 +#define APIC_EILVT_MASKED (1 << 16) +#define APIC_EILVT1 0x510 +#define APIC_EILVT2 0x520 +#define APIC_EILVT3 0x530 + +#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) + +#ifdef CONFIG_X86_32 +# define MAX_IO_APICS 64 +#else +# define MAX_IO_APICS 128 +# define MAX_LOCAL_APIC 32768 +#endif + +/* + * All x86-64 systems are xAPIC compatible. + * In the following, "apicid" is a physical APIC ID. 
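
A sketch of how the ICR field constants above compose into an IPI command; both helpers are hypothetical and only demonstrate the masking, not any particular accessor's semantics:

/* Hypothetical helpers: compose the two ICR words for a fixed-delivery
 * IPI to one physical APIC id, using the field constants above. */
static inline unsigned int ipi_icr_low(unsigned int vector)
{
	return APIC_DEST_PHYSICAL | APIC_DM_FIXED | (vector & APIC_VECTOR_MASK);
}

static inline unsigned int ipi_icr2(unsigned int apicid)
{
	return SET_APIC_DEST_FIELD(apicid);	/* destination is the top byte of ICR2 */
}
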
+ */ +#define XAPIC_DEST_CPUS_SHIFT 4 +#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) +#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) +#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK) +#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT) +#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) +#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) + +/* + * the local APIC register structure, memory mapped. Not terribly well + * tested, but we might eventually use this one in the future - the + * problem why we cannot use it right now is the P5 APIC, it has an + * errata which cannot take 8-bit reads and writes, only 32-bit ones ... + */ +#define u32 unsigned int + +struct local_apic { + +/*000*/ struct { u32 __reserved[4]; } __reserved_01; + +/*010*/ struct { u32 __reserved[4]; } __reserved_02; + +/*020*/ struct { /* APIC ID Register */ + u32 __reserved_1 : 24, + phys_apic_id : 4, + __reserved_2 : 4; + u32 __reserved[3]; + } id; + +/*030*/ const + struct { /* APIC Version Register */ + u32 version : 8, + __reserved_1 : 8, + max_lvt : 8, + __reserved_2 : 8; + u32 __reserved[3]; + } version; + +/*040*/ struct { u32 __reserved[4]; } __reserved_03; + +/*050*/ struct { u32 __reserved[4]; } __reserved_04; + +/*060*/ struct { u32 __reserved[4]; } __reserved_05; + +/*070*/ struct { u32 __reserved[4]; } __reserved_06; + +/*080*/ struct { /* Task Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } tpr; + +/*090*/ const + struct { /* Arbitration Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } apr; + +/*0A0*/ const + struct { /* Processor Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } ppr; + +/*0B0*/ struct { /* End Of Interrupt Register */ + u32 eoi; + u32 __reserved[3]; + } eoi; + +/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; + +/*0D0*/ struct { /* Logical Destination Register */ + u32 __reserved_1 : 24, + logical_dest : 8; + u32 __reserved_2[3]; + } ldr; + +/*0E0*/ struct { /* Destination Format Register */ + u32 __reserved_1 : 28, + model : 4; + u32 __reserved_2[3]; + } dfr; + +/*0F0*/ struct { /* Spurious Interrupt Vector Register */ + u32 spurious_vector : 8, + apic_enabled : 1, + focus_cpu : 1, + __reserved_2 : 22; + u32 __reserved_3[3]; + } svr; + +/*100*/ struct { /* In Service Register */ +/*170*/ u32 bitfield; + u32 __reserved[3]; + } isr [8]; + +/*180*/ struct { /* Trigger Mode Register */ +/*1F0*/ u32 bitfield; + u32 __reserved[3]; + } tmr [8]; + +/*200*/ struct { /* Interrupt Request Register */ +/*270*/ u32 bitfield; + u32 __reserved[3]; + } irr [8]; + +/*280*/ union { /* Error Status Register */ + struct { + u32 send_cs_error : 1, + receive_cs_error : 1, + send_accept_error : 1, + receive_accept_error : 1, + __reserved_1 : 1, + send_illegal_vector : 1, + receive_illegal_vector : 1, + illegal_register_address : 1, + __reserved_2 : 24; + u32 __reserved_3[3]; + } error_bits; + struct { + u32 errors; + u32 __reserved_3[3]; + } all_errors; + } esr; + +/*290*/ struct { u32 __reserved[4]; } __reserved_08; + +/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; + +/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; + +/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; + +/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; + +/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; + +/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; + +/*300*/ struct { 
/* Interrupt Command Register 1 */ + u32 vector : 8, + delivery_mode : 3, + destination_mode : 1, + delivery_status : 1, + __reserved_1 : 1, + level : 1, + trigger : 1, + __reserved_2 : 2, + shorthand : 2, + __reserved_3 : 12; + u32 __reserved_4[3]; + } icr1; + +/*310*/ struct { /* Interrupt Command Register 2 */ + union { + u32 __reserved_1 : 24, + phys_dest : 4, + __reserved_2 : 4; + u32 __reserved_3 : 24, + logical_dest : 8; + } dest; + u32 __reserved_4[3]; + } icr2; + +/*320*/ struct { /* LVT - Timer */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + timer_mode : 1, + __reserved_3 : 14; + u32 __reserved_4[3]; + } lvt_timer; + +/*330*/ struct { /* LVT - Thermal Sensor */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_thermal; + +/*340*/ struct { /* LVT - Performance Counter */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_pc; + +/*350*/ struct { /* LVT - LINT0 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint0; + +/*360*/ struct { /* LVT - LINT1 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint1; + +/*370*/ struct { /* LVT - Error */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_error; + +/*380*/ struct { /* Timer Initial Count Register */ + u32 initial_count; + u32 __reserved_2[3]; + } timer_icr; + +/*390*/ const + struct { /* Timer Current Count Register */ + u32 curr_count; + u32 __reserved_2[3]; + } timer_ccr; + +/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; + +/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; + +/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; + +/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; + +/*3E0*/ struct { /* Timer Divide Configuration Register */ + u32 divisor : 4, + __reserved_1 : 28; + u32 __reserved_2[3]; + } timer_dcr; + +/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; + +} __attribute__ ((packed)); + +#undef u32 + +#ifdef CONFIG_X86_32 + #define BAD_APICID 0xFFu +#else + #define BAD_APICID 0xFFFFu +#endif +#endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h new file mode 100644 index 000000000000..cbd4957838a6 --- /dev/null +++ b/arch/x86/include/asm/arch_hooks.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_ARCH_HOOKS_H +#define _ASM_X86_ARCH_HOOKS_H + +#include <linux/interrupt.h> + +/* + * linux/include/asm/arch_hooks.h + * + * define the architecture specific hooks + */ + +/* these aren't arch hooks, they are generic routines + * that can be used by the hooks */ +extern void init_ISA_irqs(void); +extern irqreturn_t timer_interrupt(int irq, void *dev_id); + +/* these are the defined hooks */ +extern void intr_init_hook(void); +extern void pre_intr_init_hook(void); +extern void pre_setup_arch_hook(void); +extern void trap_init_hook(void); +extern void pre_time_init_hook(void); +extern void time_init_hook(void); +extern void mca_nmi_hook(void); + +#endif /* _ASM_X86_ARCH_HOOKS_H */ diff --git 
a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h new file mode 100644 index 000000000000..56be78f582f0 --- /dev/null +++ b/arch/x86/include/asm/asm.h @@ -0,0 +1,47 @@ +#ifndef _ASM_X86_ASM_H +#define _ASM_X86_ASM_H + +#ifdef __ASSEMBLY__ +# define __ASM_FORM(x) x +# define __ASM_EX_SEC .section __ex_table +#else +# define __ASM_FORM(x) " " #x " " +# define __ASM_EX_SEC " .section __ex_table,\"a\"\n" +#endif + +#ifdef CONFIG_X86_32 +# define __ASM_SEL(a,b) __ASM_FORM(a) +#else +# define __ASM_SEL(a,b) __ASM_FORM(b) +#endif + +#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q) +#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) + +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) + +#define _ASM_MOV __ASM_SIZE(mov) +#define _ASM_INC __ASM_SIZE(inc) +#define _ASM_DEC __ASM_SIZE(dec) +#define _ASM_ADD __ASM_SIZE(add) +#define _ASM_SUB __ASM_SIZE(sub) +#define _ASM_XADD __ASM_SIZE(xadd) + +#define _ASM_AX __ASM_REG(ax) +#define _ASM_BX __ASM_REG(bx) +#define _ASM_CX __ASM_REG(cx) +#define _ASM_DX __ASM_REG(dx) +#define _ASM_SP __ASM_REG(sp) +#define _ASM_BP __ASM_REG(bp) +#define _ASM_SI __ASM_REG(si) +#define _ASM_DI __ASM_REG(di) + +/* Exception table entry */ +# define _ASM_EXTABLE(from,to) \ + __ASM_EX_SEC \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from "," #to "\n" \ + " .previous\n" + +#endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h new file mode 100644 index 000000000000..4e1b8873c474 --- /dev/null +++ b/arch/x86/include/asm/atomic.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "atomic_32.h" +#else +# include "atomic_64.h" +#endif diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h new file mode 100644 index 000000000000..ad5b9f6ecddf --- /dev/null +++ b/arch/x86/include/asm/atomic_32.h @@ -0,0 +1,259 @@ +#ifndef _ASM_X86_ATOMIC_32_H +#define _ASM_X86_ATOMIC_32_H + +#include <linux/compiler.h> +#include <asm/processor.h> +#include <asm/cmpxchg.h> + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. + */ +typedef struct { + int counter; +} atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +#define atomic_read(v) ((v)->counter) + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic_set(v, i) (((v)->counter) = (i)) + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub - subtract integer from atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. 
+ */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @v: pointer of type atomic_t + * @i: integer value to add + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. 
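+ * + * For instance (a sketch; sem_count and the slow path are illustrative names), a semaphore style down() fast path can be written as: + * + * if (atomic_add_negative(-1, &sem_count)) + * enter_slow_path(); (the count dropped below zero)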
+ */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_add_return - add integer and return + * @v: pointer of type atomic_t + * @i: integer value to add + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if (unlikely(boot_cpu_data.x86 <= 3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + local_irq_restore(flags); + return i + __i; +#endif +} + +/** + * atomic_sub_return - subtract integer and return + * @v: pointer of type atomic_t + * @i: integer value to subtract + * + * Atomically subtracts @i from @v and returns @v - @i + */ +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) + +/** + * atomic_add_unless - add unless the number is already a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as @v was not already @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" (mask), "m" (*(addr)) : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + +#include <asm-generic/atomic.h> +#endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h new file mode 100644 index 000000000000..279d2a731f3f --- /dev/null +++ b/arch/x86/include/asm/atomic_64.h @@ -0,0 +1,473 @@ +#ifndef _ASM_X86_ATOMIC_64_H +#define _ASM_X86_ATOMIC_64_H + +#include <asm/alternative.h> +#include <asm/cmpxchg.h> + +/* atomic_t should be 32 bit signed type */ + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. 
+ */ +typedef struct { + int counter; +} atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +#define atomic_read(v) ((v)->counter) + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic_set(v, i) (((v)->counter) = (i)) + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); +} + +/** + * atomic_sub - subtract the atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. 
+ */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic_add_return - add and return + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +/* A 64-bit atomic type */ + +typedef struct { + long counter; +} atomic64_t; + +#define ATOMIC64_INIT(i) { (i) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. + */ +#define atomic64_read(v) ((v)->counter) + +/** + * atomic64_set - set atomic64 variable + * @v: pointer to type atomic64_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic64_set(v, i) (((v)->counter) = (i)) + +/** + * atomic64_add - add integer to atomic64 variable + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v. + */ +static inline void atomic64_add(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "addq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub - subtract integer from atomic64 variable + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic64_sub(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "subq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_sub_and_test(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_inc - increment atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1. + */ +static inline void atomic64_inc(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "incq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic64_dec(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "decq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec_and_test - decrement and test + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases.
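+ * + * The canonical use is dropping a reference count (a sketch; the object layout and the kfree() destination are illustrative): + * + * if (atomic64_dec_and_test(&obj->refcnt)) + * kfree(obj);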
+ */ +static inline int atomic64_dec_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_inc_and_test - increment and test + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic64_add_negative(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_add_return - add and return + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline long atomic64_add_return(long i, atomic64_t *v) +{ + long __i = i; + asm volatile(LOCK_PREFIX "xaddq %0, %1;" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline long atomic64_sub_return(long i, atomic64_t *v) +{ + return atomic64_add_return(-i, v); +} + +#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) +#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) + +#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) + +#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) + +/** + * atomic_add_unless - add unless the number is a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +/** + * atomic64_add_unless - add unless the number is a given value + * @v: pointer of type atomic64_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. 
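+ * + * A typical pattern (sketch) takes a new reference only while the count has not already dropped to zero: + * + * if (atomic64_add_unless(&obj->refs, 1, 0)) + * use(obj); (reference acquired)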
+ */ +static inline int atomic64_add_unless(atomic64_t *v, long a, long u) +{ + long c, old; + c = atomic64_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic64_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +/** + * atomic_inc_short - increment of a short integer + * @v: pointer to type short int + * + * Atomically adds 1 to @v. + * Returns the new value of @v. + */ +static inline short int atomic_inc_short(short int *v) +{ + asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); + return *v; +} + +/** + * atomic_or_long - OR a value into a long integer + * @v1: pointer to type unsigned long + * @v2: value to OR into @v1 + * + * Atomically ORs @v2 into @v1. There is no return value. + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); +} + +#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" ((unsigned)(mask)), "m" (*(addr)) \ + : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + +#include <asm-generic/atomic.h> +#endif /* _ASM_X86_ATOMIC_64_H */ diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h new file mode 100644 index 000000000000..1316b4c35425 --- /dev/null +++ b/arch/x86/include/asm/auxvec.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_AUXVEC_H +#define _ASM_X86_AUXVEC_H +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#ifdef __i386__ +#define AT_SYSINFO 32 +#endif +#define AT_SYSINFO_EHDR 33 + +#endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h new file mode 100644 index 000000000000..1d9543b9d358 --- /dev/null +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -0,0 +1,139 @@ +#ifndef __ASM_MACH_APIC_H +#define __ASM_MACH_APIC_H + +#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu)) +#define esr_disable (1) + +static inline int apic_id_registered(void) +{ + return (1); +} + +static inline cpumask_t target_cpus(void) +{ +#ifdef CONFIG_SMP + return cpu_online_map; +#else + return cpumask_of_cpu(0); +#endif +} + +#undef APIC_DEST_LOGICAL +#define APIC_DEST_LOGICAL 0 +#define APIC_DFR_VALUE (APIC_DFR_FLAT) +#define INT_DELIVERY_MODE (dest_Fixed) +#define INT_DEST_MODE (0) /* phys delivery to target proc */ +#define NO_BALANCE_IRQ (0) +#define WAKE_SECONDARY_VIA_INIT + + +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return (0); +} + +static inline unsigned long check_apicid_present(int bit) +{ + return (1); +} + +static inline unsigned long calculate_ldr(int cpu) +{ + unsigned long val, id; + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + id = xapic_phys_to_log_apicid(cpu); + val |= SET_APIC_LOGICAL_ID(id); + return val; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends setting DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes...
+ */ +static inline void init_apic_ldr(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + +static inline void setup_apic_routing(void) +{ + printk("Enabling APIC mode: %s. Using %d I/O APICs\n", + "Physflat", nr_ioapics); +} + +static inline int multi_timer_check(int apic, int irq) +{ + return (0); +} + +static inline int apicid_to_node(int logical_apicid) +{ + return apicid_2_node[hard_smp_processor_id()]; +} + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < NR_CPUS) + return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); + + return BAD_APICID; +} + +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +{ + return physid_mask_of_physid(phys_apicid); +} + +extern u8 cpu_2_logical_apicid[]; +/* Mapping from cpu number to logical apicid */ +static inline int cpu_to_logical_apicid(int cpu) +{ + if (cpu >= NR_CPUS) + return BAD_APICID; + return cpu_physical_id(cpu); +} + +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +{ + /* For clustered we don't have a good way to do this yet - hack */ + return physids_promote(0xFFL); +} + +static inline void setup_portio_remap(void) +{ +} + +static inline void enable_apic_mode(void) +{ +} + +static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +{ + return (1); +} + +/* As we are using single CPU as destination, pick only one CPU here */ +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + int apicid; + + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + return apicid; +} + +static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +#endif /* __ASM_MACH_APIC_H */ diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h new file mode 100644 index 000000000000..392c3f5ef2fe --- /dev/null +++ b/arch/x86/include/asm/bigsmp/apicdef.h @@ -0,0 +1,13 @@ +#ifndef __ASM_MACH_APICDEF_H +#define __ASM_MACH_APICDEF_H + +#define APIC_ID_MASK (0xFF<<24) + +static inline unsigned get_apic_id(unsigned long x) +{ + return (((x)>>24)&0xFF); +} + +#define GET_APIC_ID(x) get_apic_id(x) + +#endif diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h new file mode 100644 index 000000000000..9404c535b7ec --- /dev/null +++ b/arch/x86/include/asm/bigsmp/ipi.h @@ -0,0 +1,25 @@ +#ifndef __ASM_MACH_IPI_H +#define __ASM_MACH_IPI_H + +void send_IPI_mask_sequence(cpumask_t mask, int vector); + +static inline void send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static inline void send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + send_IPI_mask(mask, vector); +} + +static inline void send_IPI_all(int vector) +{ + send_IPI_mask(cpu_online_map, vector); +} + +#endif /* __ASM_MACH_IPI_H */ diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h new file mode 100644 index 000000000000..3c7521063d3f --- /dev/null +++ b/arch/x86/include/asm/bios_ebda.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_BIOS_EBDA_H +#define _ASM_X86_BIOS_EBDA_H + +#include <asm/io.h> + +/* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E. 
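+ * + * Callers treat the address returned by the function below as optional (a sketch; the consumer shown is illustrative): + * + * unsigned int ebda_addr = get_bios_ebda(); + * if (ebda_addr) + * printk(KERN_DEBUG "EBDA at %#x\n", ebda_addr);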
+ */ +static inline unsigned int get_bios_ebda(void) +{ + unsigned int address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + return address; /* 0 means none */ +} + +void reserve_ebda_region(void); + +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +/* + * This is obviously not a great place for this, but we want to be + * able to scatter it around anywhere in the kernel. + */ +void check_for_bios_corruption(void); +void start_periodic_check_for_corruption(void); +#else +static inline void check_for_bios_corruption(void) +{ +} + +static inline void start_periodic_check_for_corruption(void) +{ +} +#endif + +#endif /* _ASM_X86_BIOS_EBDA_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h new file mode 100644 index 000000000000..360010322711 --- /dev/null +++ b/arch/x86/include/asm/bitops.h @@ -0,0 +1,451 @@ +#ifndef _ASM_X86_BITOPS_H +#define _ASM_X86_BITOPS_H + +/* + * Copyright 1992, Linus Torvalds. + */ + +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + +#include <linux/compiler.h> +#include <asm/alternative.h> + +/* + * These have to be done with inline assembly: that way the bit-setting + * is guaranteed to be atomic. All bit operations return 0 if the bit + * was cleared before the operation and != 0 if it was not. + * + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). + */ + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) +#else +#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +/* + * We do the locked ops that don't return the old value as + * a mask operation on a byte. + */ +#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) +#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) +#define CONST_MASK(nr) (1 << ((nr) & 7)) + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * + * Note: there are no guarantees that this function will not be reordered + * on non x86 architectures, so if you are writing portable code, + * make sure not to rely on its reordering guarantees. + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void set_bit(unsigned int nr, volatile unsigned long *addr) +{ + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "orb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)CONST_MASK(nr)) + : "memory"); + } else { + asm volatile(LOCK_PREFIX "bts %1,%0" + : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); + } +} + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __set_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. 
However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "andb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)~CONST_MASK(nr))); + } else { + asm volatile(LOCK_PREFIX "btr %1,%0" + : BITOP_ADDR(addr) + : "Ir" (nr)); + } +} + +/* + * clear_bit_unlock - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and implies release semantics before the memory + * operation. It can be used for an unlock. + */ +static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +{ + barrier(); + clear_bit(nr, addr); +} + +static inline void __clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); +} + +/* + * __clear_bit_unlock - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * __clear_bit() is non-atomic and implies release semantics before the memory + * operation. It can be used for an unlock if no other CPUs can concurrently + * modify other bits in the word. + * + * No memory barrier is required here, because x86 cannot reorder stores past + * older loads. Same principle as spin_unlock. + */ +static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +{ + barrier(); + __clear_bit(nr, addr); +} + +#define smp_mb__before_clear_bit() barrier() +#define smp_mb__after_clear_bit() barrier() + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); +} + +/** + * change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * change_bit() is atomic and may not be reordered. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); +} + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "bts %2,%1\n\t" + "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * test_and_set_bit_lock - Set a bit and return its old value for lock + * @nr: Bit to set + * @addr: Address to count from + * + * This is the same as test_and_set_bit on x86. + */ +static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr) +{ + return test_and_set_bit(nr, addr); +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. 
+ * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm("bts %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr)); + return oldbit; +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btr %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btr %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr)); + return oldbit; +} + +/* WARNING: non atomic and it can be reordered! */ +static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btc %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btc %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +static inline int constant_test_bit(int nr, const volatile unsigned long *addr) +{ + return ((1UL << (nr % BITS_PER_LONG)) & + (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; +} + +static inline int variable_test_bit(int nr, volatile const unsigned long *addr) +{ + int oldbit; + + asm volatile("bt %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#if 0 /* Fool kernel-doc since it doesn't do macros yet */ +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static int test_bit(int nr, const volatile unsigned long *addr); +#endif + +#define test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? constant_test_bit((nr), (addr)) \ + : variable_test_bit((nr), (addr))) + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + asm("bsf %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +/** + * ffz - find first zero bit in word + * @word: The word to search + * + * Undefined if no zero exists, so code should check against ~0UL first. 
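+ * + * For example (sketch): + * + * if (word != ~0UL) + * bit = ffz(word); (index of the lowest clear bit)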
+ */ +static inline unsigned long ffz(unsigned long word) +{ + asm("bsf %1,%0" + : "=r" (word) + : "r" (~word)); + return word; +} + +/* + * __fls: find last set bit in word + * @word: The word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +static inline unsigned long __fls(unsigned long word) +{ + asm("bsr %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +#ifdef __KERNEL__ +/** + * ffs - find first set bit in word + * @x: the word to search + * + * This is defined the same way as the libc and compiler builtin ffs + * routines, therefore differs in spirit from the other bitops. + * + * ffs(value) returns 0 if value is 0 or the position of the first + * set bit if value is nonzero. The first (least significant) bit + * is at position 1. + */ +static inline int ffs(int x) +{ + int r; +#ifdef CONFIG_X86_CMOV + asm("bsfl %1,%0\n\t" + "cmovzl %2,%0" + : "=r" (r) : "rm" (x), "r" (-1)); +#else + asm("bsfl %1,%0\n\t" + "jnz 1f\n\t" + "movl $-1,%0\n" + "1:" : "=r" (r) : "rm" (x)); +#endif + return r + 1; +} + +/** + * fls - find last set bit in word + * @x: the word to search + * + * This is defined in a similar way as the libc and compiler builtin + * ffs, but returns the position of the most significant set bit. + * + * fls(value) returns 0 if value is 0 or the position of the last + * set bit if value is nonzero. The last (most significant) bit is + * at position 32. + */ +static inline int fls(int x) +{ + int r; +#ifdef CONFIG_X86_CMOV + asm("bsrl %1,%0\n\t" + "cmovzl %2,%0" + : "=&r" (r) : "rm" (x), "rm" (-1)); +#else + asm("bsrl %1,%0\n\t" + "jnz 1f\n\t" + "movl $-1,%0\n" + "1:" : "=r" (r) : "rm" (x)); +#endif + return r + 1; +} +#endif /* __KERNEL__ */ + +#undef ADDR + +#ifdef __KERNEL__ + +#include <asm-generic/bitops/sched.h> + +#define ARCH_HAS_FAST_MULTIPLIER 1 + +#include <asm-generic/bitops/hweight.h> + +#endif /* __KERNEL__ */ + +#include <asm-generic/bitops/fls64.h> + +#ifdef __KERNEL__ + +#include <asm-generic/bitops/ext2-non-atomic.h> + +#define ext2_set_bit_atomic(lock, nr, addr) \ + test_and_set_bit((nr), (unsigned long *)(addr)) +#define ext2_clear_bit_atomic(lock, nr, addr) \ + test_and_clear_bit((nr), (unsigned long *)(addr)) + +#include <asm-generic/bitops/minix.h> + +#endif /* __KERNEL__ */ +#endif /* _ASM_X86_BITOPS_H */ diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h new file mode 100644 index 000000000000..dd61616cb73d --- /dev/null +++ b/arch/x86/include/asm/boot.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_BOOT_H +#define _ASM_X86_BOOT_H + +/* Don't touch these, unless you really know what you're doing. */ +#define DEF_SYSSEG 0x1000 +#define DEF_SYSSIZE 0x7F00 + +/* Internal svga startup constants */ +#define NORMAL_VGA 0xffff /* 80x25 mode */ +#define EXTENDED_VGA 0xfffe /* 80x50 mode */ +#define ASK_VGA 0xfffd /* ask for it at bootup */ + +/* Physical address where kernel should be loaded. 
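+ * + * LOAD_PHYSICAL_ADDR below rounds CONFIG_PHYSICAL_START up to the next + * CONFIG_PHYSICAL_ALIGN boundary; with the illustrative values + * START = 0x100000 and ALIGN = 0x200000: + * + * (0x100000 + 0x1FFFFF) & ~0x1FFFFF = 0x200000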
*/ +#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + + (CONFIG_PHYSICAL_ALIGN - 1)) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) + +#ifdef CONFIG_X86_64 +#define BOOT_HEAP_SIZE 0x7000 +#define BOOT_STACK_SIZE 0x4000 +#else +#define BOOT_HEAP_SIZE 0x4000 +#define BOOT_STACK_SIZE 0x1000 +#endif + +#endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h new file mode 100644 index 000000000000..433adaebf9b6 --- /dev/null +++ b/arch/x86/include/asm/bootparam.h @@ -0,0 +1,111 @@ +#ifndef _ASM_X86_BOOTPARAM_H +#define _ASM_X86_BOOTPARAM_H + +#include <linux/types.h> +#include <linux/screen_info.h> +#include <linux/apm_bios.h> +#include <linux/edd.h> +#include <asm/e820.h> +#include <asm/ist.h> +#include <video/edid.h> + +/* setup data types */ +#define SETUP_NONE 0 +#define SETUP_E820_EXT 1 + +/* extensible setup data list node */ +struct setup_data { + __u64 next; + __u32 type; + __u32 len; + __u8 data[0]; +}; + +struct setup_header { + __u8 setup_sects; + __u16 root_flags; + __u32 syssize; + __u16 ram_size; +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + __u16 vid_mode; + __u16 root_dev; + __u16 boot_flag; + __u16 jump; + __u32 header; + __u16 version; + __u32 realmode_swtch; + __u16 start_sys; + __u16 kernel_version; + __u8 type_of_loader; + __u8 loadflags; +#define LOADED_HIGH (1<<0) +#define QUIET_FLAG (1<<5) +#define KEEP_SEGMENTS (1<<6) +#define CAN_USE_HEAP (1<<7) + __u16 setup_move_size; + __u32 code32_start; + __u32 ramdisk_image; + __u32 ramdisk_size; + __u32 bootsect_kludge; + __u16 heap_end_ptr; + __u16 _pad1; + __u32 cmd_line_ptr; + __u32 initrd_addr_max; + __u32 kernel_alignment; + __u8 relocatable_kernel; + __u8 _pad2[3]; + __u32 cmdline_size; + __u32 hardware_subarch; + __u64 hardware_subarch_data; + __u32 payload_offset; + __u32 payload_length; + __u64 setup_data; +} __attribute__((packed)); + +struct sys_desc_table { + __u16 length; + __u8 table[14]; +}; + +struct efi_info { + __u32 efi_loader_signature; + __u32 efi_systab; + __u32 efi_memdesc_size; + __u32 efi_memdesc_version; + __u32 efi_memmap; + __u32 efi_memmap_size; + __u32 efi_systab_hi; + __u32 efi_memmap_hi; +}; + +/* The so-called "zeropage" */ +struct boot_params { + struct screen_info screen_info; /* 0x000 */ + struct apm_bios_info apm_bios_info; /* 0x040 */ + __u8 _pad2[12]; /* 0x054 */ + struct ist_info ist_info; /* 0x060 */ + __u8 _pad3[16]; /* 0x070 */ + __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ + __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ + struct sys_desc_table sys_desc_table; /* 0x0a0 */ + __u8 _pad4[144]; /* 0x0b0 */ + struct edid_info edid_info; /* 0x140 */ + struct efi_info efi_info; /* 0x1c0 */ + __u32 alt_mem_k; /* 0x1e0 */ + __u32 scratch; /* Scratch field! 
*/ /* 0x1e4 */ + __u8 e820_entries; /* 0x1e8 */ + __u8 eddbuf_entries; /* 0x1e9 */ + __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ + __u8 _pad6[6]; /* 0x1eb */ + struct setup_header hdr; /* setup header */ /* 0x1f1 */ + __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; + __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ + struct e820entry e820_map[E820MAX]; /* 0x2d0 */ + __u8 _pad8[48]; /* 0xcd0 */ + struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */ + __u8 _pad9[276]; /* 0xeec */ +} __attribute__((packed)); + +#endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h new file mode 100644 index 000000000000..3def2065fcea --- /dev/null +++ b/arch/x86/include/asm/bug.h @@ -0,0 +1,39 @@ +#ifndef _ASM_X86_BUG_H +#define _ASM_X86_BUG_H + +#ifdef CONFIG_BUG +#define HAVE_ARCH_BUG + +#ifdef CONFIG_DEBUG_BUGVERBOSE + +#ifdef CONFIG_X86_32 +# define __BUG_C0 "2:\t.long 1b, %c0\n" +#else +# define __BUG_C0 "2:\t.quad 1b, %c0\n" +#endif + +#define BUG() \ +do { \ + asm volatile("1:\tud2\n" \ + ".pushsection __bug_table,\"a\"\n" \ + __BUG_C0 \ + "\t.word %c1, 0\n" \ + "\t.org 2b+%c2\n" \ + ".popsection" \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (sizeof(struct bug_entry))); \ + for (;;) ; \ +} while (0) + +#else +#define BUG() \ +do { \ + asm volatile("ud2"); \ + for (;;) ; \ +} while (0) +#endif + +#endif /* !CONFIG_BUG */ + +#include <asm-generic/bug.h> +#endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h new file mode 100644 index 000000000000..08abf639075f --- /dev/null +++ b/arch/x86/include/asm/bugs.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_BUGS_H +#define _ASM_X86_BUGS_H + +extern void check_bugs(void); + +#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32) +int ppro_with_ram_bug(void); +#else +static inline int ppro_with_ram_bug(void) { return 0; } +#endif + +#endif /* _ASM_X86_BUGS_H */ diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h new file mode 100644 index 000000000000..e02ae2d89acf --- /dev/null +++ b/arch/x86/include/asm/byteorder.h @@ -0,0 +1,81 @@ +#ifndef _ASM_X86_BYTEORDER_H +#define _ASM_X86_BYTEORDER_H + +#include <asm/types.h> +#include <linux/compiler.h> + +#ifdef __GNUC__ + +#ifdef __i386__ + +static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) +{ +#ifdef CONFIG_X86_BSWAP + asm("bswap %0" : "=r" (x) : "0" (x)); +#else + asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ + "rorl $16,%0\n\t" /* swap words */ + "xchgb %b0,%h0" /* swap higher bytes */ + : "=q" (x) + : "0" (x)); +#endif + return x; +} + +static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) +{ + union { + struct { + __u32 a; + __u32 b; + } s; + __u64 u; + } v; + v.u = val; +#ifdef CONFIG_X86_BSWAP + asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" + : "=r" (v.s.a), "=r" (v.s.b) + : "0" (v.s.a), "1" (v.s.b)); +#else + v.s.a = ___arch__swab32(v.s.a); + v.s.b = ___arch__swab32(v.s.b); + asm("xchgl %0,%1" + : "=r" (v.s.a), "=r" (v.s.b) + : "0" (v.s.a), "1" (v.s.b)); +#endif + return v.u; +} + +#else /* __i386__ */ + +static inline __attribute_const__ __u64 ___arch__swab64(__u64 x) +{ + asm("bswapq %0" + : "=r" (x) + : "0" (x)); + return x; +} + +static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) +{ + asm("bswapl %0" + : "=r" (x) + : "0" (x)); + return x; +} + +#endif + +/* Do not define swab16. Gcc is smart enough to recognize the "C" version and + convert it into a rotation or exchange.
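+ * + * For reference, a worked example of the 32-bit swap: + * + * ___arch__swab32(0x12345678) == 0x78563412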
*/ + +#define __arch__swab64(x) ___arch__swab64(x) +#define __arch__swab32(x) ___arch__swab32(x) + +#define __BYTEORDER_HAS_U64__ + +#endif /* __GNUC__ */ + +#include <linux/byteorder/little_endian.h> + +#endif /* _ASM_X86_BYTEORDER_H */ diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h new file mode 100644 index 000000000000..5d367caa0e36 --- /dev/null +++ b/arch/x86/include/asm/cache.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_CACHE_H +#define _ASM_X86_CACHE_H + +/* L1 cache line size */ +#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) +#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) + +#define __read_mostly __attribute__((__section__(".data.read_mostly"))) + +#ifdef CONFIG_X86_VSMP +/* vSMP Internode cacheline shift */ +#define INTERNODE_CACHE_SHIFT (12) +#ifdef CONFIG_SMP +#define __cacheline_aligned_in_smp \ + __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ + __attribute__((__section__(".data.page_aligned"))) +#endif +#endif + +#endif /* _ASM_X86_CACHE_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h new file mode 100644 index 000000000000..2f8466540fb5 --- /dev/null +++ b/arch/x86/include/asm/cacheflush.h @@ -0,0 +1,118 @@ +#ifndef _ASM_X86_CACHEFLUSH_H +#define _ASM_X86_CACHEFLUSH_H + +/* Keep includes the same across arches. */ +#include <linux/mm.h> + +/* Caches aren't brain-dead on Intel. */ +#define flush_cache_all() do { } while (0) +#define flush_cache_mm(mm) do { } while (0) +#define flush_cache_dup_mm(mm) do { } while (0) +#define flush_cache_range(vma, start, end) do { } while (0) +#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define flush_dcache_page(page) do { } while (0) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) +#define flush_icache_range(start, end) do { } while (0) +#define flush_icache_page(vma, pg) do { } while (0) +#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) +#define flush_cache_vmap(start, end) do { } while (0) +#define flush_cache_vunmap(start, end) do { } while (0) + +#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ + memcpy((dst), (src), (len)) +#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ + memcpy((dst), (src), (len)) + +#define PG_non_WB PG_arch_1 +PAGEFLAG(NonWB, non_WB) + +/* + * The set_memory_* API can be used to change various attributes of a virtual + * address range. The attributes include: + * Cacheability : UnCached, WriteCombining, WriteBack + * Executability : eXecutable, NoteXecutable + * Read/Write : ReadOnly, ReadWrite + * Presence : NotPresent + * + * Within a category, the attributes are mutually exclusive. + * + * The implementation of this API will take care of various aspects that + * are associated with changing such attributes, such as: + * - Flushing TLBs + * - Flushing CPU caches + * - Making sure aliases of the memory behind the mapping don't violate + * coherency rules as defined by the CPU in the system. + * + * What this API does not do: + * - Provide exclusion between various callers - including callers that + * operate on other mappings of the same physical page + * - Restore default attributes when a page is freed + * - Guarantee that mappings other than the requested one are + * in any state, other than that these do not violate rules for + * the CPU you have. Do not depend on any effects on other mappings, + * CPUs other than the one you have may have more relaxed rules.
+ * The caller is required to take care of these. + */ + +int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wb(unsigned long addr, int numpages); +int set_memory_uc(unsigned long addr, int numpages); +int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wb(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_np(unsigned long addr, int numpages); +int set_memory_4k(unsigned long addr, int numpages); + +int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wb(unsigned long *addr, int addrinarray); + +/* + * For legacy compatibility with the old APIs, a few functions + * are provided that work on a "struct page". + * These functions operate ONLY on the 1:1 kernel mapping of the + * memory that the struct page represents, and internally just + * call the set_memory_* function. See the description of the + * set_memory_* function for more details on conventions. + * + * These APIs should be considered *deprecated* and are likely going to + * be removed in the future. + * The reason for this is the implicit operation on the 1:1 mapping only, + * making this not a generally useful API. + * + * Specifically, many users of the old APIs had a virtual address, + * called virt_to_page() or vmalloc_to_page() on that address to + * get a struct page* that the old API required. + * To convert these cases, use set_memory_*() on the original + * virtual address, do not use these functions. + */ + +int set_pages_uc(struct page *page, int numpages); +int set_pages_wb(struct page *page, int numpages); +int set_pages_x(struct page *page, int numpages); +int set_pages_nx(struct page *page, int numpages); +int set_pages_ro(struct page *page, int numpages); +int set_pages_rw(struct page *page, int numpages); + + +void clflush_cache_range(void *addr, unsigned int size); + +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void); +extern const int rodata_test_data; +#endif + +#ifdef CONFIG_DEBUG_RODATA_TEST +int rodata_test(void); +#else +static inline int rodata_test(void) +{ + return 0; +} +#endif + +#endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h new file mode 100644 index 000000000000..b03bedb62aa7 --- /dev/null +++ b/arch/x86/include/asm/calgary.h @@ -0,0 +1,72 @@ +/* + * Derived from include/asm-powerpc/iommu.h + * + * Copyright IBM Corporation, 2006-2007 + * + * Author: Jon Mason <jdmason@us.ibm.com> + * Author: Muli Ben-Yehuda <muli@il.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_CALGARY_H +#define _ASM_X86_CALGARY_H + +#include <linux/spinlock.h> +#include <linux/device.h> +#include <linux/dma-mapping.h> +#include <linux/timer.h> +#include <asm/types.h> + +struct iommu_table { + struct cal_chipset_ops *chip_ops; /* chipset specific funcs */ + unsigned long it_base; /* mapped address of tce table */ + unsigned long it_hint; /* Hint for next alloc */ + unsigned long *it_map; /* A simple allocation bitmap for now */ + void __iomem *bbar; /* Bridge BAR */ + u64 tar_val; /* Table Address Register */ + struct timer_list watchdog_timer; + spinlock_t it_lock; /* Protects it_map */ + unsigned int it_size; /* Size of iommu table in entries */ + unsigned char it_busno; /* Bus number this table belongs to */ +}; + +struct cal_chipset_ops { + void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev); + void (*tce_cache_blast)(struct iommu_table *tbl); + void (*dump_error_regs)(struct iommu_table *tbl); +}; + +#define TCE_TABLE_SIZE_UNSPECIFIED ~0 +#define TCE_TABLE_SIZE_64K 0 +#define TCE_TABLE_SIZE_128K 1 +#define TCE_TABLE_SIZE_256K 2 +#define TCE_TABLE_SIZE_512K 3 +#define TCE_TABLE_SIZE_1M 4 +#define TCE_TABLE_SIZE_2M 5 +#define TCE_TABLE_SIZE_4M 6 +#define TCE_TABLE_SIZE_8M 7 + +extern int use_calgary; + +#ifdef CONFIG_CALGARY_IOMMU +extern int calgary_iommu_init(void); +extern void detect_calgary(void); +#else +static inline int calgary_iommu_init(void) { return 1; } +static inline void detect_calgary(void) { return; } +#endif + +#endif /* _ASM_X86_CALGARY_H */ diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h new file mode 100644 index 000000000000..2bc162e0ec6e --- /dev/null +++ b/arch/x86/include/asm/calling.h @@ -0,0 +1,170 @@ +/* + * Some macros to handle stack frames in assembly. + */ + +#define R15 0 +#define R14 8 +#define R13 16 +#define R12 24 +#define RBP 32 +#define RBX 40 + +/* arguments: interrupts/non-tracing syscalls only save up to here */ +#define R11 48 +#define R10 56 +#define R9 64 +#define R8 72 +#define RAX 80 +#define RCX 88 +#define RDX 96 +#define RSI 104 +#define RDI 112 +#define ORIG_RAX 120 /* + error_code */ +/* end of arguments */ + +/* cpu exception frame or undefined in case of fast syscall.
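+ * + * As a sketch of how the macros below combine (the kernel entry code is the intended consumer; the flag values shown are illustrative), a path that may clobber %rcx anyway could use: + * + * SAVE_ARGS 0, 1 (addskip=0, norcx=1) + * ... + * RESTORE_ARGS 0, 0, 1 (skiprax=0, addskip=0, skiprcx=1)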
*/ +#define RIP 128 +#define CS 136 +#define EFLAGS 144 +#define RSP 152 +#define SS 160 + +#define ARGOFFSET R11 +#define SWFRAME ORIG_RAX + + .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0 + subq $9*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET 9*8+\addskip + movq %rdi, 8*8(%rsp) + CFI_REL_OFFSET rdi, 8*8 + movq %rsi, 7*8(%rsp) + CFI_REL_OFFSET rsi, 7*8 + movq %rdx, 6*8(%rsp) + CFI_REL_OFFSET rdx, 6*8 + .if \norcx + .else + movq %rcx, 5*8(%rsp) + CFI_REL_OFFSET rcx, 5*8 + .endif + movq %rax, 4*8(%rsp) + CFI_REL_OFFSET rax, 4*8 + .if \nor891011 + .else + movq %r8, 3*8(%rsp) + CFI_REL_OFFSET r8, 3*8 + movq %r9, 2*8(%rsp) + CFI_REL_OFFSET r9, 2*8 + movq %r10, 1*8(%rsp) + CFI_REL_OFFSET r10, 1*8 + movq %r11, (%rsp) + CFI_REL_OFFSET r11, 0*8 + .endif + .endm + +#define ARG_SKIP 9*8 + + .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ + skipr8910=0, skiprdx=0 + .if \skipr11 + .else + movq (%rsp), %r11 + CFI_RESTORE r11 + .endif + .if \skipr8910 + .else + movq 1*8(%rsp), %r10 + CFI_RESTORE r10 + movq 2*8(%rsp), %r9 + CFI_RESTORE r9 + movq 3*8(%rsp), %r8 + CFI_RESTORE r8 + .endif + .if \skiprax + .else + movq 4*8(%rsp), %rax + CFI_RESTORE rax + .endif + .if \skiprcx + .else + movq 5*8(%rsp), %rcx + CFI_RESTORE rcx + .endif + .if \skiprdx + .else + movq 6*8(%rsp), %rdx + CFI_RESTORE rdx + .endif + movq 7*8(%rsp), %rsi + CFI_RESTORE rsi + movq 8*8(%rsp), %rdi + CFI_RESTORE rdi + .if ARG_SKIP+\addskip > 0 + addq $ARG_SKIP+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) + .endif + .endm + + .macro LOAD_ARGS offset, skiprax=0 + movq \offset(%rsp), %r11 + movq \offset+8(%rsp), %r10 + movq \offset+16(%rsp), %r9 + movq \offset+24(%rsp), %r8 + movq \offset+40(%rsp), %rcx + movq \offset+48(%rsp), %rdx + movq \offset+56(%rsp), %rsi + movq \offset+64(%rsp), %rdi + .if \skiprax + .else + movq \offset+72(%rsp), %rax + .endif + .endm + +#define REST_SKIP 6*8 + + .macro SAVE_REST + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + movq %rbx, 5*8(%rsp) + CFI_REL_OFFSET rbx, 5*8 + movq %rbp, 4*8(%rsp) + CFI_REL_OFFSET rbp, 4*8 + movq %r12, 3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 + movq %r13, 2*8(%rsp) + CFI_REL_OFFSET r13, 2*8 + movq %r14, 1*8(%rsp) + CFI_REL_OFFSET r14, 1*8 + movq %r15, (%rsp) + CFI_REL_OFFSET r15, 0*8 + .endm + + .macro RESTORE_REST + movq (%rsp), %r15 + CFI_RESTORE r15 + movq 1*8(%rsp), %r14 + CFI_RESTORE r14 + movq 2*8(%rsp), %r13 + CFI_RESTORE r13 + movq 3*8(%rsp), %r12 + CFI_RESTORE r12 + movq 4*8(%rsp), %rbp + CFI_RESTORE rbp + movq 5*8(%rsp), %rbx + CFI_RESTORE rbx + addq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET -(REST_SKIP) + .endm + + .macro SAVE_ALL + SAVE_ARGS + SAVE_REST + .endm + + .macro RESTORE_ALL addskip=0 + RESTORE_REST + RESTORE_ARGS 0, \addskip + .endm + + .macro icebp + .byte 0xf1 + .endm diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h new file mode 100644 index 000000000000..848850fd7d62 --- /dev/null +++ b/arch/x86/include/asm/checksum.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "checksum_32.h" +#else +# include "checksum_64.h" +#endif diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h new file mode 100644 index 000000000000..7c5ef8b14d92 --- /dev/null +++ b/arch/x86/include/asm/checksum_32.h @@ -0,0 +1,189 @@ +#ifndef _ASM_X86_CHECKSUM_32_H +#define _ASM_X86_CHECKSUM_32_H + +#include <linux/in6.h> + +#include <asm/uaccess.h> + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for 
feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum); + +/* + * the same as csum_partial, but copies from src while it + * checksums, and handles user-space pointer exceptions correctly, when needed. + * + * here even more important to align src and dst on a 32-bit (or even + * better 64-bit) boundary + */ + +asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst, + int len, __wsum sum, + int *src_err_ptr, int *dst_err_ptr); + +/* + * Note: when you get a NULL pointer exception here this means someone + * passed in an incorrect kernel address to one of these functions. + * + * If you use these functions directly please don't forget the + * access_ok(). + */ +static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, + int len, __wsum sum) +{ + return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); +} + +static inline __wsum csum_partial_copy_from_user(const void __user *src, + void *dst, + int len, __wsum sum, + int *err_ptr) +{ + might_sleep(); + return csum_partial_copy_generic((__force void *)src, dst, + len, sum, err_ptr, NULL); +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * + * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by + * Arnt Gulbrandsen. + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned int sum; + + asm volatile("movl (%1), %0 ;\n" + "subl $4, %2 ;\n" + "jbe 2f ;\n" + "addl 4(%1), %0 ;\n" + "adcl 8(%1), %0 ;\n" + "adcl 12(%1), %0;\n" + "1: adcl 16(%1), %0 ;\n" + "lea 4(%1), %1 ;\n" + "decl %2 ;\n" + "jne 1b ;\n" + "adcl $0, %0 ;\n" + "movl %0, %2 ;\n" + "shrl $16, %0 ;\n" + "addw %w2, %w0 ;\n" + "adcl $0, %0 ;\n" + "notl %0 ;\n" + "2: ;\n" + /* Since the input registers which are loaded with iph and ihl + are modified, we must also specify them as outputs, or gcc + will assume they contain their original values. 
*/ + : "=r" (sum), "=r" (iph), "=r" (ihl) + : "1" (iph), "2" (ihl) + : "memory"); + return (__force __sum16)sum; +} + +/* + * Fold a partial checksum + */ + +static inline __sum16 csum_fold(__wsum sum) +{ + asm("addl %1, %0 ;\n" + "adcl $0xffff, %0 ;\n" + : "=r" (sum) + : "r" ((__force u32)sum << 16), + "0" ((__force u32)sum & 0xffff0000)); + return (__force __sum16)(~(__force u32)sum >> 16); +} + +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + asm("addl %1, %0 ;\n" + "adcl %2, %0 ;\n" + "adcl %3, %0 ;\n" + "adcl $0, %0 ;\n" + : "=r" (sum) + : "g" (daddr), "g"(saddr), + "g" ((len + proto) << 8), "0" (sum)); + return sum; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ + +static inline __sum16 ip_compute_csum(const void *buff, int len) +{ + return csum_fold(csum_partial(buff, len, 0)); +} + +#define _HAVE_ARCH_IPV6_CSUM +static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, + __wsum sum) +{ + asm("addl 0(%1), %0 ;\n" + "adcl 4(%1), %0 ;\n" + "adcl 8(%1), %0 ;\n" + "adcl 12(%1), %0 ;\n" + "adcl 0(%2), %0 ;\n" + "adcl 4(%2), %0 ;\n" + "adcl 8(%2), %0 ;\n" + "adcl 12(%2), %0 ;\n" + "adcl %3, %0 ;\n" + "adcl %4, %0 ;\n" + "adcl $0, %0 ;\n" + : "=&r" (sum) + : "r" (saddr), "r" (daddr), + "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)); + + return csum_fold(sum); +} + +/* + * Copy and checksum to user + */ +#define HAVE_CSUM_COPY_USER +static inline __wsum csum_and_copy_to_user(const void *src, + void __user *dst, + int len, __wsum sum, + int *err_ptr) +{ + might_sleep(); + if (access_ok(VERIFY_WRITE, dst, len)) + return csum_partial_copy_generic(src, (__force void *)dst, + len, sum, NULL, err_ptr); + + if (len) + *err_ptr = -EFAULT; + + return (__force __wsum)-1; /* invalid checksum */ +} + +#endif /* _ASM_X86_CHECKSUM_32_H */ diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h new file mode 100644 index 000000000000..9bfdc41629ec --- /dev/null +++ b/arch/x86/include/asm/checksum_64.h @@ -0,0 +1,191 @@ +#ifndef _ASM_X86_CHECKSUM_64_H +#define _ASM_X86_CHECKSUM_64_H + +/* + * Checksums for x86-64 + * Copyright 2002 by Andi Kleen, SuSE Labs + * with some code from asm-x86/checksum.h + */ + +#include <linux/compiler.h> +#include <asm/uaccess.h> +#include <asm/byteorder.h> + +/** + * csum_fold - Fold and invert a 32bit checksum. + * sum: 32bit unfolded sum + * + * Fold a 32bit running checksum to 16bit and invert it. This is usually + * the last step before putting a checksum into a packet. + * Make sure not to mix with 64bit checksums. + */ +static inline __sum16 csum_fold(__wsum sum) +{ + asm(" addl %1,%0\n" + " adcl $0xffff,%0" + : "=r" (sum) + : "r" ((__force u32)sum << 16), + "0" ((__force u32)sum & 0xffff0000)); + return (__force __sum16)(~(__force u32)sum >> 16); +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * + * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by + * Arnt Gulbrandsen. 
+ */ + +/** + * ip_fast_csum - Compute the IPv4 header checksum efficiently. + * iph: ipv4 header + * ihl: length of header / 4 + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned int sum; + + asm(" movl (%1), %0\n" + " subl $4, %2\n" + " jbe 2f\n" + " addl 4(%1), %0\n" + " adcl 8(%1), %0\n" + " adcl 12(%1), %0\n" + "1: adcl 16(%1), %0\n" + " lea 4(%1), %1\n" + " decl %2\n" + " jne 1b\n" + " adcl $0, %0\n" + " movl %0, %2\n" + " shrl $16, %0\n" + " addw %w2, %w0\n" + " adcl $0, %0\n" + " notl %0\n" + "2:" + /* Since the input registers which are loaded with iph and ihl + are modified, we must also specify them as outputs, or gcc + will assume they contain their original values. */ + : "=r" (sum), "=r" (iph), "=r" (ihl) + : "1" (iph), "2" (ihl) + : "memory"); + return (__force __sum16)sum; +} + +/** + * csum_tcpudp_nofold - Compute an IPv4 pseudo header checksum. + * @saddr: source address + * @daddr: destination address + * @len: length of packet + * @proto: ip protocol of packet + * @sum: initial sum to be added in (32bit unfolded) + * + * Returns the pseudo header checksum of the input data. Result is + * 32bit unfolded. + */ +static inline __wsum +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, + unsigned short proto, __wsum sum) +{ + asm(" addl %1, %0\n" + " adcl %2, %0\n" + " adcl %3, %0\n" + " adcl $0, %0\n" + : "=r" (sum) + : "g" (daddr), "g" (saddr), + "g" ((len + proto)<<8), "0" (sum)); + return sum; +} + + +/** + * csum_tcpudp_magic - Compute an IPv4 pseudo header checksum. + * @saddr: source address + * @daddr: destination address + * @len: length of packet + * @proto: ip protocol of packet + * @sum: initial sum to be added in (32bit unfolded) + * + * Returns the 16bit pseudo header checksum of the input data, already + * complemented and ready to be filled in. + */ +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +/** + * csum_partial - Compute an internet checksum. + * @buff: buffer to be checksummed + * @len: length of buffer. + * @sum: initial sum to be added in (32bit unfolded) + * + * Returns the 32bit unfolded internet checksum of the buffer. + * Before filling it in it needs to be csum_fold()'ed. + * buff should be aligned to a 64bit boundary if possible. + */ +extern __wsum csum_partial(const void *buff, int len, __wsum sum); + +#define  _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +#define HAVE_CSUM_COPY_USER 1 + + +/* Do not call this directly. Use the wrappers below */ +extern __wsum csum_partial_copy_generic(const void *src, const void *dst, + int len, __wsum sum, + int *src_err_ptr, int *dst_err_ptr); + + +extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum isum, int *errp); +extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst, + int len, __wsum isum, int *errp); +extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, + int len, __wsum sum); + +/* Old names. To be removed. */ +#define csum_and_copy_to_user csum_partial_copy_to_user +#define csum_and_copy_from_user csum_partial_copy_from_user + +/** + * ip_compute_csum - Compute a 16bit IP checksum. + * @buff: buffer address. + * @len: length of buffer. + * + * Returns the 16bit folded/inverted checksum of the passed buffer. + * Ready to fill in. 
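How these primitives compose is easiest to see in a sketch; the field names come from the usual struct iphdr/tcphdr, everything else here is illustrative:

	/* sketch: checksum a linear TCP segment of 'len' bytes at 'th' */
	static __sum16 tcp_csum_sketch(const struct iphdr *ih,
				       const struct tcphdr *th, int len)
	{
		__wsum csum = csum_partial(th, len, 0);	/* unfolded sum of the segment */

		return csum_tcpudp_magic(ih->saddr, ih->daddr,
					 len, IPPROTO_TCP, csum);	/* add pseudo header, fold */
	}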
+ */ +extern __sum16 ip_compute_csum(const void *buff, int len); + +/** + * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header. + * @saddr: source address + * @daddr: destination address + * @len: length of packet + * @proto: protocol of packet + * @sum: initial sum (32bit unfolded) to be added in + * + * Computes an IPv6 pseudo header checksum. This sum is added into the + * checksum of UDP/TCP packets and covers some network layer information + * (addresses, length, protocol). + * Returns the unfolded 32bit checksum. + */ + +struct in6_addr; + +#define _HAVE_ARCH_IPV6_CSUM 1 +extern __sum16 +csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, + __u32 len, unsigned short proto, __wsum sum); + +static inline unsigned add32_with_carry(unsigned a, unsigned b) +{ + asm("addl %2,%0\n\t" + "adcl $0,%0" + : "=r" (a) + : "0" (a), "r" (b)); + return a; +} + +#endif /* _ASM_X86_CHECKSUM_64_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h new file mode 100644 index 000000000000..a460fa088d4c --- /dev/null +++ b/arch/x86/include/asm/cmpxchg.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "cmpxchg_32.h" +#else +# include "cmpxchg_64.h" +#endif diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h new file mode 100644 index 000000000000..82ceb788a981 --- /dev/null +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -0,0 +1,344 @@ +#ifndef _ASM_X86_CMPXCHG_32_H +#define _ASM_X86_CMPXCHG_32_H + +#include <linux/bitops.h> /* for LOCK_PREFIX */ + +/* + * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you + * need to test for the feature in boot_cpu_data. + */ + +#define xchg(ptr, v) \ + ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) + +struct __xchg_dummy { + unsigned long a[100]; +}; +#define __xg(x) ((struct __xchg_dummy *)(x)) + +/* + * The semantics of CMPXCHG8B are a bit strange, this is why + * there is a loop and the loading of %%eax and %%edx has to + * be inside. This inlines well in most cases, the cached + * cost is around ~38 cycles. (in the future we might want + * to do a SIMD/3DNOW!/MMX/FPU 64-bit store here, but that + * might have an implicit FPU-save as a cost, so it's not + * clear which path to take.) + * + * cmpxchg8b must be used with the lock prefix here to allow + * the instruction to be executed atomically, see page 3-102 + * of the instruction set reference 24319102.pdf. We need + * the reader side to see the coherent 64bit value. + */ +static inline void __set_64bit(unsigned long long *ptr, + unsigned int low, unsigned int high) +{ + asm volatile("\n1:\t" + "movl (%0), %%eax\n\t" + "movl 4(%0), %%edx\n\t" + LOCK_PREFIX "cmpxchg8b (%0)\n\t" + "jnz 1b" + : /* no outputs */ + : "D"(ptr), + "b"(low), + "c"(high) + : "ax", "dx", "memory"); +} + +static inline void __set_64bit_constant(unsigned long long *ptr, + unsigned long long value) +{ + __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32)); +} + +#define ll_low(x) *(((unsigned int *)&(x)) + 0) +#define ll_high(x) *(((unsigned int *)&(x)) + 1) + +static inline void __set_64bit_var(unsigned long long *ptr, + unsigned long long value) +{ + __set_64bit(ptr, ll_low(value), ll_high(value)); +} + +#define set_64bit(ptr, value) \ + (__builtin_constant_p((value)) \ + ? __set_64bit_constant((ptr), (value)) \ + : __set_64bit_var((ptr), (value))) + +#define _set_64bit(ptr, value) \ + (__builtin_constant_p(value) \ + ? 
__set_64bit(ptr, (unsigned int)(value), \ + (unsigned int)((value) >> 32)) \ + : __set_64bit(ptr, ll_low((value)), ll_high((value)))) + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway + * Note 2: xchg has side effect, so that attribute volatile is necessary, + * but generally the primitive is invalid, *ptr is output argument. --ANK + */ +static inline unsigned long __xchg(unsigned long x, volatile void *ptr, + int size) +{ + switch (size) { + case 1: + asm volatile("xchgb %b0,%1" + : "=q" (x) + : "m" (*__xg(ptr)), "0" (x) + : "memory"); + break; + case 2: + asm volatile("xchgw %w0,%1" + : "=r" (x) + : "m" (*__xg(ptr)), "0" (x) + : "memory"); + break; + case 4: + asm volatile("xchgl %0,%1" + : "=r" (x) + : "m" (*__xg(ptr)), "0" (x) + : "memory"); + break; + } + return x; +} + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + */ + +#ifdef CONFIG_X86_CMPXCHG +#define __HAVE_ARCH_CMPXCHG 1 +#define cmpxchg(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) +#define sync_cmpxchg(ptr, o, n) \ + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) +#define cmpxchg_local(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) +#endif + +#ifdef CONFIG_X86_CMPXCHG64 +#define cmpxchg64(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ + (unsigned long long)(n))) +#define cmpxchg64_local(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ + (unsigned long long)(n))) +#endif + +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + asm volatile(LOCK_PREFIX "cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +/* + * Always use locked operations when touching memory shared with a + * hypervisor, since the system may be SMP even if the guest kernel + * isn't. 
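The standard pattern built on top of these primitives is a compare-and-swap retry loop; a sketch using the cmpxchg() wrapper defined above (the counter is illustrative):

	/* sketch: lock-free add built on cmpxchg() */
	static inline void lockfree_add_sketch(unsigned long *counter, unsigned long v)
	{
		unsigned long old;

		do {
			old = *counter;			/* snapshot the current value */
		} while (cmpxchg(counter, old, old + v) != old);	/* retry if it moved */
	}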
+ */ +static inline unsigned long __sync_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + asm volatile("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + asm volatile("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + asm volatile("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +static inline unsigned long __cmpxchg_local(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + asm volatile("cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + asm volatile("cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + asm volatile("cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +static inline unsigned long long __cmpxchg64(volatile void *ptr, + unsigned long long old, + unsigned long long new) +{ + unsigned long long prev; + asm volatile(LOCK_PREFIX "cmpxchg8b %3" + : "=A"(prev) + : "b"((unsigned long)new), + "c"((unsigned long)(new >> 32)), + "m"(*__xg(ptr)), + "0"(old) + : "memory"); + return prev; +} + +static inline unsigned long long __cmpxchg64_local(volatile void *ptr, + unsigned long long old, + unsigned long long new) +{ + unsigned long long prev; + asm volatile("cmpxchg8b %3" + : "=A"(prev) + : "b"((unsigned long)new), + "c"((unsigned long)(new >> 32)), + "m"(*__xg(ptr)), + "0"(old) + : "memory"); + return prev; +} + +#ifndef CONFIG_X86_CMPXCHG +/* + * Building a kernel capable of running on an 80386. It may be necessary to + * simulate the cmpxchg on the 80386 CPU. For that purpose we define + * a function for each of the sizes we support. + */ + +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); + +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: + return cmpxchg_386_u8(ptr, old, new); + case 2: + return cmpxchg_386_u16(ptr, old, new); + case 4: + return cmpxchg_386_u32(ptr, old, new); + } + return old; +} + +#define cmpxchg(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + if (likely(boot_cpu_data.x86 > 3)) \ + __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \ + (unsigned long)(o), (unsigned long)(n), \ + sizeof(*(ptr))); \ + else \ + __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \ + (unsigned long)(o), (unsigned long)(n), \ + sizeof(*(ptr))); \ + __ret; \ +}) +#define cmpxchg_local(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + if (likely(boot_cpu_data.x86 > 3)) \ + __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \ + (unsigned long)(o), (unsigned long)(n), \ + sizeof(*(ptr))); \ + else \ + __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \ + (unsigned long)(o), (unsigned long)(n), \ + sizeof(*(ptr))); \ + __ret; \ +}) +#endif + +#ifndef CONFIG_X86_CMPXCHG64 +/* + * Building a kernel capable of running on an 80386 or 80486. It may be + * necessary to simulate the cmpxchg8b on the 80386 and 80486 CPU. 
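The same retry idiom carries over to 64-bit values via cmpxchg64(); a sketch, assuming either CMPXCHG8B hardware or the runtime fallback discussed here:

	/* sketch: 64-bit lock-free add on a 32-bit kernel */
	static inline void lockfree_add64_sketch(u64 *counter, u64 v)
	{
		u64 old;

		do {
			old = *counter;
		} while (cmpxchg64(counter, old, old + v) != old);
	}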
+ */ + +extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); + +#define cmpxchg64(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + if (likely(boot_cpu_data.x86 > 4)) \ + __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n)); \ + else \ + __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n)); \ + __ret; \ +}) +#define cmpxchg64_local(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + if (likely(boot_cpu_data.x86 > 4)) \ + __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n)); \ + else \ + __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n)); \ + __ret; \ +}) + +#endif + +#endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h new file mode 100644 index 000000000000..52de72e0de8c --- /dev/null +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -0,0 +1,185 @@ +#ifndef _ASM_X86_CMPXCHG_64_H +#define _ASM_X86_CMPXCHG_64_H + +#include <asm/alternative.h> /* Provides LOCK_PREFIX */ + +#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \ + (ptr), sizeof(*(ptr)))) + +#define __xg(x) ((volatile long *)(x)) + +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +{ + *ptr = val; +} + +#define _set_64bit set_64bit + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway + * Note 2: xchg has side effect, so that attribute volatile is necessary, + * but generally the primitive is invalid, *ptr is output argument. --ANK + */ +static inline unsigned long __xchg(unsigned long x, volatile void *ptr, + int size) +{ + switch (size) { + case 1: + asm volatile("xchgb %b0,%1" + : "=q" (x) + : "m" (*__xg(ptr)), "0" (x) + : "memory"); + break; + case 2: + asm volatile("xchgw %w0,%1" + : "=r" (x) + : "m" (*__xg(ptr)), "0" (x) + : "memory"); + break; + case 4: + asm volatile("xchgl %k0,%1" + : "=r" (x) + : "m" (*__xg(ptr)), "0" (x) + : "memory"); + break; + case 8: + asm volatile("xchgq %0,%1" + : "=r" (x) + : "m" (*__xg(ptr)), "0" (x) + : "memory"); + break; + } + return x; +} + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + */ + +#define __HAVE_ARCH_CMPXCHG 1 + +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 8: + asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +/* + * Always use locked operations when touching memory shared with a + * hypervisor, since the system may be SMP even if the guest kernel + * isn't. 
+ */ +static inline unsigned long __sync_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + asm volatile("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + asm volatile("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + asm volatile("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +static inline unsigned long __cmpxchg_local(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + asm volatile("cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + asm volatile("cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + asm volatile("cmpxchgl %k1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 8: + asm volatile("cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +#define cmpxchg(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr)))) +#define cmpxchg64(ptr, o, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + cmpxchg((ptr), (o), (n)); \ +}) +#define cmpxchg_local(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) +#define sync_cmpxchg(ptr, o, n) \ + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) +#define cmpxchg64_local(ptr, o, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + cmpxchg_local((ptr), (o), (n)); \ +}) + +#endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h new file mode 100644 index 000000000000..9a9c7bdc923d --- /dev/null +++ b/arch/x86/include/asm/compat.h @@ -0,0 +1,218 @@ +#ifndef _ASM_X86_COMPAT_H +#define _ASM_X86_COMPAT_H + +/* + * Architecture specific compatibility types + */ +#include <linux/types.h> +#include <linux/sched.h> +#include <asm/user32.h> + +#define COMPAT_USER_HZ 100 + +typedef u32 compat_size_t; +typedef s32 compat_ssize_t; +typedef s32 compat_time_t; +typedef s32 compat_clock_t; +typedef s32 compat_pid_t; +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; +typedef u32 __compat_uid32_t; +typedef u32 __compat_gid32_t; +typedef u16 compat_mode_t; +typedef u32 compat_ino_t; +typedef u16 compat_dev_t; +typedef s32 compat_off_t; +typedef s64 compat_loff_t; +typedef u16 compat_nlink_t; +typedef u16 compat_ipc_pid_t; +typedef s32 compat_daddr_t; +typedef u32 compat_caddr_t; +typedef __kernel_fsid_t compat_fsid_t; +typedef s32 compat_timer_t; +typedef s32 compat_key_t; + +typedef s32 compat_int_t; +typedef s32 compat_long_t; +typedef s64 __attribute__((aligned(4))) compat_s64; +typedef u32 compat_uint_t; +typedef u32 compat_ulong_t; +typedef u64 __attribute__((aligned(4))) compat_u64; + +struct compat_timespec { + compat_time_t tv_sec; + s32 tv_nsec; +}; + +struct compat_timeval { + compat_time_t tv_sec; + s32 tv_usec; +}; + +struct compat_stat { + compat_dev_t st_dev; + u16 __pad1; + compat_ino_t st_ino; + compat_mode_t st_mode; + compat_nlink_t st_nlink; + __compat_uid_t st_uid; + __compat_gid_t st_gid; + 
compat_dev_t st_rdev; + u16 __pad2; + u32 st_size; + u32 st_blksize; + u32 st_blocks; + u32 st_atime; + u32 st_atime_nsec; + u32 st_mtime; + u32 st_mtime_nsec; + u32 st_ctime; + u32 st_ctime_nsec; + u32 __unused4; + u32 __unused5; +}; + +struct compat_flock { + short l_type; + short l_whence; + compat_off_t l_start; + compat_off_t l_len; + compat_pid_t l_pid; +}; + +#define F_GETLK64 12 /* using 'struct flock64' */ +#define F_SETLK64 13 +#define F_SETLKW64 14 + +/* + * IA32 uses 4 byte alignment for 64 bit quantities, + * so we need to pack this structure. + */ +struct compat_flock64 { + short l_type; + short l_whence; + compat_loff_t l_start; + compat_loff_t l_len; + compat_pid_t l_pid; +} __attribute__((packed)); + +struct compat_statfs { + int f_type; + int f_bsize; + int f_blocks; + int f_bfree; + int f_bavail; + int f_files; + int f_ffree; + compat_fsid_t f_fsid; + int f_namelen; /* SunOS ignores this field. */ + int f_frsize; + int f_spare[5]; +}; + +#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff +#define COMPAT_RLIM_INFINITY 0xffffffff + +typedef u32 compat_old_sigset_t; /* at least 32 bits */ + +#define _COMPAT_NSIG 64 +#define _COMPAT_NSIG_BPW 32 + +typedef u32 compat_sigset_word; + +#define COMPAT_OFF_T_MAX 0x7fffffff +#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL + +struct compat_ipc64_perm { + compat_key_t key; + __compat_uid32_t uid; + __compat_gid32_t gid; + __compat_uid32_t cuid; + __compat_gid32_t cgid; + unsigned short mode; + unsigned short __pad1; + unsigned short seq; + unsigned short __pad2; + compat_ulong_t unused1; + compat_ulong_t unused2; +}; + +struct compat_semid64_ds { + struct compat_ipc64_perm sem_perm; + compat_time_t sem_otime; + compat_ulong_t __unused1; + compat_time_t sem_ctime; + compat_ulong_t __unused2; + compat_ulong_t sem_nsems; + compat_ulong_t __unused3; + compat_ulong_t __unused4; +}; + +struct compat_msqid64_ds { + struct compat_ipc64_perm msg_perm; + compat_time_t msg_stime; + compat_ulong_t __unused1; + compat_time_t msg_rtime; + compat_ulong_t __unused2; + compat_time_t msg_ctime; + compat_ulong_t __unused3; + compat_ulong_t msg_cbytes; + compat_ulong_t msg_qnum; + compat_ulong_t msg_qbytes; + compat_pid_t msg_lspid; + compat_pid_t msg_lrpid; + compat_ulong_t __unused4; + compat_ulong_t __unused5; +}; + +struct compat_shmid64_ds { + struct compat_ipc64_perm shm_perm; + compat_size_t shm_segsz; + compat_time_t shm_atime; + compat_ulong_t __unused1; + compat_time_t shm_dtime; + compat_ulong_t __unused2; + compat_time_t shm_ctime; + compat_ulong_t __unused3; + compat_pid_t shm_cpid; + compat_pid_t shm_lpid; + compat_ulong_t shm_nattch; + compat_ulong_t __unused4; + compat_ulong_t __unused5; +}; + +/* + * The type of struct elf_prstatus.pr_reg in compatible core dumps. + */ +typedef struct user_regs_struct32 compat_elf_gregset_t; + +/* + * A pointer passed in from user mode. This should not + * be used for syscall parameters, just declare them + * as pointers because the syscall entry code will have + * appropriately converted them already. 
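A 64-bit handler typically widens such a pointer with compat_ptr(), defined just below; a hedged sketch with an invented argument:

	/* sketch: consume a pointer handed in by a 32-bit task */
	static int compat_copy_sketch(compat_uptr_t uptr, unsigned long n)
	{
		void __user *up = compat_ptr(uptr);	/* u32 -> 64-bit __user pointer */
		char buf[64];

		if (n > sizeof(buf) || copy_from_user(buf, up, n))
			return -EFAULT;
		return 0;
	}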
+ */ +typedef u32 compat_uptr_t; + +static inline void __user *compat_ptr(compat_uptr_t uptr) +{ + return (void __user *)(unsigned long)uptr; +} + +static inline compat_uptr_t ptr_to_compat(void __user *uptr) +{ + return (u32)(unsigned long)uptr; +} + +static inline void __user *compat_alloc_user_space(long len) +{ + struct pt_regs *regs = task_pt_regs(current); + return (void __user *)regs->sp - len; +} + +static inline int is_compat_task(void) +{ + return current_thread_info()->status & TS_COMPAT; +} + +#endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h new file mode 100644 index 000000000000..bae482df6039 --- /dev/null +++ b/arch/x86/include/asm/cpu.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_CPU_H +#define _ASM_X86_CPU_H + +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/topology.h> +#include <linux/nodemask.h> +#include <linux/percpu.h> + +struct x86_cpu { + struct cpu cpu; +}; + +#ifdef CONFIG_HOTPLUG_CPU +extern int arch_register_cpu(int num); +extern void arch_unregister_cpu(int); +#endif + +DECLARE_PER_CPU(int, cpu_state); +#endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h new file mode 100644 index 000000000000..f73e95d75b45 --- /dev/null +++ b/arch/x86/include/asm/cpufeature.h @@ -0,0 +1,271 @@ +/* + * Defines x86 CPU feature bits + */ +#ifndef _ASM_X86_CPUFEATURE_H +#define _ASM_X86_CPUFEATURE_H + +#include <asm/required-features.h> + +#define NCAPINTS 9 /* N 32-bit words worth of info */ + +/* + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. + */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ +#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN (0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */ +#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM (0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */ +#define 
X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP (1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX (1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */ +/* cpu types for specific tunings: */ +#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ +#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ +#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ +#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID (4*32+10) /* Context ID */ +#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ +#define X86_FEATURE_AES (4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ +#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ + +/* + * Auxiliary flags: Linux defined - For features scattered in various + * CPUID levels like 0x6, 0xA etc + */ +#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ + +/* Virtualization flags: Linux defined */ +#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +#include <linux/bitops.h> + +extern const char * const x86_cap_flags[NCAPINTS*32]; 
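These capability words are normally consulted through the cpu_has()/boot_cpu_has() helpers defined below rather than read directly; a minimal sketch:

	/* sketch: runtime dispatch on a CPUID feature bit */
	static int have_sse2_sketch(void)
	{
		return boot_cpu_has(X86_FEATURE_XMM2);	/* SSE2 = "xmm2", word 0, bit 26 */
	}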
+extern const char * const x86_power_flags[32]; + +#define test_cpu_cap(c, bit) \ + test_bit(bit, (unsigned long *)((c)->x86_capability)) + +#define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && \ + ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ + (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ + (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ + (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \ + (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ + (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ + (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ + ? 1 : \ + test_cpu_cap(c, bit)) + +#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) + +#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) +#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) +#define setup_clear_cpu_cap(bit) do { \ + clear_cpu_cap(&boot_cpu_data, bit); \ + set_bit(bit, (unsigned long *)cleared_cpu_caps); \ +} while (0) +#define setup_force_cpu_cap(bit) do { \ + set_cpu_cap(&boot_cpu_data, bit); \ + clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ +} while (0) + +#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) +#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME) +#define cpu_has_de boot_cpu_has(X86_FEATURE_DE) +#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) +#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) +#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE) +#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) +#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) +#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP) +#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR) +#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX) +#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) +#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) +#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) +#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) +#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) +#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) +#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) +#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR) +#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR) +#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR) +#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE) +#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN) +#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT) +#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN) +#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2) +#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN) +#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE) +#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN) +#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM) +#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) +#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) +#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) +#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) +#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) +#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) +#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) +#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) +#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) +#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) +#define cpu_has_xsave 
boot_cpu_has(X86_FEATURE_XSAVE) + +#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) +# define cpu_has_invlpg 1 +#else +# define cpu_has_invlpg (boot_cpu_data.x86 > 3) +#endif + +#ifdef CONFIG_X86_64 + +#undef cpu_has_vme +#define cpu_has_vme 0 + +#undef cpu_has_pae +#define cpu_has_pae ___BUG___ + +#undef cpu_has_mp +#define cpu_has_mp 1 + +#undef cpu_has_k6_mtrr +#define cpu_has_k6_mtrr 0 + +#undef cpu_has_cyrix_arr +#define cpu_has_cyrix_arr 0 + +#undef cpu_has_centaur_mcr +#define cpu_has_centaur_mcr 0 + +#endif /* CONFIG_X86_64 */ + +#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ + +#endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h new file mode 100644 index 000000000000..6d68ad7e0ea3 --- /dev/null +++ b/arch/x86/include/asm/cputime.h @@ -0,0 +1 @@ +#include <asm-generic/cputime.h> diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h new file mode 100644 index 000000000000..0930b4f8d672 --- /dev/null +++ b/arch/x86/include/asm/current.h @@ -0,0 +1,39 @@ +#ifndef _ASM_X86_CURRENT_H +#define _ASM_X86_CURRENT_H + +#ifdef CONFIG_X86_32 +#include <linux/compiler.h> +#include <asm/percpu.h> + +struct task_struct; + +DECLARE_PER_CPU(struct task_struct *, current_task); +static __always_inline struct task_struct *get_current(void) +{ + return x86_read_percpu(current_task); +} + +#else /* X86_32 */ + +#ifndef __ASSEMBLY__ +#include <asm/pda.h> + +struct task_struct; + +static __always_inline struct task_struct *get_current(void) +{ + return read_pda(pcurrent); +} + +#else /* __ASSEMBLY__ */ + +#include <asm/asm-offsets.h> +#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg + +#endif /* __ASSEMBLY__ */ + +#endif /* X86_32 */ + +#define current get_current() + +#endif /* _ASM_X86_CURRENT_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h new file mode 100644 index 000000000000..3ea6f37be9e2 --- /dev/null +++ b/arch/x86/include/asm/debugreg.h @@ -0,0 +1,70 @@ +#ifndef _ASM_X86_DEBUGREG_H +#define _ASM_X86_DEBUGREG_H + + +/* Indicate the register numbers for a number of the specific + debug registers. Registers 0-3 contain the addresses we wish to trap on */ +#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */ +#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */ + +#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */ +#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */ + +/* Define a few things for the status register. We can use this to determine + which debugging register was responsible for the trap. The other bits + are either reserved or not of interest to us. */ + +#define DR_TRAP0 (0x1) /* db0 */ +#define DR_TRAP1 (0x2) /* db1 */ +#define DR_TRAP2 (0x4) /* db2 */ +#define DR_TRAP3 (0x8) /* db3 */ + +#define DR_STEP (0x4000) /* single-step */ +#define DR_SWITCH (0x8000) /* task switch */ + +/* Now define a bunch of things for manipulating the control register. 
The top two bytes of the control register consist of 4 fields of 4 + bits - each field corresponds to one of the four debug registers, + and indicates what types of access we trap on, and how large the data + field is that we are looking at */ + +#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */ +#define DR_CONTROL_SIZE 4 /* 4 control bits per register */ + +#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */ +#define DR_RW_WRITE (0x1) +#define DR_RW_READ (0x3) + +#define DR_LEN_1 (0x0) /* Settings for data length to trap on */ +#define DR_LEN_2 (0x4) +#define DR_LEN_4 (0xC) +#define DR_LEN_8 (0x8) + +/* The low byte of the control register determines which registers are + enabled. There are 4 fields of two bits. One bit is "local", meaning + that the processor will reset the bit after a task switch, and the other + is "global", meaning that we have to reset the bit explicitly. With Linux, + you can use either one, since we explicitly zero the register when we enter + kernel mode. */ + +#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ +#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ +#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ + +#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ +#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */ + +/* The second byte of the control register has a few special things. + We can slow the instruction pipeline for instructions coming via the + gdt or the ldt if we want to. I am not sure why this is an advantage. */ + +#ifdef __i386__ +#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */ +#else +#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */ +#endif + +#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ +#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ + +#endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h new file mode 100644 index 000000000000..409a649204aa --- /dev/null +++ b/arch/x86/include/asm/delay.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_DELAY_H +#define _ASM_X86_DELAY_H + +/* + * Copyright (C) 1993 Linus Torvalds + * + * Delay routines calling functions in arch/x86/lib/delay.c + */ + +/* Undefined functions to get compile-time errors */ +extern void __bad_udelay(void); +extern void __bad_ndelay(void); + +extern void __udelay(unsigned long usecs); +extern void __ndelay(unsigned long nsecs); +extern void __const_udelay(unsigned long xloops); +extern void __delay(unsigned long loops); + +/* 0x10c7 is 2**32 / 1000000 (rounded up) */ +#define udelay(n) (__builtin_constant_p(n) ? \ + ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \ + __udelay(n)) + +/* 0x5 is 2**32 / 1000000000 (rounded up) */ +#define ndelay(n) (__builtin_constant_p(n) ? \ + ((n) > 20000 ? 
__bad_ndelay() : __const_udelay((n) * 5ul)) : \ + __ndelay(n)) + +void use_tsc_delay(void); + +#endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h new file mode 100644 index 000000000000..e6b82b17b072 --- /dev/null +++ b/arch/x86/include/asm/desc.h @@ -0,0 +1,409 @@ +#ifndef _ASM_X86_DESC_H +#define _ASM_X86_DESC_H + +#ifndef __ASSEMBLY__ +#include <asm/desc_defs.h> +#include <asm/ldt.h> +#include <asm/mmu.h> +#include <linux/smp.h> + +static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +{ + desc->limit0 = info->limit & 0x0ffff; + desc->base0 = info->base_addr & 0x0000ffff; + + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; + desc->s = 1; + desc->dpl = 0x3; + desc->p = info->seg_not_present ^ 1; + desc->limit = (info->limit & 0xf0000) >> 16; + desc->avl = info->useable; + desc->d = info->seg_32bit; + desc->g = info->limit_in_pages; + desc->base2 = (info->base_addr & 0xff000000) >> 24; + /* + * Don't allow setting of the lm bit. It is useless anyway + * because 64bit system calls require __USER_CS: + */ + desc->l = 0; +} + +extern struct desc_ptr idt_descr; +extern gate_desc idt_table[]; + +struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU(struct gdt_page, gdt_page); + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return per_cpu(gdt_page, cpu).gdt; +} + +#ifdef CONFIG_X86_64 + +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate->offset_low = PTR_LOW(func); + gate->segment = __KERNEL_CS; + gate->ist = ist; + gate->p = 1; + gate->dpl = dpl; + gate->zero0 = 0; + gate->zero1 = 0; + gate->type = type; + gate->offset_middle = PTR_MIDDLE(func); + gate->offset_high = PTR_HIGH(func); +} + +#else +static inline void pack_gate(gate_desc *gate, unsigned char type, + unsigned long base, unsigned dpl, unsigned flags, + unsigned short seg) +{ + gate->a = (seg << 16) | (base & 0xffff); + gate->b = (base & 0xffff0000) | + (((0x80 | type | (dpl << 5)) & 0xff) << 8); +} + +#endif + +static inline int desc_empty(const void *ptr) +{ + const u32 *desc = ptr; + return !(desc[0] | desc[1]); +} + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) + +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) +#define store_ldt(ldt) asm("sldt %0":"=m" (ldt)) + +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt + +#define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +#define write_gdt_entry(dt, entry, desc, type) \ + native_write_gdt_entry(dt, entry, desc, type) +#define write_idt_entry(dt, entry, g) \ + native_write_idt_entry(dt, entry, g) + +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ +} +#endif /* CONFIG_PARAVIRT */ + +static inline void native_write_idt_entry(gate_desc *idt, int entry, + const gate_desc *gate) +{ + memcpy(&idt[entry], gate, sizeof(*gate)); 
+} + +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc) +{ + memcpy(&ldt[entry], desc, 8); +} + +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type) +{ + unsigned int size; + switch (type) { + case DESC_TSS: + size = sizeof(tss_desc); + break; + case DESC_LDT: + size = sizeof(ldt_desc); + break; + default: + size = sizeof(struct desc_struct); + break; + } + memcpy(&gdt[entry], desc, size); +} + +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, + unsigned long limit, unsigned char type, + unsigned char flags) +{ + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + (limit & 0x000f0000) | ((type & 0xff) << 8) | + ((flags & 0xf) << 20); + desc->p = 1; +} + + +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) +{ +#ifdef CONFIG_X86_64 + struct ldttss_desc64 *desc = d; + memset(desc, 0, sizeof(*desc)); + desc->limit0 = size & 0xFFFF; + desc->base0 = PTR_LOW(addr); + desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->type = type; + desc->p = 1; + desc->limit1 = (size >> 16) & 0xF; + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; + desc->base3 = PTR_HIGH(addr); +#else + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); +#endif +} + +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + tss_desc tss; + + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + + sizeof(unsigned long) - 1); + write_gdt_entry(d, entry, &tss, DESC_TSS); +} + +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + +static inline void native_set_ldt(const void *addr, unsigned int entries) +{ + if (likely(entries == 0)) + asm volatile("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + ldt_desc ldt; + + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, + entries * LDT_ENTRY_SIZE - 1); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + &ldt, DESC_LDT); + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } +} + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + asm volatile("str %0":"=r" (tr)); + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} + +#define _LDT_empty(info) \ + ((info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + 
(info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +static inline void clear_LDT(void) +{ + set_ldt(NULL, 0); +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock(mm_context_t *pc) +{ + set_ldt(pc->ldt, pc->size); +} + +static inline void load_LDT(mm_context_t *pc) +{ + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); +} + +static inline unsigned long get_desc_base(const struct desc_struct *desc) +{ + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->limit0 | (desc->limit << 16); +} + +static inline void _set_gate(int gate, unsigned type, void *addr, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate_desc s; + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); + /* + * does not need to be atomic because it is only done once at + * setup time + */ + write_idt_entry(idt_table, gate, &s); +} + +/* + * This needs to use 'idt_table' rather than 'idt', and + * thus use the _nonmapped_ version of the IDT, as the + * Pentium F0 0F bugfix can have resulted in the mapped + * IDT being write-protected. + */ +static inline void set_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); +} + +#define SYS_VECTOR_FREE 0 +#define SYS_VECTOR_ALLOCED 1 + +extern int first_system_vector; +extern char system_vectors[]; + +static inline void alloc_system_vector(int vector) +{ + if (system_vectors[vector] == SYS_VECTOR_FREE) { + system_vectors[vector] = SYS_VECTOR_ALLOCED; + if (first_system_vector > vector) + first_system_vector = vector; + } else + BUG(); +} + +static inline void alloc_intr_gate(unsigned int n, void *addr) +{ + alloc_system_vector(n); + set_intr_gate(n, addr); +} + +/* + * This routine sets up an interrupt gate at directory privilege level 3. + */ +static inline void set_system_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_system_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); +} + +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); +} + +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); +} + +static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); +} + +#else +/* + * GET_DESC_BASE reads the descriptor base of the specified segment. 
+ * + * Args: + * idx - descriptor index + * gdt - GDT pointer + * base - 32bit register to which the base will be written + * lo_w - lo word of the "base" register + * lo_b - lo byte of the "base" register + * hi_b - hi byte of the low word of the "base" register + * + * Example: + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. + */ +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ + movb idx * 8 + 4(gdt), lo_b; \ + movb idx * 8 + 7(gdt), hi_b; \ + shll $16, base; \ + movw idx * 8 + 2(gdt), lo_w; + + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h new file mode 100644 index 000000000000..a6adefa28b94 --- /dev/null +++ b/arch/x86/include/asm/desc_defs.h @@ -0,0 +1,95 @@ +/* Written 2000 by Andi Kleen */ +#ifndef _ASM_X86_DESC_DEFS_H +#define _ASM_X86_DESC_DEFS_H + +/* + * Segment descriptor structure definitions, usable from both x86_64 and i386 + * archs. + */ + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> + +/* + * FIXME: Accessing the desc_struct through its fields is more elegant, + * and should be the one valid thing to do. However, a lot of open code + * still touches the a and b accessors, and doing this allows us to do it + * incrementally. We keep the signature as a struct, rather than a union, + * so we can get rid of it transparently in the future -- glommer + */ +/* 8 byte segment descriptor */ +struct desc_struct { + union { + struct { + unsigned int a; + unsigned int b; + }; + struct { + u16 limit0; + u16 base0; + unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; + unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; + }; + }; +} __attribute__((packed)); + +enum { + GATE_INTERRUPT = 0xE, + GATE_TRAP = 0xF, + GATE_CALL = 0xC, + GATE_TASK = 0x5, +}; + +/* 16byte gate */ +struct gate_struct64 { + u16 offset_low; + u16 segment; + unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; + u16 offset_middle; + u32 offset_high; + u32 zero1; +} __attribute__((packed)); + +#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) +#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) +#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) + +enum { + DESC_TSS = 0x9, + DESC_LDT = 0x2, + DESCTYPE_S = 0x10, /* !system */ +}; + +/* LDT or TSS descriptor in the GDT. 16 bytes. 
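The two views in the desc_struct union above alias the same eight bytes, which is what lets legacy a/b code and bitfield code coexist; an illustrative sketch:

	/* sketch: both views of desc_struct touch the same storage */
	static void desc_views_sketch(struct desc_struct *d)
	{
		d->a = 0x0000ffff;	/* legacy accessor: low dword */
		/* equivalently, via the bitfield view: */
		d->limit0 = 0xffff;	/* limit bits 0..15 */
		d->base0 = 0;		/* base bits 0..15 */
	}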
*/ +struct ldttss_desc64 { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 5, dpl : 2, p : 1; + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + u32 base3; + u32 zero1; +} __attribute__((packed)); + +#ifdef CONFIG_X86_64 +typedef struct gate_struct64 gate_desc; +typedef struct ldttss_desc64 ldt_desc; +typedef struct ldttss_desc64 tss_desc; +#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) +#define gate_segment(g) ((g).segment) +#else +typedef struct desc_struct gate_desc; +typedef struct desc_struct ldt_desc; +typedef struct desc_struct tss_desc; +#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) +#define gate_segment(g) ((g).a >> 16) +#endif + +struct desc_ptr { + unsigned short size; + unsigned long address; +} __attribute__((packed)) ; + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_DESC_DEFS_H */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h new file mode 100644 index 000000000000..3c034f48fdb0 --- /dev/null +++ b/arch/x86/include/asm/device.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_DEVICE_H +#define _ASM_X86_DEVICE_H + +struct dev_archdata { +#ifdef CONFIG_ACPI + void *acpi_handle; +#endif +#ifdef CONFIG_X86_64 +struct dma_mapping_ops *dma_ops; +#endif +#ifdef CONFIG_DMAR + void *iommu; /* hook for IOMMU specific extension */ +#endif +}; + +#endif /* _ASM_X86_DEVICE_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h new file mode 100644 index 000000000000..9a2d644c08ef --- /dev/null +++ b/arch/x86/include/asm/div64.h @@ -0,0 +1,60 @@ +#ifndef _ASM_X86_DIV64_H +#define _ASM_X86_DIV64_H + +#ifdef CONFIG_X86_32 + +#include <linux/types.h> + +/* + * do_div() is NOT a C function. It wants to return + * two values (the quotient and the remainder), but + * since that doesn't work very well in C, what it + * does is: + * + * - modifies the 64-bit dividend _in_place_ + * - returns the 32-bit remainder + * + * This ends up being the most efficient "calling + * convention" on x86. + */ +#define do_div(n, base) \ +({ \ + unsigned long __upper, __low, __high, __mod, __base; \ + __base = (base); \ + asm("":"=a" (__low), "=d" (__high) : "A" (n)); \ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + asm("divl %2":"=a" (__low), "=d" (__mod) \ + : "rm" (__base), "0" (__low), "1" (__upper)); \ + asm("":"=A" (n) : "a" (__low), "d" (__high)); \ + __mod; \ +}) + +static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) +{ + union { + u64 v64; + u32 v32[2]; + } d = { dividend }; + u32 upper; + + upper = d.v32[1]; + d.v32[1] = 0; + if (upper >= divisor) { + d.v32[1] = upper / divisor; + upper %= divisor; + } + asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) : + "rm" (divisor), "0" (d.v32[0]), "1" (upper)); + return d.v64; +} +#define div_u64_rem div_u64_rem + +#else +# include <asm-generic/div64.h> +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_DIV64_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h new file mode 100644 index 000000000000..4a5397bfce27 --- /dev/null +++ b/arch/x86/include/asm/dma-mapping.h @@ -0,0 +1,308 @@ +#ifndef _ASM_X86_DMA_MAPPING_H +#define _ASM_X86_DMA_MAPPING_H + +/* + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for + * documentation. 
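The in-place convention of do_div() above is the part that trips people up; a hedged usage sketch (the function name is invented):

	/* sketch: split a nanosecond count into seconds and remainder */
	static u32 ns_split_sketch(u64 *ns)
	{
		u32 rem = do_div(*ns, 1000000000u);	/* *ns becomes the quotient */

		return rem;	/* do_div() returns the 32-bit remainder */
	}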
+ */ + +#include <linux/scatterlist.h> +#include <asm/io.h> +#include <asm/swiotlb.h> +#include <asm-generic/dma-coherent.h> + +extern dma_addr_t bad_dma_address; +extern int iommu_merge; +extern struct device x86_dma_fallback_dev; +extern int panic_on_overflow; + +struct dma_mapping_ops { + int (*mapping_error)(struct device *dev, + dma_addr_t dma_addr); + void* (*alloc_coherent)(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp); + void (*free_coherent)(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle); + dma_addr_t (*map_single)(struct device *hwdev, phys_addr_t ptr, + size_t size, int direction); + void (*unmap_single)(struct device *dev, dma_addr_t addr, + size_t size, int direction); + void (*sync_single_for_cpu)(struct device *hwdev, + dma_addr_t dma_handle, size_t size, + int direction); + void (*sync_single_for_device)(struct device *hwdev, + dma_addr_t dma_handle, size_t size, + int direction); + void (*sync_single_range_for_cpu)(struct device *hwdev, + dma_addr_t dma_handle, unsigned long offset, + size_t size, int direction); + void (*sync_single_range_for_device)(struct device *hwdev, + dma_addr_t dma_handle, unsigned long offset, + size_t size, int direction); + void (*sync_sg_for_cpu)(struct device *hwdev, + struct scatterlist *sg, int nelems, + int direction); + void (*sync_sg_for_device)(struct device *hwdev, + struct scatterlist *sg, int nelems, + int direction); + int (*map_sg)(struct device *hwdev, struct scatterlist *sg, + int nents, int direction); + void (*unmap_sg)(struct device *hwdev, + struct scatterlist *sg, int nents, + int direction); + int (*dma_supported)(struct device *hwdev, u64 mask); + int is_phys; +}; + +extern struct dma_mapping_ops *dma_ops; + +static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) +{ +#ifdef CONFIG_X86_32 + return dma_ops; +#else + if (unlikely(!dev) || !dev->archdata.dma_ops) + return dma_ops; + else + return dev->archdata.dma_ops; +#endif /* CONFIG_X86_32 */ +} + +/* Make sure we keep the same behaviour */ +static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ +#ifdef CONFIG_X86_32 + return 0; +#else + struct dma_mapping_ops *ops = get_dma_ops(dev); + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); + + return (dma_addr == bad_dma_address); +#endif +} + +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) +#define dma_is_consistent(d, h) (1) + +extern int dma_supported(struct device *hwdev, u64 mask); +extern int dma_set_mask(struct device *dev, u64 mask); + +extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag); + +static inline dma_addr_t +dma_map_single(struct device *hwdev, void *ptr, size_t size, + int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + return ops->map_single(hwdev, virt_to_phys(ptr), size, direction); +} + +static inline void +dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, + int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->unmap_single) + ops->unmap_single(dev, addr, size, direction); +} + +static inline int +dma_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + return
ops->map_sg(hwdev, sg, nents, direction); +} + +static inline void +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, + int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->unmap_sg) + ops->unmap_sg(hwdev, sg, nents, direction); +} + +static inline void +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(hwdev, dma_handle, size, direction); + flush_write_buffers(); +} + +static inline void +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->sync_single_for_device) + ops->sync_single_for_device(hwdev, dma_handle, size, direction); + flush_write_buffers(); +} + +static inline void +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle, + unsigned long offset, size_t size, int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->sync_single_range_for_cpu) + ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, + size, direction); + flush_write_buffers(); +} + +static inline void +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle, + unsigned long offset, size_t size, + int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->sync_single_range_for_device) + ops->sync_single_range_for_device(hwdev, dma_handle, + offset, size, direction); + flush_write_buffers(); +} + +static inline void +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(hwdev, sg, nelems, direction); + flush_write_buffers(); +} + +static inline void +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(hwdev); + + BUG_ON(!valid_dma_direction(direction)); + if (ops->sync_sg_for_device) + ops->sync_sg_for_device(hwdev, sg, nelems, direction); + + flush_write_buffers(); +} + +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction) +{ + struct dma_mapping_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(direction)); + return ops->map_single(dev, page_to_phys(page) + offset, + size, direction); +} + +static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction) +{ + dma_unmap_single(dev, addr, size, direction); +} + +static inline void +dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + +static inline int dma_get_cache_alignment(void) +{ + /* no easy way to get cache size on all x86, so return the + * maximum possible, to be safe */ + return boot_cpu_data.x86_clflush_size; +} + +static inline unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? 
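+	/* no mask configured by the driver: assume ISA (16MB) when GFP_DMA is set, otherwise a full 32-bit device */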
DMA_24BIT_MASK : DMA_32BIT_MASK; + + return dma_mask; +} + +static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) +{ +#ifdef CONFIG_X86_64 + unsigned long dma_mask = dma_alloc_coherent_mask(dev, gfp); + + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) + gfp |= GFP_DMA32; +#endif + return gfp; +} + +static inline void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp) +{ + struct dma_mapping_ops *ops = get_dma_ops(dev); + void *memory; + + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) + return memory; + + if (!dev) { + dev = &x86_dma_fallback_dev; + gfp |= GFP_DMA; + } + + if (!is_device_dma_capable(dev)) + return NULL; + + if (!ops->alloc_coherent) + return NULL; + + return ops->alloc_coherent(dev, size, dma_handle, + dma_alloc_coherent_gfp_flags(dev, gfp)); +} + +static inline void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) +{ + struct dma_mapping_ops *ops = get_dma_ops(dev); + + WARN_ON(irqs_disabled()); /* for portability */ + + if (dma_release_from_coherent(dev, get_order(size), vaddr)) + return; + + if (ops->free_coherent) + ops->free_coherent(dev, size, vaddr, bus); +} + +#endif diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h new file mode 100644 index 000000000000..ca1098a7e580 --- /dev/null +++ b/arch/x86/include/asm/dma.h @@ -0,0 +1,318 @@ +/* + * linux/include/asm/dma.h: Defines for using and allocating dma channels. + * Written by Hennus Bergman, 1992. + * High DMA channel support & info by Hannu Savolainen + * and John Boyd, Nov. 1992. + */ + +#ifndef _ASM_X86_DMA_H +#define _ASM_X86_DMA_H + +#include <linux/spinlock.h> /* And spinlocks */ +#include <asm/io.h> /* need byte IO */ +#include <linux/delay.h> + +#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER +#define dma_outb outb_p +#else +#define dma_outb outb +#endif + +#define dma_inb inb + +/* + * NOTES about DMA transfers: + * + * controller 1: channels 0-3, byte operations, ports 00-1F + * controller 2: channels 4-7, word operations, ports C0-DF + * + * - ALL registers are 8 bits only, regardless of transfer size + * - channel 4 is not used - cascades 1 into 2. + * - channels 0-3 are byte - addresses/counts are for physical bytes + * - channels 5-7 are word - addresses/counts are for physical words + * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries + * - transfer count loaded to registers is 1 less than actual count + * - controller 2 offsets are all even (2x offsets for controller 1) + * - page registers for 5-7 don't use data bit 0, represent 128K pages + * - page registers for 0-3 use bit 0, represent 64K pages + * + * DMA transfers are limited to the lower 16MB of _physical_ memory. + * Note that addresses loaded into registers must be _physical_ addresses, + * not logical addresses (which may differ if paging is active). + * + * Address mapping for channels 0-3: + * + * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses) + * | ... | | ... | | ... | + * | ... | | ... | | ... | + * | ... | | ... | | ... | + * P7 ... P0 A7 ... A0 A7 ... A0 + * | Page | Addr MSB | Addr LSB | (DMA registers) + * + * Address mapping for channels 5-7: + * + * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses) + * | ... | \ \ ... \ \ \ ... \ \ + * | ... | \ \ ... \ \ \ ... \ (not used) + * | ... | \ \ ... \ \ \ ... \ + * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... 
A0 + * | Page | Addr MSB | Addr LSB | (DMA registers) + * + * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses + * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at + * the hardware level, so odd-byte transfers aren't possible). + * + * Transfer count (_not # bytes_) is limited to 64K, represented as actual + * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more, + * and up to 128K bytes may be transferred on channels 5-7 in one operation. + * + */ + +#define MAX_DMA_CHANNELS 8 + +#ifdef CONFIG_X86_32 + +/* The maximum address that we can perform a DMA transfer to on this platform */ +#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000) + +#else + +/* 16MB ISA DMA zone */ +#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) + +/* 4GB broken PCI/AGP hardware bus master zone */ +#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) + +/* Compat define for old dma zone */ +#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) + +#endif + +/* 8237 DMA controllers */ +#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */ +#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */ + +/* DMA controller registers */ +#define DMA1_CMD_REG 0x08 /* command register (w) */ +#define DMA1_STAT_REG 0x08 /* status register (r) */ +#define DMA1_REQ_REG 0x09 /* request register (w) */ +#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */ +#define DMA1_MODE_REG 0x0B /* mode register (w) */ +#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */ +#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */ +#define DMA1_RESET_REG 0x0D /* Master Clear (w) */ +#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */ +#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */ + +#define DMA2_CMD_REG 0xD0 /* command register (w) */ +#define DMA2_STAT_REG 0xD0 /* status register (r) */ +#define DMA2_REQ_REG 0xD2 /* request register (w) */ +#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */ +#define DMA2_MODE_REG 0xD6 /* mode register (w) */ +#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */ +#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */ +#define DMA2_RESET_REG 0xDA /* Master Clear (w) */ +#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */ +#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */ + +#define DMA_ADDR_0 0x00 /* DMA address registers */ +#define DMA_ADDR_1 0x02 +#define DMA_ADDR_2 0x04 +#define DMA_ADDR_3 0x06 +#define DMA_ADDR_4 0xC0 +#define DMA_ADDR_5 0xC4 +#define DMA_ADDR_6 0xC8 +#define DMA_ADDR_7 0xCC + +#define DMA_CNT_0 0x01 /* DMA count registers */ +#define DMA_CNT_1 0x03 +#define DMA_CNT_2 0x05 +#define DMA_CNT_3 0x07 +#define DMA_CNT_4 0xC2 +#define DMA_CNT_5 0xC6 +#define DMA_CNT_6 0xCA +#define DMA_CNT_7 0xCE + +#define DMA_PAGE_0 0x87 /* DMA page registers */ +#define DMA_PAGE_1 0x83 +#define DMA_PAGE_2 0x81 +#define DMA_PAGE_3 0x82 +#define DMA_PAGE_5 0x8B +#define DMA_PAGE_6 0x89 +#define DMA_PAGE_7 0x8A + +/* I/O to memory, no autoinit, increment, single mode */ +#define DMA_MODE_READ 0x44 +/* memory to I/O, no autoinit, increment, single mode */ +#define DMA_MODE_WRITE 0x48 +/* pass thru DREQ->HRQ, DACK<-HLDA only */ +#define DMA_MODE_CASCADE 0xC0 + +#define DMA_AUTOINIT 0x10 + + +extern spinlock_t dma_spin_lock; + +static inline unsigned long claim_dma_lock(void) +{ + unsigned long flags; + spin_lock_irqsave(&dma_spin_lock, flags); + return flags; +} + +static inline void release_dma_lock(unsigned long flags) +{ + 
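+	/* pairs with claim_dma_lock(); flags must be the value it returned */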
spin_unlock_irqrestore(&dma_spin_lock, flags); +} + +/* enable/disable a specific DMA channel */ +static inline void enable_dma(unsigned int dmanr) +{ + if (dmanr <= 3) + dma_outb(dmanr, DMA1_MASK_REG); + else + dma_outb(dmanr & 3, DMA2_MASK_REG); +} + +static inline void disable_dma(unsigned int dmanr) +{ + if (dmanr <= 3) + dma_outb(dmanr | 4, DMA1_MASK_REG); + else + dma_outb((dmanr & 3) | 4, DMA2_MASK_REG); +} + +/* Clear the 'DMA Pointer Flip Flop'. + * Write 0 for LSB/MSB, 1 for MSB/LSB access. + * Use this once to initialize the FF to a known state. + * After that, keep track of it. :-) + * --- In order to do that, the DMA routines below should --- + * --- only be used while holding the DMA lock ! --- + */ +static inline void clear_dma_ff(unsigned int dmanr) +{ + if (dmanr <= 3) + dma_outb(0, DMA1_CLEAR_FF_REG); + else + dma_outb(0, DMA2_CLEAR_FF_REG); +} + +/* set mode (above) for a specific DMA channel */ +static inline void set_dma_mode(unsigned int dmanr, char mode) +{ + if (dmanr <= 3) + dma_outb(mode | dmanr, DMA1_MODE_REG); + else + dma_outb(mode | (dmanr & 3), DMA2_MODE_REG); +} + +/* Set only the page register bits of the transfer address. + * This is used for successive transfers when we know the contents of + * the lower 16 bits of the DMA current address register, but a 64k boundary + * may have been crossed. + */ +static inline void set_dma_page(unsigned int dmanr, char pagenr) +{ + switch (dmanr) { + case 0: + dma_outb(pagenr, DMA_PAGE_0); + break; + case 1: + dma_outb(pagenr, DMA_PAGE_1); + break; + case 2: + dma_outb(pagenr, DMA_PAGE_2); + break; + case 3: + dma_outb(pagenr, DMA_PAGE_3); + break; + case 5: + dma_outb(pagenr & 0xfe, DMA_PAGE_5); + break; + case 6: + dma_outb(pagenr & 0xfe, DMA_PAGE_6); + break; + case 7: + dma_outb(pagenr & 0xfe, DMA_PAGE_7); + break; + } +} + + +/* Set transfer address & page bits for specific DMA channel. + * Assumes dma flipflop is clear. + */ +static inline void set_dma_addr(unsigned int dmanr, unsigned int a) +{ + set_dma_page(dmanr, a>>16); + if (dmanr <= 3) { + dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE); + dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE); + } else { + dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE); + dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE); + } +} + + +/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for + * a specific DMA channel. + * You must ensure the parameters are valid. + * NOTE: from a manual: "the number of transfers is one more + * than the initial word count"! This is taken into account. + * Assumes dma flip-flop is clear. + * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7. + */ +static inline void set_dma_count(unsigned int dmanr, unsigned int count) +{ + count--; + if (dmanr <= 3) { + dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE); + dma_outb((count >> 8) & 0xff, + ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE); + } else { + dma_outb((count >> 1) & 0xff, + ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE); + dma_outb((count >> 9) & 0xff, + ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE); + } +} + + +/* Get DMA residue count. After a DMA transfer, this + * should return zero. Reading this while a DMA transfer is + * still in progress will return unpredictable results. + * If called before the channel has been used, it may return 1. + * Otherwise, it returns the number of _bytes_ left to transfer. + * + * Assumes DMA flip-flop is clear. 
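+ *
+ * For context, a complete single-transfer setup built from the helpers
+ * above looks roughly like this (chan, addr and len are placeholders):
+ *
+ *	flags = claim_dma_lock();
+ *	disable_dma(chan);
+ *	clear_dma_ff(chan);
+ *	set_dma_mode(chan, DMA_MODE_WRITE);
+ *	set_dma_addr(chan, addr);
+ *	set_dma_count(chan, len);
+ *	enable_dma(chan);
+ *	release_dma_lock(flags);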
+ */ +static inline int get_dma_residue(unsigned int dmanr) +{ + unsigned int io_port; + /* using short to get 16-bit wrap around */ + unsigned short count; + + io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE + : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE; + + count = 1 + dma_inb(io_port); + count += dma_inb(io_port) << 8; + + return (dmanr <= 3) ? count : (count << 1); +} + + +/* These are in kernel/dma.c: */ +extern int request_dma(unsigned int dmanr, const char *device_id); +extern void free_dma(unsigned int dmanr); + +/* From PCI */ + +#ifdef CONFIG_PCI +extern int isa_dma_bridge_buggy; +#else +#define isa_dma_bridge_buggy (0) +#endif + +#endif /* _ASM_X86_DMA_H */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h new file mode 100644 index 000000000000..bc68212c6bc0 --- /dev/null +++ b/arch/x86/include/asm/dmi.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_DMI_H +#define _ASM_X86_DMI_H + +#include <asm/io.h> + +#define DMI_MAX_DATA 2048 + +extern int dmi_alloc_index; +extern char dmi_alloc_data[DMI_MAX_DATA]; + +/* This is so early that there is no good way to allocate dynamic memory. + Allocate data in a BSS array. */ +static inline void *dmi_alloc(unsigned len) +{ + int idx = dmi_alloc_index; + if ((dmi_alloc_index + len) > DMI_MAX_DATA) + return NULL; + dmi_alloc_index += len; + return dmi_alloc_data + idx; +} + +/* Use early IO mappings for DMI because it's initialized early */ +#define dmi_ioremap early_ioremap +#define dmi_iounmap early_iounmap + +#endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h new file mode 100644 index 000000000000..72c5a190bf48 --- /dev/null +++ b/arch/x86/include/asm/ds.h @@ -0,0 +1,238 @@ +/* + * Debug Store (DS) support + * + * This provides a low-level interface to the hardware's Debug Store + * feature that is used for branch trace store (BTS) and + * precise-event based sampling (PEBS). + * + * It manages: + * - per-thread and per-cpu allocation of BTS and PEBS + * - buffer memory allocation (optional) + * - buffer overflow handling + * - buffer access + * + * It assumes: + * - get_task_struct on all parameter tasks + * - current is allowed to trace parameter tasks + * + * + * Copyright (C) 2007-2008 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 + */ + +#ifndef _ASM_X86_DS_H +#define _ASM_X86_DS_H + +#ifdef CONFIG_X86_DS + +#include <linux/types.h> +#include <linux/init.h> + + +struct task_struct; + +/* + * Request BTS or PEBS + * + * Due to alignment constraints, the actual buffer may be slightly + * smaller than the requested or provided buffer. + * + * Returns 0 on success; -Eerrno otherwise + * + * task: the task to request recording for; + * NULL for per-cpu recording on the current cpu + * base: the base pointer for the (non-pageable) buffer; + * NULL if buffer allocation requested + * size: the size of the requested or provided buffer + * ovfl: pointer to a function to be called on buffer overflow; + * NULL if cyclic buffer requested + */ +typedef void (*ds_ovfl_callback_t)(struct task_struct *); +extern int ds_request_bts(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl); +extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl); + +/* + * Release BTS or PEBS resources + * + * Frees buffers allocated on ds_request.
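+ *
+ * A hedged request/release sketch (bts_overflow and BTS_BUFFER_SIZE are
+ * placeholders; a NULL base asks the layer to allocate the buffer):
+ *
+ *	error = ds_request_bts(task, NULL, BTS_BUFFER_SIZE, bts_overflow);
+ *	... trace runs, records accumulate ...
+ *	error = ds_release_bts(task);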
+ * + * Returns 0 on success; -Eerrno otherwise + * + * task: the task to release resources for; + * NULL to release resources for the current cpu + */ +extern int ds_release_bts(struct task_struct *task); +extern int ds_release_pebs(struct task_struct *task); + +/* + * Return the (array) index of the write pointer. + * (assuming an array of BTS/PEBS records) + * + * Returns -Eerrno on error + * + * task: the task to access; + * NULL to access the current cpu + * pos (out): if not NULL, will hold the result + */ +extern int ds_get_bts_index(struct task_struct *task, size_t *pos); +extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); + +/* + * Return the (array) index one record beyond the end of the array. + * (assuming an array of BTS/PEBS records) + * + * Returns -Eerrno on error + * + * task: the task to access; + * NULL to access the current cpu + * pos (out): if not NULL, will hold the result + */ +extern int ds_get_bts_end(struct task_struct *task, size_t *pos); +extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); + +/* + * Provide a pointer to the BTS/PEBS record at parameter index. + * (assuming an array of BTS/PEBS records) + * + * The pointer points directly into the buffer. The user is + * responsible for copying the record. + * + * Returns the size of a single record on success; -Eerrno on error + * + * task: the task to access; + * NULL to access the current cpu + * index: the index of the requested record + * record (out): pointer to the requested record + */ +extern int ds_access_bts(struct task_struct *task, + size_t index, const void **record); +extern int ds_access_pebs(struct task_struct *task, + size_t index, const void **record); + +/* + * Write one or more BTS/PEBS records at the write pointer index and + * advance the write pointer. + * + * If size is not a multiple of the record size, trailing bytes are + * zeroed out. + * + * May result in one or more overflow notifications. + * + * If called during overflow handling, that is, with index >= + * interrupt threshold, the write will wrap around. + * + * An overflow notification is given if and when the interrupt + * threshold is reached during or after the write. + * + * Returns the number of bytes written or -Eerrno. + * + * task: the task to access; + * NULL to access the current cpu + * buffer: the buffer to write + * size: the size of the buffer + */ +extern int ds_write_bts(struct task_struct *task, + const void *buffer, size_t size); +extern int ds_write_pebs(struct task_struct *task, + const void *buffer, size_t size); + +/* + * Same as ds_write_bts/pebs, but omit ownership checks. + * + * This is needed to have some other task than the owner of the + * BTS/PEBS buffer or the parameter task itself write into the + * respective buffer. + */ +extern int ds_unchecked_write_bts(struct task_struct *task, + const void *buffer, size_t size); +extern int ds_unchecked_write_pebs(struct task_struct *task, + const void *buffer, size_t size); + +/* + * Reset the write pointer of the BTS/PEBS buffer. + * + * Returns 0 on success; -Eerrno on error + * + * task: the task to access; + * NULL to access the current cpu + */ +extern int ds_reset_bts(struct task_struct *task); +extern int ds_reset_pebs(struct task_struct *task); + +/* + * Clear the BTS/PEBS buffer and reset the write pointer. + * The entire buffer will be zeroed out. 
+ * + * Returns 0 on success; -Eerrno on error + * + * task: the task to access; + * NULL to access the current cpu + */ +extern int ds_clear_bts(struct task_struct *task); +extern int ds_clear_pebs(struct task_struct *task); + +/* + * Provide the PEBS counter reset value. + * + * Returns 0 on success; -Eerrno on error + * + * task: the task to access; + * NULL to access the current cpu + * value (out): the counter reset value + */ +extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); + +/* + * Set the PEBS counter reset value. + * + * Returns 0 on success; -Eerrno on error + * + * task: the task to access; + * NULL to access the current cpu + * value: the new counter reset value + */ +extern int ds_set_pebs_reset(struct task_struct *task, u64 value); + +/* + * Initialization + */ +struct cpuinfo_x86; +extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); + + + +/* + * The DS context - part of struct thread_struct. + */ +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char *ds; + /* the owner of the BTS and PEBS configuration, respectively */ + struct task_struct *owner[2]; + /* buffer overflow notification function for BTS and PEBS */ + ds_ovfl_callback_t callback[2]; + /* the original buffer address */ + void *buffer[2]; + /* the number of allocated pages for on-request allocated buffers */ + unsigned int pages[2]; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; + +/* called by exit_thread() to free leftover contexts */ +extern void ds_free(struct ds_context *context); + +#else /* CONFIG_X86_DS */ + +#define ds_init_intel(config) do {} while (0) + +#endif /* CONFIG_X86_DS */ +#endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h new file mode 100644 index 000000000000..804b6e6be929 --- /dev/null +++ b/arch/x86/include/asm/dwarf2.h @@ -0,0 +1,61 @@ +#ifndef _ASM_X86_DWARF2_H +#define _ASM_X86_DWARF2_H + +#ifndef __ASSEMBLY__ +#warning "asm/dwarf2.h should only be included in pure assembly files" +#endif + +/* + Macros for dwarf2 CFI unwind table entries. + See "as.info" for details on these pseudo ops. Unfortunately + they are only supported in very new binutils, so define them + away for older versions. + */ + +#ifdef CONFIG_AS_CFI + +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA .cfi_def_cfa +#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_OFFSET .cfi_offset +#define CFI_REL_OFFSET .cfi_rel_offset +#define CFI_REGISTER .cfi_register +#define CFI_RESTORE .cfi_restore +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined + +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif + +#else + +/* Due to the structure of pre-existing code, don't use assembler line + comment character # to ignore the arguments. Instead, use a dummy macro.
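 
   The wrappers are invoked with arguments, e.g. (sketch):
 
	CFI_STARTPROC
	pushl %ebx
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ebx, 0
 
   so the fallback must still swallow those arguments, which the dummy
   assembler macro below does.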
*/ +.macro cfi_ignore a=0, b=0, c=0, d=0 +.endm + +#define CFI_STARTPROC cfi_ignore +#define CFI_ENDPROC cfi_ignore +#define CFI_DEF_CFA cfi_ignore +#define CFI_DEF_CFA_REGISTER cfi_ignore +#define CFI_DEF_CFA_OFFSET cfi_ignore +#define CFI_ADJUST_CFA_OFFSET cfi_ignore +#define CFI_OFFSET cfi_ignore +#define CFI_REL_OFFSET cfi_ignore +#define CFI_REGISTER cfi_ignore +#define CFI_RESTORE cfi_ignore +#define CFI_REMEMBER_STATE cfi_ignore +#define CFI_RESTORE_STATE cfi_ignore +#define CFI_UNDEFINED cfi_ignore +#define CFI_SIGNAL_FRAME cfi_ignore + +#endif + +#endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h new file mode 100644 index 000000000000..3d8ceddbd407 --- /dev/null +++ b/arch/x86/include/asm/e820.h @@ -0,0 +1,146 @@ +#ifndef _ASM_X86_E820_H +#define _ASM_X86_E820_H +#define E820MAP 0x2d0 /* our map */ +#define E820MAX 128 /* number of entries in E820MAP */ + +/* + * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the + * constrained space in the zeropage. If we have more nodes than + * that, and if we've booted off EFI firmware, then the EFI tables + * passed us from the EFI firmware can list more nodes. Size our + * internal memory map tables to have room for these additional + * nodes, based on up to three entries per node for which the + * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT), + * plus E820MAX, allowing space for the possible duplicate E820 + * entries that might need room in the same arrays, prior to the + * call to sanitize_e820_map() to remove duplicates. The allowance + * of three memory map entries per node is "enough" entries for + * the initial hardware platform motivating this mechanism to make + * use of additional EFI map entries. Future platforms may want + * to allow more than three entries per node or otherwise refine + * this size. + */ + +/* + * Odd: 'make headers_check' complains about numa.h if I try + * to collapse the next two #ifdef lines to a single line: + * #if defined(__KERNEL__) && defined(CONFIG_EFI) + */ +#ifdef __KERNEL__ +#ifdef CONFIG_EFI +#include <linux/numa.h> +#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) +#else /* ! CONFIG_EFI */ +#define E820_X_MAX E820MAX +#endif +#else /* ! 
__KERNEL__ */ +#define E820_X_MAX E820MAX +#endif + +#define E820NR 0x1e8 /* # entries in E820MAP */ + +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 +#define E820_NVS 4 +#define E820_UNUSABLE 5 + +/* reserved RAM used by kernel itself */ +#define E820_RESERVED_KERN 128 + +#ifndef __ASSEMBLY__ +struct e820entry { + __u64 addr; /* start of memory segment */ + __u64 size; /* size of memory segment */ + __u32 type; /* type of memory segment */ +} __attribute__((packed)); + +struct e820map { + __u32 nr_map; + struct e820entry map[E820_X_MAX]; +}; + +#ifdef __KERNEL__ +/* see comment in arch/x86/kernel/e820.c */ +extern struct e820map e820; +extern struct e820map e820_saved; + +extern unsigned long pci_mem_start; +extern int e820_any_mapped(u64 start, u64 end, unsigned type); +extern int e820_all_mapped(u64 start, u64 end, unsigned type); +extern void e820_add_region(u64 start, u64 size, int type); +extern void e820_print_map(char *who); +extern int +sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); +extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type); +extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, + int checktype); +extern void update_e820(void); +extern void e820_setup_gap(void); +extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, + unsigned long start_addr, unsigned long long end_addr); +struct setup_data; +extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); + +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +extern void e820_mark_nosave_regions(unsigned long limit_pfn); +#else +static inline void e820_mark_nosave_regions(unsigned long limit_pfn) +{ +} +#endif + +#ifdef CONFIG_MEMTEST +extern void early_memtest(unsigned long start, unsigned long end); +#else +static inline void early_memtest(unsigned long start, unsigned long end) +{ +} +#endif + +extern unsigned long end_user_pfn; + +extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); +extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); +extern void reserve_early(u64 start, u64 end, char *name); +extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); +extern void free_early(u64 start, u64 end); +extern void early_res_to_bootmem(u64 start, u64 end); +extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); + +extern unsigned long e820_end_of_ram_pfn(void); +extern unsigned long e820_end_of_low_ram_pfn(void); +extern int e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long last_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn); +extern void e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); +extern u64 e820_hole_size(u64 start, u64 end); +extern void finish_e820_parsing(void); +extern void e820_reserve_resources(void); +extern void e820_reserve_resources_late(void); +extern void setup_memory_map(void); +extern char *default_machine_specific_memory_setup(void); +extern char *machine_specific_memory_setup(void); +extern char *memory_setup(void); +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ + +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 +#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) + +#define BIOS_BEGIN 0x000a0000 +#define BIOS_END 0x00100000 + +#ifdef __KERNEL__ +#include <linux/ioport.h> + +#define HIGH_MEMORY (1024*1024) +#endif /* __KERNEL__ 
*/ + +#endif /* _ASM_X86_E820_H */ diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h new file mode 100644 index 000000000000..e9b57ecc70c5 --- /dev/null +++ b/arch/x86/include/asm/edac.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_EDAC_H +#define _ASM_X86_EDAC_H + +/* ECC atomic, DMA, SMP and interrupt safe scrub function */ + +static inline void atomic_scrub(void *va, u32 size) +{ + u32 i, *virt_addr = va; + + /* + * Very carefully read and write to memory atomically so we + * are interrupt, DMA and SMP safe. + */ + for (i = 0; i < size / 4; i++, virt_addr++) + asm volatile("lock; addl $0, %0"::"m" (*virt_addr)); +} + +#endif /* _ASM_X86_EDAC_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h new file mode 100644 index 000000000000..a2e545c91c35 --- /dev/null +++ b/arch/x86/include/asm/efi.h @@ -0,0 +1,110 @@ +#ifndef _ASM_X86_EFI_H +#define _ASM_X86_EFI_H + +#ifdef CONFIG_X86_32 + +extern unsigned long asmlinkage efi_call_phys(void *, ...); + +#define efi_call_phys0(f) efi_call_phys(f) +#define efi_call_phys1(f, a1) efi_call_phys(f, a1) +#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2) +#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3) +#define efi_call_phys4(f, a1, a2, a3, a4) \ + efi_call_phys(f, a1, a2, a3, a4) +#define efi_call_phys5(f, a1, a2, a3, a4, a5) \ + efi_call_phys(f, a1, a2, a3, a4, a5) +#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \ + efi_call_phys(f, a1, a2, a3, a4, a5, a6) +/* + * Wrap all the virtual calls in a way that forces the parameters on the stack. + */ + +#define efi_call_virt(f, args...) \ + ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) + +#define efi_call_virt0(f) efi_call_virt(f) +#define efi_call_virt1(f, a1) efi_call_virt(f, a1) +#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2) +#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3) +#define efi_call_virt4(f, a1, a2, a3, a4) \ + efi_call_virt(f, a1, a2, a3, a4) +#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ + efi_call_virt(f, a1, a2, a3, a4, a5) +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ + efi_call_virt(f, a1, a2, a3, a4, a5, a6) + +#define efi_ioremap(addr, size) ioremap_cache(addr, size) + +#else /* !CONFIG_X86_32 */ + +#define MAX_EFI_IO_PAGES 100 + +extern u64 efi_call0(void *fp); +extern u64 efi_call1(void *fp, u64 arg1); +extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); +extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3); +extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4); +extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3, + u64 arg4, u64 arg5); +extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, + u64 arg4, u64 arg5, u64 arg6); + +#define efi_call_phys0(f) \ + efi_call0((void *)(f)) +#define efi_call_phys1(f, a1) \ + efi_call1((void *)(f), (u64)(a1)) +#define efi_call_phys2(f, a1, a2) \ + efi_call2((void *)(f), (u64)(a1), (u64)(a2)) +#define efi_call_phys3(f, a1, a2, a3) \ + efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3)) +#define efi_call_phys4(f, a1, a2, a3, a4) \ + efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \ + (u64)(a4)) +#define efi_call_phys5(f, a1, a2, a3, a4, a5) \ + efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \ + (u64)(a4), (u64)(a5)) +#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \ + efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \ + (u64)(a4), (u64)(a5), (u64)(a6)) + +#define efi_call_virt0(f) \ + efi_call0((void *)(efi.systab->runtime->f)) +#define 
efi_call_virt1(f, a1) \ + efi_call1((void *)(efi.systab->runtime->f), (u64)(a1)) +#define efi_call_virt2(f, a1, a2) \ + efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2)) +#define efi_call_virt3(f, a1, a2, a3) \ + efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + (u64)(a3)) +#define efi_call_virt4(f, a1, a2, a3, a4) \ + efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + (u64)(a3), (u64)(a4)) +#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ + efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + (u64)(a3), (u64)(a4), (u64)(a5)) +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ + efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) + +extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); + +#endif /* CONFIG_X86_32 */ + +extern void efi_reserve_early(void); +extern void efi_call_phys_prelog(void); +extern void efi_call_phys_epilog(void); + +#ifndef CONFIG_EFI +/* + * IF EFI is not configured, have the EFI calls return -ENOSYS. + */ +#define efi_call0(_f) (-ENOSYS) +#define efi_call1(_f, _a1) (-ENOSYS) +#define efi_call2(_f, _a1, _a2) (-ENOSYS) +#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS) +#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) +#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) +#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) +#endif /* CONFIG_EFI */ + +#endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h new file mode 100644 index 000000000000..40ca1bea7916 --- /dev/null +++ b/arch/x86/include/asm/elf.h @@ -0,0 +1,336 @@ +#ifndef _ASM_X86_ELF_H +#define _ASM_X86_ELF_H + +/* + * ELF register definitions.. + */ + +#include <asm/ptrace.h> +#include <asm/user.h> +#include <asm/auxvec.h> + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t)) +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; + +typedef struct user_i387_struct elf_fpregset_t; + +#ifdef __i386__ + +typedef struct user_fxsr_struct elf_fpxregset_t; + +#define R_386_NONE 0 +#define R_386_32 1 +#define R_386_PC32 2 +#define R_386_GOT32 3 +#define R_386_PLT32 4 +#define R_386_COPY 5 +#define R_386_GLOB_DAT 6 +#define R_386_JMP_SLOT 7 +#define R_386_RELATIVE 8 +#define R_386_GOTOFF 9 +#define R_386_GOTPC 10 +#define R_386_NUM 11 + +/* + * These are used to set parameters in the core dumps. 
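+ * (These feed the core file's ELF header: EI_CLASS, EI_DATA and
+ * e_machine, respectively.)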
+ */ +#define ELF_CLASS ELFCLASS32 +#define ELF_DATA ELFDATA2LSB +#define ELF_ARCH EM_386 + +#else + +/* x86-64 relocation types */ +#define R_X86_64_NONE 0 /* No reloc */ +#define R_X86_64_64 1 /* Direct 64 bit */ +#define R_X86_64_PC32 2 /* PC relative 32 bit signed */ +#define R_X86_64_GOT32 3 /* 32 bit GOT entry */ +#define R_X86_64_PLT32 4 /* 32 bit PLT address */ +#define R_X86_64_COPY 5 /* Copy symbol at runtime */ +#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ +#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ +#define R_X86_64_RELATIVE 8 /* Adjust by program base */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative + offset to GOT */ +#define R_X86_64_32 10 /* Direct 32 bit zero extended */ +#define R_X86_64_32S 11 /* Direct 32 bit sign extended */ +#define R_X86_64_16 12 /* Direct 16 bit zero extended */ +#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ +#define R_X86_64_8 14 /* Direct 8 bit sign extended */ +#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ + +#define R_X86_64_NUM 16 + +/* + * These are used to set parameters in the core dumps. + */ +#define ELF_CLASS ELFCLASS64 +#define ELF_DATA ELFDATA2LSB +#define ELF_ARCH EM_X86_64 + +#endif + +#include <asm/vdso.h> + +extern unsigned int vdso_enabled; + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch_ia32(x) \ + (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) + +#include <asm/processor.h> +#include <asm/system.h> + +#ifdef CONFIG_X86_32 +#include <asm/desc.h> + +#define elf_check_arch(x) elf_check_arch_ia32(x) + +/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx + contains a pointer to a function which might be registered using `atexit'. + This provides a means for the dynamic linker to call DT_FINI functions for + shared libraries that have been loaded before the code runs. + + A value of 0 tells us we have no such handler. + + We might as well make sure everything else is cleared too (except for %esp), + just to make things more deterministic. + */ +#define ELF_PLAT_INIT(_r, load_addr) \ + do { \ + _r->bx = 0; _r->cx = 0; _r->dx = 0; \ + _r->si = 0; _r->di = 0; _r->bp = 0; \ + _r->ax = 0; \ +} while (0) + +/* + * regs is struct pt_regs, pr_reg is elf_gregset_t (which is + * now struct_user_regs, they are different) + */ + +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ +do { \ + pr_reg[0] = regs->bx; \ + pr_reg[1] = regs->cx; \ + pr_reg[2] = regs->dx; \ + pr_reg[3] = regs->si; \ + pr_reg[4] = regs->di; \ + pr_reg[5] = regs->bp; \ + pr_reg[6] = regs->ax; \ + pr_reg[7] = regs->ds & 0xffff; \ + pr_reg[8] = regs->es & 0xffff; \ + pr_reg[9] = regs->fs & 0xffff; \ + savesegment(gs, pr_reg[10]); \ + pr_reg[11] = regs->orig_ax; \ + pr_reg[12] = regs->ip; \ + pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[14] = regs->flags; \ + pr_reg[15] = regs->sp; \ + pr_reg[16] = regs->ss & 0xffff; \ +} while (0); + +#define ELF_PLATFORM (utsname()->machine) +#define set_personality_64bit() do { } while (0) + +#else /* CONFIG_X86_32 */ + +/* + * This is used to ensure we don't load something for the wrong architecture.
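+ * (Only EM_X86_64 objects load natively here; 32-bit compat binaries are
+ * vetted separately by elf_check_arch_ia32() above, which accepts EM_386
+ * and EM_486.)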
+ */ +#define elf_check_arch(x) \ + ((x)->e_machine == EM_X86_64) + +#define compat_elf_check_arch(x) elf_check_arch_ia32(x) + +static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp) +{ + loadsegment(fs, 0); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); + load_gs_index(0); + regs->ip = ip; + regs->sp = sp; + regs->flags = X86_EFLAGS_IF; + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; +} + +static inline void elf_common_init(struct thread_struct *t, + struct pt_regs *regs, const u16 ds) +{ + regs->ax = regs->bx = regs->cx = regs->dx = 0; + regs->si = regs->di = regs->bp = 0; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; + t->fs = t->gs = 0; + t->fsindex = t->gsindex = 0; + t->ds = t->es = ds; +} + +#define ELF_PLAT_INIT(_r, load_addr) \ +do { \ + elf_common_init(¤t->thread, _r, 0); \ + clear_thread_flag(TIF_IA32); \ +} while (0) + +#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ + elf_common_init(¤t->thread, regs, __USER_DS) + +#define compat_start_thread(regs, ip, sp) \ +do { \ + start_ia32_thread(regs, ip, sp); \ + set_fs(USER_DS); \ +} while (0) + +#define COMPAT_SET_PERSONALITY(ex) \ +do { \ + if (test_thread_flag(TIF_IA32)) \ + clear_thread_flag(TIF_ABI_PENDING); \ + else \ + set_thread_flag(TIF_ABI_PENDING); \ + current->personality |= force_personality32; \ +} while (0) + +#define COMPAT_ELF_PLATFORM ("i686") + +/* + * regs is struct pt_regs, pr_reg is elf_gregset_t (which is + * now struct_user_regs, they are different). Assumes current is the process + * getting dumped. + */ + +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ +do { \ + unsigned v; \ + (pr_reg)[0] = (regs)->r15; \ + (pr_reg)[1] = (regs)->r14; \ + (pr_reg)[2] = (regs)->r13; \ + (pr_reg)[3] = (regs)->r12; \ + (pr_reg)[4] = (regs)->bp; \ + (pr_reg)[5] = (regs)->bx; \ + (pr_reg)[6] = (regs)->r11; \ + (pr_reg)[7] = (regs)->r10; \ + (pr_reg)[8] = (regs)->r9; \ + (pr_reg)[9] = (regs)->r8; \ + (pr_reg)[10] = (regs)->ax; \ + (pr_reg)[11] = (regs)->cx; \ + (pr_reg)[12] = (regs)->dx; \ + (pr_reg)[13] = (regs)->si; \ + (pr_reg)[14] = (regs)->di; \ + (pr_reg)[15] = (regs)->orig_ax; \ + (pr_reg)[16] = (regs)->ip; \ + (pr_reg)[17] = (regs)->cs; \ + (pr_reg)[18] = (regs)->flags; \ + (pr_reg)[19] = (regs)->sp; \ + (pr_reg)[20] = (regs)->ss; \ + (pr_reg)[21] = current->thread.fs; \ + (pr_reg)[22] = current->thread.gs; \ + asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ + asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ + asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ + asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \ +} while (0); + +/* I'm not sure if we can use '-' here */ +#define ELF_PLATFORM ("x86_64") +extern void set_personality_64bit(void); +extern unsigned int sysctl_vsyscall32; +extern int force_personality32; + +#endif /* !CONFIG_X86_32 */ + +#define CORE_DUMP_USE_REGSET +#define USE_ELF_CORE_DUMP +#define ELF_EXEC_PAGESIZE 4096 + +/* This is the location that an ET_DYN program is loaded if exec'ed. Typical + use of this is to invoke "./ld.so someprog" to test out a new version of + the loader. We need to make sure that it is out of the way of the program + that it will "exec", and that there is sufficient room for the brk. */ + +#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) + +/* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, + but it's not easy, and we've already done it here. 
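+ *
+ * The value is exported to user space as AT_HWCAP in the ELF auxiliary
+ * vector; x86_capability[0] mirrors CPUID leaf 1 EDX, so for instance
+ * bit 0 is FPU and bit 25 is SSE.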
*/ + +#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) + +/* This yields a string that ld.so will use to load implementation + specific libraries for optimization. This is more specific in + intent than poking at uname or /proc/cpuinfo. + + For the moment, we have only optimizations for the Intel generations, + but that could change... */ + +#define SET_PERSONALITY(ex) set_personality_64bit() + +/* + * An executable for which elf_read_implies_exec() returns TRUE will + * have the READ_IMPLIES_EXEC personality flag set automatically. + */ +#define elf_read_implies_exec(ex, executable_stack) \ + (executable_stack != EXSTACK_DISABLE_X) + +struct task_struct; + +#define ARCH_DLINFO_IA32(vdso_enabled) \ +do { \ + if (vdso_enabled) { \ + NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ + } \ +} while (0) + +#ifdef CONFIG_X86_32 + +#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) + +#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) + +/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ + +#else /* CONFIG_X86_32 */ + +#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */ + +/* 1GB for 64bit, 8MB for 32bit */ +#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff) + +#define ARCH_DLINFO \ +do { \ + if (vdso_enabled) \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso); \ +} while (0) + +#define AT_SYSINFO 32 + +#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32) + +#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) + +#endif /* !CONFIG_X86_32 */ + +#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) + +#define VDSO_ENTRY \ + ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall)) + +struct linux_binprm; + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +extern int arch_setup_additional_pages(struct linux_binprm *bprm, + int executable_stack); + +extern int syscall32_setup_pages(struct linux_binprm *, int exstack); +#define compat_arch_setup_additional_pages syscall32_setup_pages + +extern unsigned long arch_randomize_brk(struct mm_struct *mm); +#define arch_randomize_brk arch_randomize_brk + +#endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h new file mode 100644 index 000000000000..94826cf87455 --- /dev/null +++ b/arch/x86/include/asm/emergency-restart.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_EMERGENCY_RESTART_H +#define _ASM_X86_EMERGENCY_RESTART_H + +enum reboot_type { + BOOT_TRIPLE = 't', + BOOT_KBD = 'k', +#ifdef CONFIG_X86_32 + BOOT_BIOS = 'b', +#endif + BOOT_ACPI = 'a', + BOOT_EFI = 'e' +}; + +extern enum reboot_type reboot_type; + +extern void machine_emergency_restart(void); + +#endif /* _ASM_X86_EMERGENCY_RESTART_H */ diff --git a/arch/x86/include/asm/errno.h b/arch/x86/include/asm/errno.h new file mode 100644 index 000000000000..4c82b503d92f --- /dev/null +++ b/arch/x86/include/asm/errno.h @@ -0,0 +1 @@ +#include <asm-generic/errno.h> diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h new file mode 100644 index 000000000000..380f0b4f17ed --- /dev/null +++ b/arch/x86/include/asm/es7000/apic.h @@ -0,0 +1,193 @@ +#ifndef __ASM_ES7000_APIC_H +#define __ASM_ES7000_APIC_H + +#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu) +#define esr_disable (1) + +static inline int apic_id_registered(void) +{ + return (1); +} + +static inline cpumask_t target_cpus(void) +{ +#if defined CONFIG_ES7000_CLUSTERED_APIC + 
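+	/* clustered mode: broadcast to all cpus; the #else branch below targets only the calling cpu */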
return CPU_MASK_ALL; +#else + return cpumask_of_cpu(smp_processor_id()); +#endif +} + +#if defined CONFIG_ES7000_CLUSTERED_APIC +#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) +#define INT_DELIVERY_MODE (dest_LowestPrio) +#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ +#define NO_BALANCE_IRQ (1) +#undef WAKE_SECONDARY_VIA_INIT +#define WAKE_SECONDARY_VIA_MIP +#else +#define APIC_DFR_VALUE (APIC_DFR_FLAT) +#define INT_DELIVERY_MODE (dest_Fixed) +#define INT_DEST_MODE (0) /* phys delivery to target procs */ +#define NO_BALANCE_IRQ (0) +#undef APIC_DEST_LOGICAL +#define APIC_DEST_LOGICAL 0x0 +#define WAKE_SECONDARY_VIA_INIT +#endif + +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return 0; +} +static inline unsigned long check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} + +#define apicid_cluster(apicid) (apicid & 0xF0) + +static inline unsigned long calculate_ldr(int cpu) +{ + unsigned long id; + id = xapic_phys_to_log_apicid(cpu); + return (SET_APIC_LOGICAL_ID(id)); +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LdR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static inline void init_apic_ldr(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + +#ifndef CONFIG_X86_GENERICARCH +extern void enable_apic_mode(void); +#endif + +extern int apic_version [MAX_APICS]; +static inline void setup_apic_routing(void) +{ + int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); + printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + (apic_version[apic] == 0x14) ? 
+ "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); +} + +static inline int multi_timer_check(int apic, int irq) +{ + return 0; +} + +static inline int apicid_to_node(int logical_apicid) +{ + return 0; +} + + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (!mps_cpu) + return boot_cpu_physical_apicid; + else if (mps_cpu < NR_CPUS) + return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; +} + +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +{ + static int id = 0; + physid_mask_t mask; + mask = physid_mask_of_physid(id); + ++id; + return mask; +} + +extern u8 cpu_2_logical_apicid[]; +/* Mapping from cpu number to logical apicid */ +static inline int cpu_to_logical_apicid(int cpu) +{ +#ifdef CONFIG_SMP + if (cpu >= NR_CPUS) + return BAD_APICID; + return (int)cpu_2_logical_apicid[cpu]; +#else + return logical_smp_processor_id(); +#endif +} + +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +{ + /* For clustered we don't have a good way to do this yet - hack */ + return physids_promote(0xff); +} + + +static inline void setup_portio_remap(void) +{ +} + +extern unsigned int boot_cpu_physical_apicid; +static inline int check_phys_apicid_present(int cpu_physical_apicid) +{ + boot_cpu_physical_apicid = read_apic_id(); + return (1); +} + +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid; + + num_bits_set = cpus_weight(cpumask); + /* Return id to all */ + if (num_bits_set == NR_CPUS) +#if defined CONFIG_ES7000_CLUSTERED_APIC + return 0xFF; +#else + return cpu_to_logical_apicid(0); +#endif + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. 
+ */ + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); +#if defined CONFIG_ES7000_CLUSTERED_APIC + return 0xFF; +#else + return cpu_to_logical_apicid(0); +#endif + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +#endif /* __ASM_ES7000_APIC_H */ diff --git a/arch/x86/include/asm/es7000/apicdef.h b/arch/x86/include/asm/es7000/apicdef.h new file mode 100644 index 000000000000..8b234a3cb851 --- /dev/null +++ b/arch/x86/include/asm/es7000/apicdef.h @@ -0,0 +1,13 @@ +#ifndef __ASM_ES7000_APICDEF_H +#define __ASM_ES7000_APICDEF_H + +#define APIC_ID_MASK (0xFF<<24) + +static inline unsigned get_apic_id(unsigned long x) +{ + return (((x)>>24)&0xFF); +} + +#define GET_APIC_ID(x) get_apic_id(x) + +#endif diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h new file mode 100644 index 000000000000..632a955fcc0a --- /dev/null +++ b/arch/x86/include/asm/es7000/ipi.h @@ -0,0 +1,24 @@ +#ifndef __ASM_ES7000_IPI_H +#define __ASM_ES7000_IPI_H + +void send_IPI_mask_sequence(cpumask_t mask, int vector); + +static inline void send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static inline void send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, vector); +} + +static inline void send_IPI_all(int vector) +{ + send_IPI_mask(cpu_online_map, vector); +} + +#endif /* __ASM_ES7000_IPI_H */ diff --git a/arch/x86/include/asm/es7000/mpparse.h b/arch/x86/include/asm/es7000/mpparse.h new file mode 100644 index 000000000000..ed5a3caae141 --- /dev/null +++ b/arch/x86/include/asm/es7000/mpparse.h @@ -0,0 +1,30 @@ +#ifndef __ASM_ES7000_MPPARSE_H +#define __ASM_ES7000_MPPARSE_H + +#include <linux/acpi.h> + +extern int parse_unisys_oem (char *oemptr); +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); +extern void setup_unisys(void); + +#ifndef CONFIG_X86_GENERICARCH +extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id); +extern int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid); +#endif + +#ifdef CONFIG_ACPI + +static inline int es7000_check_dsdt(void) +{ + struct acpi_table_header header; + + if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) && + !strncmp(header.oem_id, "UNISYS", 6)) + return 1; + return 0; +} +#endif + +#endif /* __ASM_MACH_MPPARSE_H */ diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h new file mode 100644 index 000000000000..3ffc5a7bf667 --- /dev/null +++ b/arch/x86/include/asm/es7000/wakecpu.h @@ -0,0 +1,59 @@ +#ifndef __ASM_ES7000_WAKECPU_H +#define __ASM_ES7000_WAKECPU_H + +/* + * This file copes with machines that wakeup secondary CPUs by the + * INIT, INIT, STARTUP sequence. 
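+ *
+ * (Clustered ES7000 configurations wake secondaries through the MIP
+ * interface instead; see WAKE_SECONDARY_VIA_MIP below.)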
+ */ + +#ifdef CONFIG_ES7000_CLUSTERED_APIC +#define WAKE_SECONDARY_VIA_MIP +#else +#define WAKE_SECONDARY_VIA_INIT +#endif + +#ifdef WAKE_SECONDARY_VIA_MIP +extern int es7000_start_cpu(int cpu, unsigned long eip); +static inline int +wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +{ + int boot_error = 0; + boot_error = es7000_start_cpu(phys_apicid, start_eip); + return boot_error; +} +#endif + +#define TRAMPOLINE_LOW phys_to_virt(0x467) +#define TRAMPOLINE_HIGH phys_to_virt(0x469) + +#define boot_cpu_apicid boot_cpu_physical_apicid + +static inline void wait_for_init_deassert(atomic_t *deassert) +{ +#ifdef WAKE_SECONDARY_VIA_INIT + while (!atomic_read(deassert)) + cpu_relax(); +#endif + return; +} + +/* Nothing to do for most platforms, since cleared by the INIT cycle */ +static inline void smp_callin_clear_local_apic(void) +{ +} + +static inline void store_NMI_vector(unsigned short *high, unsigned short *low) +{ +} + +static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) +{ +} + +#if APIC_DEBUG + #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid) +#else + #define inquire_remote_apic(apicid) {} +#endif + +#endif /* __ASM_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h new file mode 100644 index 000000000000..53018464aea6 --- /dev/null +++ b/arch/x86/include/asm/fb.h @@ -0,0 +1,21 @@ +#ifndef _ASM_X86_FB_H +#define _ASM_X86_FB_H + +#include <linux/fb.h> +#include <linux/fs.h> +#include <asm/page.h> + +static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, + unsigned long off) +{ + if (boot_cpu_data.x86 > 3) + pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; +} + +#ifdef CONFIG_X86_32 +extern int fb_is_primary_device(struct fb_info *info); +#else +static inline int fb_is_primary_device(struct fb_info *info) { return 0; } +#endif + +#endif /* _ASM_X86_FB_H */ diff --git a/arch/x86/include/asm/fcntl.h b/arch/x86/include/asm/fcntl.h new file mode 100644 index 000000000000..46ab12db5739 --- /dev/null +++ b/arch/x86/include/asm/fcntl.h @@ -0,0 +1 @@ +#include <asm-generic/fcntl.h> diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h new file mode 100644 index 000000000000..8668a94f850e --- /dev/null +++ b/arch/x86/include/asm/fixmap.h @@ -0,0 +1,68 @@ +#ifndef _ASM_X86_FIXMAP_H +#define _ASM_X86_FIXMAP_H + +#ifdef CONFIG_X86_32 +# include "fixmap_32.h" +#else +# include "fixmap_64.h" +#endif + +extern int fixmaps_set; + +void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); +void native_set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags); + +#ifndef CONFIG_PARAVIRT +static inline void __set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags) +{ + native_set_fixmap(idx, phys, flags); +} +#endif + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) + +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-dereference + * kernel oops. Illegal ranges of incoming indices are caught too.
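+ *
+ * For instance (sketch; FIX_APIC_BASE is one of the enum indices):
+ *
+ *	unsigned long apic_base = fix_to_virt(FIX_APIC_BASE);
+ *
+ * folds to a constant at compile time, while an out-of-range index
+ * turns into the deliberately unresolvable call above and fails to
+ * link.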
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+	/*
+	 * this branch gets completely eliminated after inlining,
+	 * except when someone tries to use fixaddr indices in an
+	 * illegal way (such as mixing up address types or using
+	 * out-of-range indices).
+	 *
+	 * If it doesn't get removed, the linker will complain
+	 * loudly with a reasonably clear error message.
+	 */
+	if (idx >= __end_of_fixed_addresses)
+		__this_fixmap_does_not_exist();
+
+	return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+	BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+	return __virt_to_fix(vaddr);
+}
+#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
new file mode 100644
index 000000000000..09f29ab5c139
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_32.h
@@ -0,0 +1,123 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#ifndef _ASM_X86_FIXMAP_32_H
+#define _ASM_X86_FIXMAP_32_H
+
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_USER_START	__fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END	__fix_to_virt(FIX_VDSO - 1)
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#endif
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process. We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * This also lets us do fail-safe vmalloc(): we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx, phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
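+ *
+ * Editorial note (not part of the original patch): since the area
+ * grows downwards, index 0 (FIX_HOLE) sits at FIXADDR_TOP and each
+ * further index lies one page lower, exactly as __fix_to_virt()
+ * computes:
+ *
+ *	__fix_to_virt(idx) = FIXADDR_TOP - (idx << PAGE_SHIFT)
+ *
+ * e.g. with FIXADDR_TOP == 0xfffff000 and 4k pages, index 2 maps
+ * at 0xffffd000.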
+ */ +enum fixed_addresses { + FIX_HOLE, + FIX_VDSO, + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#endif +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif + __end_of_permanent_fixed_addresses, + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + * + * We round it up to the next 256 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_SLOTS 4 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - + (__end_of_permanent_fixed_addresses & 255), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, + FIX_WP_TEST, +#ifdef CONFIG_ACPI + FIX_ACPI_BEGIN, + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +#endif +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif + __end_of_fixed_addresses +}; + +extern void reserve_top_address(unsigned long reserve); + + +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_FIXMAP_32_H */ diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h new file mode 100644 index 000000000000..00a30ab9b1a5 --- /dev/null +++ b/arch/x86/include/asm/fixmap_64.h @@ -0,0 +1,83 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + */ + +#ifndef _ASM_X86_FIXMAP_64_H +#define _ASM_X86_FIXMAP_64_H + +#include <linux/kernel.h> +#include <asm/acpi.h> +#include <asm/apicdef.h> +#include <asm/page.h> +#include <asm/vsyscall.h> +#include <asm/efi.h> + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * + * These 'compile-time allocated' memory buffers are + * fixed-size 4k pages (or larger if used with an increment + * higher than 1). Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. 
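+ *
+ * Editorial note (not part of the original patch): on 64-bit the
+ * fixmap hangs off the vsyscall region: FIXADDR_TOP is defined below
+ * as (VSYSCALL_END - PAGE_SIZE), so the indices grow downwards from
+ * just under the vsyscall mappings.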
+ */ + +enum fixed_addresses { + VSYSCALL_LAST_PAGE, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VSYSCALL_HPET, + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, + FIX_EFI_IO_MAP_LAST_PAGE, + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE + + MAX_EFI_IO_PAGES - 1, +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif + __end_of_permanent_fixed_addresses, +#ifdef CONFIG_ACPI + FIX_ACPI_BEGIN, + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +#endif +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + * + * We round it up to the next 256 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_SLOTS 4 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - + (__end_of_permanent_fixed_addresses & 255), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, + __end_of_fixed_addresses +}; + +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) + +#endif /* _ASM_X86_FIXMAP_64_H */ diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h new file mode 100644 index 000000000000..dbe82a5c5eac --- /dev/null +++ b/arch/x86/include/asm/floppy.h @@ -0,0 +1,281 @@ +/* + * Architecture specific parts of the Floppy driver + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1995 + */ +#ifndef _ASM_X86_FLOPPY_H +#define _ASM_X86_FLOPPY_H + +#include <linux/vmalloc.h> + +/* + * The DMA channel used by the floppy controller cannot access data at + * addresses >= 16MB + * + * Went back to the 1MB limit, as some people had problems with the floppy + * driver otherwise. It doesn't matter much for performance anyway, as most + * floppy accesses go through the track buffer. 
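+ *
+ * Editorial illustration (not part of the original patch): the
+ * CROSS_64KB() check below compares the 64 KB "page" of the first
+ * and last byte of a transfer. A 512-byte transfer at physical
+ * address 0xff00 spans 0xff00..0x100ff; the quotients 0xff00/K_64 = 0
+ * and 0x100ff/K_64 = 1 differ, so the buffer straddles a 64 KB
+ * boundary and must not be handed to the ISA DMA controller in one go.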
+ */ +#define _CROSS_64KB(a, s, vdma) \ + (!(vdma) && \ + ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) + +#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1) + + +#define SW fd_routine[use_virtual_dma & 1] +#define CSW fd_routine[can_use_virtual_dma & 1] + + +#define fd_inb(port) inb_p(port) +#define fd_outb(value, port) outb_p(value, port) + +#define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy") +#define fd_free_dma() CSW._free_dma(FLOPPY_DMA) +#define fd_enable_irq() enable_irq(FLOPPY_IRQ) +#define fd_disable_irq() disable_irq(FLOPPY_IRQ) +#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL) +#define fd_get_dma_residue() SW._get_dma_residue(FLOPPY_DMA) +#define fd_dma_mem_alloc(size) SW._dma_mem_alloc(size) +#define fd_dma_setup(addr, size, mode, io) SW._dma_setup(addr, size, mode, io) + +#define FLOPPY_CAN_FALLBACK_ON_NODMA + +static int virtual_dma_count; +static int virtual_dma_residue; +static char *virtual_dma_addr; +static int virtual_dma_mode; +static int doing_pdma; + +static irqreturn_t floppy_hardint(int irq, void *dev_id) +{ + unsigned char st; + +#undef TRACE_FLPY_INT + +#ifdef TRACE_FLPY_INT + static int calls; + static int bytes; + static int dma_wait; +#endif + if (!doing_pdma) + return floppy_interrupt(irq, dev_id); + +#ifdef TRACE_FLPY_INT + if (!calls) + bytes = virtual_dma_count; +#endif + + { + int lcount; + char *lptr; + + st = 1; + for (lcount = virtual_dma_count, lptr = virtual_dma_addr; + lcount; lcount--, lptr++) { + st = inb(virtual_dma_port + 4) & 0xa0; + if (st != 0xa0) + break; + if (virtual_dma_mode) + outb_p(*lptr, virtual_dma_port + 5); + else + *lptr = inb_p(virtual_dma_port + 5); + } + virtual_dma_count = lcount; + virtual_dma_addr = lptr; + st = inb(virtual_dma_port + 4); + } + +#ifdef TRACE_FLPY_INT + calls++; +#endif + if (st == 0x20) + return IRQ_HANDLED; + if (!(st & 0x20)) { + virtual_dma_residue += virtual_dma_count; + virtual_dma_count = 0; +#ifdef TRACE_FLPY_INT + printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", + virtual_dma_count, virtual_dma_residue, calls, bytes, + dma_wait); + calls = 0; + dma_wait = 0; +#endif + doing_pdma = 0; + floppy_interrupt(irq, dev_id); + return IRQ_HANDLED; + } +#ifdef TRACE_FLPY_INT + if (!virtual_dma_count) + dma_wait++; +#endif + return IRQ_HANDLED; +} + +static void fd_disable_dma(void) +{ + if (!(can_use_virtual_dma & 1)) + disable_dma(FLOPPY_DMA); + doing_pdma = 0; + virtual_dma_residue += virtual_dma_count; + virtual_dma_count = 0; +} + +static int vdma_request_dma(unsigned int dmanr, const char *device_id) +{ + return 0; +} + +static void vdma_nop(unsigned int dummy) +{ +} + + +static int vdma_get_dma_residue(unsigned int dummy) +{ + return virtual_dma_count + virtual_dma_residue; +} + + +static int fd_request_irq(void) +{ + if (can_use_virtual_dma) + return request_irq(FLOPPY_IRQ, floppy_hardint, + IRQF_DISABLED, "floppy", NULL); + else + return request_irq(FLOPPY_IRQ, floppy_interrupt, + IRQF_DISABLED, "floppy", NULL); +} + +static unsigned long dma_mem_alloc(unsigned long size) +{ + return __get_dma_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size)); +} + + +static unsigned long vdma_mem_alloc(unsigned long size) +{ + return (unsigned long)vmalloc(size); + +} + +#define nodma_mem_alloc(size) vdma_mem_alloc(size) + +static void _fd_dma_mem_free(unsigned long addr, unsigned long size) +{ + if ((unsigned long)addr >= (unsigned long)high_memory) + vfree((void *)addr); + else + free_pages(addr, get_order(size)); +} + +#define fd_dma_mem_free(addr, size) 
_fd_dma_mem_free(addr, size) + +static void _fd_chose_dma_mode(char *addr, unsigned long size) +{ + if (can_use_virtual_dma == 2) { + if ((unsigned long)addr >= (unsigned long)high_memory || + isa_virt_to_bus(addr) >= 0x1000000 || + _CROSS_64KB(addr, size, 0)) + use_virtual_dma = 1; + else + use_virtual_dma = 0; + } else { + use_virtual_dma = can_use_virtual_dma & 1; + } +} + +#define fd_chose_dma_mode(addr, size) _fd_chose_dma_mode(addr, size) + + +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) +{ + doing_pdma = 1; + virtual_dma_port = io; + virtual_dma_mode = (mode == DMA_MODE_WRITE); + virtual_dma_addr = addr; + virtual_dma_count = size; + virtual_dma_residue = 0; + return 0; +} + +static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) +{ +#ifdef FLOPPY_SANITY_CHECK + if (CROSS_64KB(addr, size)) { + printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size); + return -1; + } +#endif + /* actual, physical DMA */ + doing_pdma = 0; + clear_dma_ff(FLOPPY_DMA); + set_dma_mode(FLOPPY_DMA, mode); + set_dma_addr(FLOPPY_DMA, isa_virt_to_bus(addr)); + set_dma_count(FLOPPY_DMA, size); + enable_dma(FLOPPY_DMA); + return 0; +} + +static struct fd_routine_l { + int (*_request_dma)(unsigned int dmanr, const char *device_id); + void (*_free_dma)(unsigned int dmanr); + int (*_get_dma_residue)(unsigned int dummy); + unsigned long (*_dma_mem_alloc)(unsigned long size); + int (*_dma_setup)(char *addr, unsigned long size, int mode, int io); +} fd_routine[] = { + { + request_dma, + free_dma, + get_dma_residue, + dma_mem_alloc, + hard_dma_setup + }, + { + vdma_request_dma, + vdma_nop, + vdma_get_dma_residue, + vdma_mem_alloc, + vdma_dma_setup + } +}; + + +static int FDC1 = 0x3f0; +static int FDC2 = -1; + +/* + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock + * is needed to prevent corrupted CMOS RAM in case "insmod floppy" + * coincides with another rtc CMOS user. Paul G. + */ +#define FLOPPY0_TYPE \ +({ \ + unsigned long flags; \ + unsigned char val; \ + spin_lock_irqsave(&rtc_lock, flags); \ + val = (CMOS_READ(0x10) >> 4) & 15; \ + spin_unlock_irqrestore(&rtc_lock, flags); \ + val; \ +}) + +#define FLOPPY1_TYPE \ +({ \ + unsigned long flags; \ + unsigned char val; \ + spin_lock_irqsave(&rtc_lock, flags); \ + val = CMOS_READ(0x10) & 15; \ + spin_unlock_irqrestore(&rtc_lock, flags); \ + val; \ +}) + +#define N_FDC 2 +#define N_DRIVE 8 + +#define EXTRA_FLOPPY_PARAMS + +#endif /* _ASM_X86_FLOPPY_H */ diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h new file mode 100644 index 000000000000..06850a7194e1 --- /dev/null +++ b/arch/x86/include/asm/frame.h @@ -0,0 +1,27 @@ +#ifdef __ASSEMBLY__ + +#include <asm/dwarf2.h> + +/* The annotation hides the frame from the unwinder and makes it look + like a ordinary ebp save/restore. 
This avoids some special cases for + frame pointer later */ +#ifdef CONFIG_FRAME_POINTER + .macro FRAME + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp,0 + movl %esp,%ebp + .endm + .macro ENDFRAME + popl %ebp + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebp + .endm +#else + .macro FRAME + .endm + .macro ENDFRAME + .endm +#endif + +#endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h new file mode 100644 index 000000000000..47f7e65e6c1d --- /dev/null +++ b/arch/x86/include/asm/ftrace.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_FTRACE_H +#define _ASM_X86_FTRACE_H + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* + * call mcount is "e8 <4 byte offset>" + * The addr points to the 4 byte offset and the caller of this + * function wants the pointer to e8. Simply subtract one. + */ + return addr - 1; +} +#endif + +#endif /* CONFIG_FTRACE */ + +#endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h new file mode 100644 index 000000000000..1f11ce44e956 --- /dev/null +++ b/arch/x86/include/asm/futex.h @@ -0,0 +1,140 @@ +#ifndef _ASM_X86_FUTEX_H +#define _ASM_X86_FUTEX_H + +#ifdef __KERNEL__ + +#include <linux/futex.h> +#include <linux/uaccess.h> + +#include <asm/asm.h> +#include <asm/errno.h> +#include <asm/processor.h> +#include <asm/system.h> + +#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ + asm volatile("1:\t" insn "\n" \ + "2:\t.section .fixup,\"ax\"\n" \ + "3:\tmov\t%3, %1\n" \ + "\tjmp\t2b\n" \ + "\t.previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ + : "i" (-EFAULT), "0" (oparg), "1" (0)) + +#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \ + asm volatile("1:\tmovl %2, %0\n" \ + "\tmovl\t%0, %3\n" \ + "\t" insn "\n" \ + "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \ + "\tjnz\t1b\n" \ + "3:\t.section .fixup,\"ax\"\n" \ + "4:\tmov\t%5, %1\n" \ + "\tjmp\t3b\n" \ + "\t.previous\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ + : "=&a" (oldval), "=&r" (ret), \ + "+m" (*uaddr), "=&r" (tem) \ + : "r" (oparg), "i" (-EFAULT), "1" (0)) + +static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +{ + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; + int oparg = (encoded_op << 8) >> 20; + int cmparg = (encoded_op << 20) >> 20; + int oldval = 0, ret, tem; + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) + oparg = 1 << oparg; + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) + /* Real i386 machines can only support FUTEX_OP_SET */ + if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3) + return -ENOSYS; +#endif + + pagefault_disable(); + + switch (op) { + case FUTEX_OP_SET: + __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_ADD: + __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval, + uaddr, oparg); + break; + case FUTEX_OP_OR: + __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_ANDN: + __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg); + break; + case FUTEX_OP_XOR: + __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg); + break; + default: + ret = -ENOSYS; + } + + pagefault_enable(); + + if (!ret) { + switch (cmp) { + case 
FUTEX_OP_CMP_EQ: + ret = (oldval == cmparg); + break; + case FUTEX_OP_CMP_NE: + ret = (oldval != cmparg); + break; + case FUTEX_OP_CMP_LT: + ret = (oldval < cmparg); + break; + case FUTEX_OP_CMP_GE: + ret = (oldval >= cmparg); + break; + case FUTEX_OP_CMP_LE: + ret = (oldval <= cmparg); + break; + case FUTEX_OP_CMP_GT: + ret = (oldval > cmparg); + break; + default: + ret = -ENOSYS; + } + } + return ret; +} + +static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, + int newval) +{ + +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) + /* Real i386 machines have no cmpxchg instruction */ + if (boot_cpu_data.x86 == 3) + return -ENOSYS; +#endif + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + + asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" + "2:\t.section .fixup, \"ax\"\n" + "3:\tmov %2, %0\n" + "\tjmp 2b\n" + "\t.previous\n" + _ASM_EXTABLE(1b, 3b) + : "=a" (oldval), "+m" (*uaddr) + : "i" (-EFAULT), "r" (newval), "0" (oldval) + : "memory" + ); + + return oldval; +} + +#endif +#endif /* _ASM_X86_FUTEX_H */ diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h new file mode 100644 index 000000000000..74252264433d --- /dev/null +++ b/arch/x86/include/asm/gart.h @@ -0,0 +1,73 @@ +#ifndef _ASM_X86_GART_H +#define _ASM_X86_GART_H + +#include <asm/e820.h> + +extern void set_up_gart_resume(u32, u32); + +extern int fallback_aper_order; +extern int fallback_aper_force; +extern int fix_aperture; + +/* PTE bits. */ +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 + +/* Aperture control register bits. */ +#define GARTEN (1<<0) +#define DISGARTCPU (1<<4) +#define DISGARTIO (1<<5) + +/* GART cache control register bits. */ +#define INVGART (1<<0) +#define GARTPTEERR (1<<1) + +/* K8 On-cpu GART registers */ +#define AMD64_GARTAPERTURECTL 0x90 +#define AMD64_GARTAPERTUREBASE 0x94 +#define AMD64_GARTTABLEBASE 0x98 +#define AMD64_GARTCACHECTL 0x9c +#define AMD64_GARTEN (1<<0) + +extern int agp_amd64_init(void); + +static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) +{ + u32 tmp, ctl; + + /* address of the mappings table */ + addr >>= 12; + tmp = (u32) addr<<4; + tmp &= ~0xf; + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); + + /* Enable GART translation for this hammer. */ + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + ctl |= GARTEN; + ctl &= ~(DISGARTCPU | DISGARTIO); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + +static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) +{ + if (!aper_base) + return 0; + + if (aper_base + aper_size > 0x100000000ULL) { + printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n"); + return 0; + } + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk(KERN_INFO "Aperture pointing to e820 RAM. 
Ignoring.\n");
+		return 0;
+	}
+	if (aper_size < min_size) {
+		printk(KERN_INFO "Aperture too small (%d MB); need at least %d MB\n",
+				 aper_size>>20, min_size>>20);
+		return 0;
+	}
+
+	return 1;
+}
+
+#endif /* _ASM_X86_GART_H */
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
new file mode 100644
index 000000000000..d48bee663a6f
--- /dev/null
+++ b/arch/x86/include/asm/genapic.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "genapic_32.h"
+#else
+# include "genapic_64.h"
+#endif
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
new file mode 100644
index 000000000000..5cbd4fcc06fd
--- /dev/null
+++ b/arch/x86/include/asm/genapic_32.h
@@ -0,0 +1,126 @@
+#ifndef _ASM_X86_GENAPIC_32_H
+#define _ASM_X86_GENAPIC_32_H
+
+#include <asm/mpspec.h>
+
+/*
+ * Generic APIC driver interface.
+ *
+ * A straightforward mapping of the APIC related parts of the
+ * x86 subarchitecture interface to a dynamic object.
+ *
+ * This is used by the "generic" x86 subarchitecture.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ */
+
+struct mpc_config_bus;
+struct mp_config_table;
+struct mpc_config_processor;
+
+struct genapic {
+	char *name;
+	int (*probe)(void);
+
+	int (*apic_id_registered)(void);
+	cpumask_t (*target_cpus)(void);
+	int int_delivery_mode;
+	int int_dest_mode;
+	int ESR_DISABLE;
+	int apic_destination_logical;
+	unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
+	unsigned long (*check_apicid_present)(int apicid);
+	int no_balance_irq;
+	int no_ioapic_check;
+	void (*init_apic_ldr)(void);
+	physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
+
+	void (*setup_apic_routing)(void);
+	int (*multi_timer_check)(int apic, int irq);
+	int (*apicid_to_node)(int logical_apicid);
+	int (*cpu_to_logical_apicid)(int cpu);
+	int (*cpu_present_to_apicid)(int mps_cpu);
+	physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
+	void (*setup_portio_remap)(void);
+	int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
+	void (*enable_apic_mode)(void);
+	u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
+
+	/* mpparse */
+	/* When one of the next two hooks returns 1 the genapic
+	   is switched to this. Essentially they are additional probe
+	   functions. */
+	int (*mps_oem_check)(struct mp_config_table *mpc, char *oem,
+			     char *productid);
+	int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+
+	unsigned (*get_apic_id)(unsigned long x);
+	unsigned long apic_id_mask;
+	unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+	cpumask_t (*vector_allocation_domain)(int cpu);
+
+#ifdef CONFIG_SMP
+	/* ipi */
+	void (*send_IPI_mask)(cpumask_t mask, int vector);
+	void (*send_IPI_allbutself)(int vector);
+	void (*send_IPI_all)(int vector);
+#endif
+};
+
+#define APICFUNC(x) .x = x,
+
+/* More functions could probably be marked IPIFUNC and save some space
+   in UP GENERICARCH kernels, but I don't have the nerve right now
+   to untangle this mess.
-AK */ +#ifdef CONFIG_SMP +#define IPIFUNC(x) APICFUNC(x) +#else +#define IPIFUNC(x) +#endif + +#define APIC_INIT(aname, aprobe) \ +{ \ + .name = aname, \ + .probe = aprobe, \ + .int_delivery_mode = INT_DELIVERY_MODE, \ + .int_dest_mode = INT_DEST_MODE, \ + .no_balance_irq = NO_BALANCE_IRQ, \ + .ESR_DISABLE = esr_disable, \ + .apic_destination_logical = APIC_DEST_LOGICAL, \ + APICFUNC(apic_id_registered) \ + APICFUNC(target_cpus) \ + APICFUNC(check_apicid_used) \ + APICFUNC(check_apicid_present) \ + APICFUNC(init_apic_ldr) \ + APICFUNC(ioapic_phys_id_map) \ + APICFUNC(setup_apic_routing) \ + APICFUNC(multi_timer_check) \ + APICFUNC(apicid_to_node) \ + APICFUNC(cpu_to_logical_apicid) \ + APICFUNC(cpu_present_to_apicid) \ + APICFUNC(apicid_to_cpu_present) \ + APICFUNC(setup_portio_remap) \ + APICFUNC(check_phys_apicid_present) \ + APICFUNC(mps_oem_check) \ + APICFUNC(get_apic_id) \ + .apic_id_mask = APIC_ID_MASK, \ + APICFUNC(cpu_mask_to_apicid) \ + APICFUNC(vector_allocation_domain) \ + APICFUNC(acpi_madt_oem_check) \ + IPIFUNC(send_IPI_mask) \ + IPIFUNC(send_IPI_allbutself) \ + IPIFUNC(send_IPI_all) \ + APICFUNC(enable_apic_mode) \ + APICFUNC(phys_pkg_id) \ +} + +extern struct genapic *genapic; + +enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; +#define get_uv_system_type() UV_NONE +#define is_uv_system() 0 +#define uv_wakeup_secondary(a, b) 1 +#define uv_system_init() do {} while (0) + + +#endif /* _ASM_X86_GENAPIC_32_H */ diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h new file mode 100644 index 000000000000..13c4e96199ea --- /dev/null +++ b/arch/x86/include/asm/genapic_64.h @@ -0,0 +1,58 @@ +#ifndef _ASM_X86_GENAPIC_64_H +#define _ASM_X86_GENAPIC_64_H + +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC sub-arch data struct. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. 
+ */ + +struct genapic { + char *name; + int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); + u32 int_delivery_mode; + u32 int_dest_mode; + int (*apic_id_registered)(void); + cpumask_t (*target_cpus)(void); + cpumask_t (*vector_allocation_domain)(int cpu); + void (*init_apic_ldr)(void); + /* ipi */ + void (*send_IPI_mask)(cpumask_t mask, int vector); + void (*send_IPI_allbutself)(int vector); + void (*send_IPI_all)(int vector); + void (*send_IPI_self)(int vector); + /* */ + unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); + unsigned int (*phys_pkg_id)(int index_msb); + unsigned int (*get_apic_id)(unsigned long x); + unsigned long (*set_apic_id)(unsigned int id); + unsigned long apic_id_mask; +}; + +extern struct genapic *genapic; + +extern struct genapic apic_flat; +extern struct genapic apic_physflat; +extern struct genapic apic_x2apic_cluster; +extern struct genapic apic_x2apic_phys; +extern int acpi_madt_oem_check(char *, char *); + +extern void apic_send_IPI_self(int vector); +enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; +extern enum uv_system_type get_uv_system_type(void); +extern int is_uv_system(void); + +extern struct genapic apic_x2apic_uv_x; +DECLARE_PER_CPU(int, x2apic_extra_bits); +extern void uv_cpu_init(void); +extern void uv_system_init(void); +extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip); + +extern void setup_apic_routing(void); + +#endif /* _ASM_X86_GENAPIC_64_H */ diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h new file mode 100644 index 000000000000..ad3c2ed75481 --- /dev/null +++ b/arch/x86/include/asm/geode.h @@ -0,0 +1,253 @@ +/* + * AMD Geode definitions + * Copyright (C) 2006, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_X86_GEODE_H +#define _ASM_X86_GEODE_H + +#include <asm/processor.h> +#include <linux/io.h> + +/* Generic southbridge functions */ + +#define GEODE_DEV_PMS 0 +#define GEODE_DEV_ACPI 1 +#define GEODE_DEV_GPIO 2 +#define GEODE_DEV_MFGPT 3 + +extern int geode_get_dev_base(unsigned int dev); + +/* Useful macros */ +#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS) +#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI) +#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO) +#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT) + +/* MSRS */ + +#define MSR_GLIU_P2D_RO0 0x10000029 + +#define MSR_LX_GLD_MSR_CONFIG 0x48002001 +#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data + * sheet has the wrong value */ +#define MSR_GLCP_SYS_RSTPLL 0x4C000014 +#define MSR_GLCP_DOTPLL 0x4C000015 + +#define MSR_LBAR_SMB 0x5140000B +#define MSR_LBAR_GPIO 0x5140000C +#define MSR_LBAR_MFGPT 0x5140000D +#define MSR_LBAR_ACPI 0x5140000E +#define MSR_LBAR_PMS 0x5140000F + +#define MSR_DIVIL_SOFT_RESET 0x51400017 + +#define MSR_PIC_YSEL_LOW 0x51400020 +#define MSR_PIC_YSEL_HIGH 0x51400021 +#define MSR_PIC_ZSEL_LOW 0x51400022 +#define MSR_PIC_ZSEL_HIGH 0x51400023 +#define MSR_PIC_IRQM_LPC 0x51400025 + +#define MSR_MFGPT_IRQ 0x51400028 +#define MSR_MFGPT_NR 0x51400029 +#define MSR_MFGPT_SETUP 0x5140002B + +#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */ + +#define MSR_GX_GLD_MSR_CONFIG 0xC0002001 +#define MSR_GX_MSR_PADSEL 0xC0002011 + +/* Resource Sizes */ + +#define LBAR_GPIO_SIZE 0xFF +#define LBAR_MFGPT_SIZE 0x40 +#define LBAR_ACPI_SIZE 0x40 +#define LBAR_PMS_SIZE 0x80 + +/* ACPI registers (PMS block) */ + +/* + * PM1_EN is only valid when VSA is enabled for 16 bit reads. + * When VSA is not enabled, *always* read both PM1_STS and PM1_EN + * with a 32 bit read at offset 0x0 + */ + +#define PM1_STS 0x00 +#define PM1_EN 0x02 +#define PM1_CNT 0x08 +#define PM2_CNT 0x0C +#define PM_TMR 0x10 +#define PM_GPE0_STS 0x18 +#define PM_GPE0_EN 0x1C + +/* PMC registers (PMS block) */ + +#define PM_SSD 0x00 +#define PM_SCXA 0x04 +#define PM_SCYA 0x08 +#define PM_OUT_SLPCTL 0x0C +#define PM_SCLK 0x10 +#define PM_SED 0x1 +#define PM_SCXD 0x18 +#define PM_SCYD 0x1C +#define PM_IN_SLPCTL 0x20 +#define PM_WKD 0x30 +#define PM_WKXD 0x34 +#define PM_RD 0x38 +#define PM_WKXA 0x3C +#define PM_FSD 0x40 +#define PM_TSD 0x44 +#define PM_PSD 0x48 +#define PM_NWKD 0x4C +#define PM_AWKD 0x50 +#define PM_SSC 0x54 + +/* VSA2 magic values */ + +#define VSA_VRC_INDEX 0xAC1C +#define VSA_VRC_DATA 0xAC1E +#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */ +#define VSA_VR_SIGNATURE 0x0003 +#define VSA_VR_MEM_SIZE 0x0200 +#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */ +#define GSW_VSA_SIG 0x534d /* General Software signature */ +/* GPIO */ + +#define GPIO_OUTPUT_VAL 0x00 +#define GPIO_OUTPUT_ENABLE 0x04 +#define GPIO_OUTPUT_OPEN_DRAIN 0x08 +#define GPIO_OUTPUT_INVERT 0x0C +#define GPIO_OUTPUT_AUX1 0x10 +#define GPIO_OUTPUT_AUX2 0x14 +#define GPIO_PULL_UP 0x18 +#define GPIO_PULL_DOWN 0x1C +#define GPIO_INPUT_ENABLE 0x20 +#define GPIO_INPUT_INVERT 0x24 +#define GPIO_INPUT_FILTER 0x28 +#define GPIO_INPUT_EVENT_COUNT 0x2C +#define GPIO_READ_BACK 0x30 +#define GPIO_INPUT_AUX1 0x34 +#define GPIO_EVENTS_ENABLE 0x38 +#define GPIO_LOCK_ENABLE 0x3C +#define GPIO_POSITIVE_EDGE_EN 0x40 +#define GPIO_NEGATIVE_EDGE_EN 0x44 +#define GPIO_POSITIVE_EDGE_STS 0x48 +#define GPIO_NEGATIVE_EDGE_STS 0x4C + +#define GPIO_MAP_X 0xE0 +#define GPIO_MAP_Y 0xE4 +#define GPIO_MAP_Z 0xE8 
+#define GPIO_MAP_W 0xEC + +static inline u32 geode_gpio(unsigned int nr) +{ + BUG_ON(nr > 28); + return 1 << nr; +} + +extern void geode_gpio_set(u32, unsigned int); +extern void geode_gpio_clear(u32, unsigned int); +extern int geode_gpio_isset(u32, unsigned int); +extern void geode_gpio_setup_event(unsigned int, int, int); +extern void geode_gpio_set_irq(unsigned int, unsigned int); + +static inline void geode_gpio_event_irq(unsigned int gpio, int pair) +{ + geode_gpio_setup_event(gpio, pair, 0); +} + +static inline void geode_gpio_event_pme(unsigned int gpio, int pair) +{ + geode_gpio_setup_event(gpio, pair, 1); +} + +/* Specific geode tests */ + +static inline int is_geode_gx(void) +{ + return ((boot_cpu_data.x86_vendor == X86_VENDOR_NSC) && + (boot_cpu_data.x86 == 5) && + (boot_cpu_data.x86_model == 5)); +} + +static inline int is_geode_lx(void) +{ + return ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 == 5) && + (boot_cpu_data.x86_model == 10)); +} + +static inline int is_geode(void) +{ + return (is_geode_gx() || is_geode_lx()); +} + +#ifdef CONFIG_MGEODE_LX +extern int geode_has_vsa2(void); +#else +static inline int geode_has_vsa2(void) +{ + return 0; +} +#endif + +/* MFGPTs */ + +#define MFGPT_MAX_TIMERS 8 +#define MFGPT_TIMER_ANY (-1) + +#define MFGPT_DOMAIN_WORKING 1 +#define MFGPT_DOMAIN_STANDBY 2 +#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY) + +#define MFGPT_CMP1 0 +#define MFGPT_CMP2 1 + +#define MFGPT_EVENT_IRQ 0 +#define MFGPT_EVENT_NMI 1 +#define MFGPT_EVENT_RESET 3 + +#define MFGPT_REG_CMP1 0 +#define MFGPT_REG_CMP2 2 +#define MFGPT_REG_COUNTER 4 +#define MFGPT_REG_SETUP 6 + +#define MFGPT_SETUP_CNTEN (1 << 15) +#define MFGPT_SETUP_CMP2 (1 << 14) +#define MFGPT_SETUP_CMP1 (1 << 13) +#define MFGPT_SETUP_SETUP (1 << 12) +#define MFGPT_SETUP_STOPEN (1 << 11) +#define MFGPT_SETUP_EXTEN (1 << 10) +#define MFGPT_SETUP_REVEN (1 << 5) +#define MFGPT_SETUP_CLKSEL (1 << 4) + +static inline void geode_mfgpt_write(int timer, u16 reg, u16 value) +{ + u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); + outw(value, base + reg + (timer * 8)); +} + +static inline u16 geode_mfgpt_read(int timer, u16 reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); + return inw(base + reg + (timer * 8)); +} + +extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable); +extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable); +extern int geode_mfgpt_alloc_timer(int timer, int domain); + +#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1) +#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0) + +#ifdef CONFIG_GEODE_MFGPT_TIMER +extern int __init mfgpt_timer_setup(void); +#else +static inline int mfgpt_timer_setup(void) { return 0; } +#endif + +#endif /* _ASM_X86_GEODE_H */ diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h new file mode 100644 index 000000000000..49dbfdfa50f9 --- /dev/null +++ b/arch/x86/include/asm/gpio.h @@ -0,0 +1,56 @@ +/* + * Generic GPIO API implementation for x86. + * + * Derived from the generic GPIO API for powerpc: + * + * Copyright (c) 2007-2008 MontaVista Software, Inc. + * + * Author: Anton Vorontsov <avorontsov@ru.mvista.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef _ASM_X86_GPIO_H +#define _ASM_X86_GPIO_H + +#include <asm-generic/gpio.h> + +#ifdef CONFIG_GPIOLIB + +/* + * Just call gpiolib. + */ +static inline int gpio_get_value(unsigned int gpio) +{ + return __gpio_get_value(gpio); +} + +static inline void gpio_set_value(unsigned int gpio, int value) +{ + __gpio_set_value(gpio, value); +} + +static inline int gpio_cansleep(unsigned int gpio) +{ + return __gpio_cansleep(gpio); +} + +/* + * Not implemented, yet. + */ +static inline int gpio_to_irq(unsigned int gpio) +{ + return -ENOSYS; +} + +static inline int irq_to_gpio(unsigned int irq) +{ + return -EINVAL; +} + +#endif /* CONFIG_GPIOLIB */ + +#endif /* _ASM_X86_GPIO_H */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h new file mode 100644 index 000000000000..000787df66e6 --- /dev/null +++ b/arch/x86/include/asm/hardirq.h @@ -0,0 +1,11 @@ +#ifdef CONFIG_X86_32 +# include "hardirq_32.h" +#else +# include "hardirq_64.h" +#endif + +extern u64 arch_irq_stat_cpu(unsigned int cpu); +#define arch_irq_stat_cpu arch_irq_stat_cpu + +extern u64 arch_irq_stat(void); +#define arch_irq_stat arch_irq_stat diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h new file mode 100644 index 000000000000..5ca135e72f2b --- /dev/null +++ b/arch/x86/include/asm/hardirq_32.h @@ -0,0 +1,28 @@ +#ifndef _ASM_X86_HARDIRQ_32_H +#define _ASM_X86_HARDIRQ_32_H + +#include <linux/threads.h> +#include <linux/irq.h> + +typedef struct { + unsigned int __softirq_pending; + unsigned long idle_timestamp; + unsigned int __nmi_count; /* arch dependent */ + unsigned int apic_timer_irqs; /* arch dependent */ + unsigned int irq0_irqs; + unsigned int irq_resched_count; + unsigned int irq_call_count; + unsigned int irq_tlb_count; + unsigned int irq_thermal_count; + unsigned int irq_spurious_count; +} ____cacheline_aligned irq_cpustat_t; + +DECLARE_PER_CPU(irq_cpustat_t, irq_stat); + +#define __ARCH_IRQ_STAT +#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member) + +void ack_bad_irq(unsigned int irq); +#include <linux/irq_cpustat.h> + +#endif /* _ASM_X86_HARDIRQ_32_H */ diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h new file mode 100644 index 000000000000..1ba381fc51d3 --- /dev/null +++ b/arch/x86/include/asm/hardirq_64.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_HARDIRQ_64_H +#define _ASM_X86_HARDIRQ_64_H + +#include <linux/threads.h> +#include <linux/irq.h> +#include <asm/pda.h> +#include <asm/apic.h> + +/* We can have at most NR_VECTORS irqs routed to a cpu at a time */ +#define MAX_HARDIRQS_PER_CPU NR_VECTORS + +#define __ARCH_IRQ_STAT 1 + +#define local_softirq_pending() read_pda(__softirq_pending) + +#define __ARCH_SET_SOFTIRQ_PENDING 1 + +#define set_softirq_pending(x) write_pda(__softirq_pending, (x)) +#define or_softirq_pending(x) or_pda(__softirq_pending, (x)) + +extern void ack_bad_irq(unsigned int irq); + +#endif /* _ASM_X86_HARDIRQ_64_H */ diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h new file mode 100644 index 000000000000..a3b3b7c3027b --- /dev/null +++ b/arch/x86/include/asm/highmem.h @@ -0,0 +1,82 @@ +/* + * highmem.h: virtual kernel memory mappings for high memory + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terabyte physical memory. 
With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#ifndef _ASM_X86_HIGHMEM_H +#define _ASM_X86_HIGHMEM_H + +#ifdef __KERNEL__ + +#include <linux/interrupt.h> +#include <linux/threads.h> +#include <asm/kmap_types.h> +#include <asm/tlbflush.h> +#include <asm/paravirt.h> + +/* declarations for highmem.c */ +extern unsigned long highstart_pfn, highend_pfn; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +/* + * Ordering is: + * + * FIXADDR_TOP + * fixed_addresses + * FIXADDR_START + * temp fixed addresses + * FIXADDR_BOOT_START + * Persistent kmap area + * PKMAP_BASE + * VMALLOC_END + * Vmalloc area + * VMALLOC_START + * high_memory + */ +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +extern void *kmap_high(struct page *page); +extern void kunmap_high(struct page *page); + +void *kmap(struct page *page); +void kunmap(struct page *page); +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); +void *kmap_atomic(struct page *page, enum km_type type); +void kunmap_atomic(void *kvaddr, enum km_type type); +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +struct page *kmap_atomic_to_page(void *ptr); + +#ifndef CONFIG_PARAVIRT +#define kmap_atomic_pte(page, type) kmap_atomic(page, type) +#endif + +#define flush_cache_kmaps() do { } while (0) + +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_HIGHMEM_H */ diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h new file mode 100644 index 000000000000..1c22cb05ad6a --- /dev/null +++ b/arch/x86/include/asm/hpet.h @@ -0,0 +1,114 @@ +#ifndef _ASM_X86_HPET_H +#define _ASM_X86_HPET_H + +#include <linux/msi.h> + +#ifdef CONFIG_HPET_TIMER + +#define HPET_MMAP_SIZE 1024 + +#define HPET_ID 0x000 +#define HPET_PERIOD 0x004 +#define HPET_CFG 0x010 +#define HPET_STATUS 0x020 +#define HPET_COUNTER 0x0f0 + +#define HPET_Tn_CFG(n) (0x100 + 0x20 * n) +#define HPET_Tn_CMP(n) (0x108 + 0x20 * n) +#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n) + +#define HPET_T0_CFG 0x100 +#define HPET_T0_CMP 0x108 +#define HPET_T0_ROUTE 0x110 +#define HPET_T1_CFG 0x120 +#define HPET_T1_CMP 0x128 +#define HPET_T1_ROUTE 0x130 +#define HPET_T2_CFG 0x140 +#define HPET_T2_CMP 0x148 +#define HPET_T2_ROUTE 0x150 + +#define HPET_ID_REV 0x000000ff +#define HPET_ID_NUMBER 0x00001f00 +#define HPET_ID_64BIT 0x00002000 +#define HPET_ID_LEGSUP 0x00008000 +#define HPET_ID_VENDOR 0xffff0000 +#define HPET_ID_NUMBER_SHIFT 8 +#define HPET_ID_VENDOR_SHIFT 16 + +#define HPET_ID_VENDOR_8086 0x8086 + +#define HPET_CFG_ENABLE 0x001 +#define HPET_CFG_LEGACY 0x002 +#define HPET_LEGACY_8254 2 +#define HPET_LEGACY_RTC 8 + +#define HPET_TN_LEVEL 0x0002 +#define HPET_TN_ENABLE 0x0004 +#define HPET_TN_PERIODIC 0x0008 +#define HPET_TN_PERIODIC_CAP 0x0010 +#define HPET_TN_64BIT_CAP 0x0020 +#define HPET_TN_SETVAL 0x0040 +#define HPET_TN_32BIT 0x0100 +#define HPET_TN_ROUTE 0x3e00 +#define HPET_TN_FSB 0x4000 +#define HPET_TN_FSB_CAP 0x8000 +#define HPET_TN_ROUTE_SHIFT 9 + +/* Max HPET Period is 10^8 femto sec as in HPET spec */ +#define HPET_MAX_PERIOD 100000000UL 
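+
+/*
+ * Editorial note (not part of the original patch): the HPET_PERIOD
+ * register holds the counter period in femtoseconds, so the counter
+ * frequency follows directly, e.g.:
+ *
+ *	hz = 1000000000000000ULL / hpet_readl(HPET_PERIOD);
+ *
+ * a typical period of 69841279 fs works out to about 14.318 MHz.
+ */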
+
+/*
+ * Min HPET period is 10^5 femtoseconds, just for safety. If it were
+ * less than this, the 32-bit HPET counter would wrap around in less
+ * than 0.5 sec.
+ */
+#define HPET_MIN_PERIOD 100000UL
+
+/* hpet memory map physical address */
+extern unsigned long hpet_address;
+extern unsigned long force_hpet_address;
+extern int hpet_force_user;
+extern int is_hpet_enabled(void);
+extern int hpet_enable(void);
+extern void hpet_disable(void);
+extern unsigned long hpet_readl(unsigned long a);
+extern void force_hpet_resume(void);
+
+extern void hpet_msi_unmask(unsigned int irq);
+extern void hpet_msi_mask(unsigned int irq);
+extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
+extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+
+#ifdef CONFIG_PCI_MSI
+extern int arch_setup_hpet_msi(unsigned int irq);
+#else
+static inline int arch_setup_hpet_msi(unsigned int irq)
+{
+	return -EINVAL;
+}
+#endif
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+#include <linux/interrupt.h>
+
+typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
+extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+			       unsigned char sec);
+extern int hpet_set_periodic_freq(unsigned long freq);
+extern int hpet_rtc_dropped_irq(void);
+extern int hpet_rtc_timer_init(void);
+extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
+extern int hpet_register_irq_handler(rtc_irq_handler handler);
+extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
+
+#endif /* CONFIG_HPET_EMULATE_RTC */
+
+#else /* CONFIG_HPET_TIMER */
+
+static inline int hpet_enable(void) { return 0; }
+static inline int is_hpet_enabled(void) { return 0; }
+#define hpet_readl(a) 0
+
+#endif
+#endif /* _ASM_X86_HPET_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
new file mode 100644
index 000000000000..439a9acc132d
--- /dev/null
+++ b/arch/x86/include/asm/hugetlb.h
@@ -0,0 +1,93 @@
+#ifndef _ASM_X86_HUGETLB_H
+#define _ASM_X86_HUGETLB_H
+
+#include <asm/page.h>
+
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+					 unsigned long addr,
+					 unsigned long len) {
+	return 0;
+}
+
+/*
+ * If the arch doesn't supply something else, assume that hugepage
+ * size aligned regions are ok without further preparation.
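+ *
+ * Editorial illustration (not part of the original patch): with 2 MB
+ * huge pages, huge_page_mask(h) is ~0x1fffff, so the checks below
+ * reject e.g. addr == 0x40100000 (not 2 MB aligned) and accept
+ * addr == 0x40200000 with a len that is a multiple of 2 MB.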
+ */ +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) +{ + struct hstate *h = hstate_file(file); + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (addr & ~huge_page_mask(h)) + return -EINVAL; + return 0; +} + +static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { +} + +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, + unsigned long ceiling) +{ + free_pgd_range(tlb, addr, end, floor, ceiling); +} + +static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_pte_at(mm, addr, ptep, pte); +} + +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + return ptep_get_and_clear(mm, addr, ptep); +} + +static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ +} + +static inline int huge_pte_none(pte_t pte) +{ + return pte_none(pte); +} + +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} + +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + ptep_set_wrprotect(mm, addr, ptep); +} + +static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); +} + +static inline pte_t huge_ptep_get(pte_t *ptep) +{ + return *ptep; +} + +static inline int arch_prepare_hugepage(struct page *page) +{ + return 0; +} + +static inline void arch_release_hugepage(struct page *page) +{ +} + +#endif /* _ASM_X86_HUGETLB_H */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h new file mode 100644 index 000000000000..b97aecb0b61d --- /dev/null +++ b/arch/x86/include/asm/hw_irq.h @@ -0,0 +1,131 @@ +#ifndef _ASM_X86_HW_IRQ_H +#define _ASM_X86_HW_IRQ_H + +/* + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar + * + * moved some of the old arch/i386/kernel/irq.h to here. VY + * + * IRQ/IPI changes taken from work by Thomas Radke + * <tomsoft@informatik.tu-chemnitz.de> + * + * hacked by Andi Kleen for x86-64. 
+ * unified by tglx + */ + +#include <asm/irq_vectors.h> + +#ifndef __ASSEMBLY__ + +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/smp.h> + +#include <asm/atomic.h> +#include <asm/irq.h> +#include <asm/sections.h> + +#define platform_legacy_irq(irq) ((irq) < 16) + +/* Interrupt handlers registered during init_IRQ */ +extern void apic_timer_interrupt(void); +extern void error_interrupt(void); +extern void spurious_interrupt(void); +extern void thermal_interrupt(void); +extern void reschedule_interrupt(void); + +extern void invalidate_interrupt(void); +extern void invalidate_interrupt0(void); +extern void invalidate_interrupt1(void); +extern void invalidate_interrupt2(void); +extern void invalidate_interrupt3(void); +extern void invalidate_interrupt4(void); +extern void invalidate_interrupt5(void); +extern void invalidate_interrupt6(void); +extern void invalidate_interrupt7(void); + +extern void irq_move_cleanup_interrupt(void); +extern void threshold_interrupt(void); + +extern void call_function_interrupt(void); +extern void call_function_single_interrupt(void); + +/* PIC specific functions */ +extern void disable_8259A_irq(unsigned int irq); +extern void enable_8259A_irq(unsigned int irq); +extern int i8259A_irq_pending(unsigned int irq); +extern void make_8259A_irq(unsigned int irq); +extern void init_8259A(int aeoi); + +/* IOAPIC */ +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) +extern unsigned long io_apic_irqs; + +extern void init_VISWS_APIC_irqs(void); +extern void setup_IO_APIC(void); +extern void disable_IO_APIC(void); +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); +extern void setup_ioapic_dest(void); + +#ifdef CONFIG_X86_64 +extern void enable_IO_APIC(void); +#endif + +/* IPI functions */ +#ifdef CONFIG_X86_32 +extern void send_IPI_self(int vector); +#endif +extern void send_IPI(int dest, int vector); + +/* Statistics */ +extern atomic_t irq_err_count; +extern atomic_t irq_mis_count; + +/* EISA */ +extern void eisa_set_level_irq(unsigned int irq); + +/* Voyager functions */ +extern asmlinkage void vic_cpi_interrupt(void); +extern asmlinkage void vic_sys_interrupt(void); +extern asmlinkage void vic_cmn_interrupt(void); +extern asmlinkage void qic_timer_interrupt(void); +extern asmlinkage void qic_invalidate_interrupt(void); +extern asmlinkage void qic_reschedule_interrupt(void); +extern asmlinkage void qic_enable_irq_interrupt(void); +extern asmlinkage void qic_call_function_interrupt(void); + +/* SMP */ +extern void smp_apic_timer_interrupt(struct pt_regs *); +extern void smp_spurious_interrupt(struct pt_regs *); +extern void smp_error_interrupt(struct pt_regs *); +#ifdef CONFIG_X86_SMP +extern void smp_reschedule_interrupt(struct pt_regs *); +extern void smp_call_function_interrupt(struct pt_regs *); +extern void smp_call_function_single_interrupt(struct pt_regs *); +#ifdef CONFIG_X86_32 +extern void smp_invalidate_interrupt(struct pt_regs *); +#else +extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); +#endif +#endif + +#ifdef CONFIG_X86_32 +extern void (*const interrupt[NR_VECTORS])(void); +#endif + +typedef int vector_irq_t[NR_VECTORS]; +DECLARE_PER_CPU(vector_irq_t, vector_irq); + +#ifdef CONFIG_X86_IO_APIC +extern void lock_vector_lock(void); +extern void unlock_vector_lock(void); +extern void __setup_vector_irq(int cpu); +#else +static inline void lock_vector_lock(void) {} +static inline void unlock_vector_lock(void) {} +static inline void __setup_vector_irq(int cpu) {} +#endif + +#endif /* 
!__ASSEMBLY__ */
+
+#endif /* _ASM_X86_HW_IRQ_H */
diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h
new file mode 100644
index 000000000000..334b1a885d9c
--- /dev/null
+++ b/arch/x86/include/asm/hypertransport.h
@@ -0,0 +1,45 @@
+#ifndef _ASM_X86_HYPERTRANSPORT_H
+#define _ASM_X86_HYPERTRANSPORT_H
+
+/*
+ * Constants for x86 Hypertransport Interrupts.
+ */
+
+#define HT_IRQ_LOW_BASE			0xf8000000
+
+#define HT_IRQ_LOW_VECTOR_SHIFT		16
+#define HT_IRQ_LOW_VECTOR_MASK		0x00ff0000
+#define HT_IRQ_LOW_VECTOR(v)						\
+	(((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK)
+
+#define HT_IRQ_LOW_DEST_ID_SHIFT	8
+#define HT_IRQ_LOW_DEST_ID_MASK		0x0000ff00
+#define HT_IRQ_LOW_DEST_ID(v)						\
+	(((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK)
+
+#define HT_IRQ_LOW_DM_PHYSICAL		0x0000000
+#define HT_IRQ_LOW_DM_LOGICAL		0x0000040
+
+#define HT_IRQ_LOW_RQEOI_EDGE		0x0000000
+#define HT_IRQ_LOW_RQEOI_LEVEL		0x0000020
+
+
+#define HT_IRQ_LOW_MT_FIXED		0x0000000
+#define HT_IRQ_LOW_MT_ARBITRATED	0x0000004
+#define HT_IRQ_LOW_MT_SMI		0x0000008
+#define HT_IRQ_LOW_MT_NMI		0x000000c
+#define HT_IRQ_LOW_MT_INIT		0x0000010
+#define HT_IRQ_LOW_MT_STARTUP		0x0000014
+#define HT_IRQ_LOW_MT_EXTINT		0x0000018
+#define HT_IRQ_LOW_MT_LINT1		0x000008c
+#define HT_IRQ_LOW_MT_LINT0		0x0000098
+
+#define HT_IRQ_LOW_IRQ_MASKED		0x0000001
+
+
+#define HT_IRQ_HIGH_DEST_ID_SHIFT	0
+#define HT_IRQ_HIGH_DEST_ID_MASK	0x00ffffff
+#define HT_IRQ_HIGH_DEST_ID(v)						\
+	((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK)
+
+#endif /* _ASM_X86_HYPERTRANSPORT_H */
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
new file mode 100644
index 000000000000..48f0004db8c9
--- /dev/null
+++ b/arch/x86/include/asm/i387.h
@@ -0,0 +1,400 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_I387_H
+#define _ASM_X86_I387_H
+
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/regset.h>
+#include <linux/hardirq.h>
+#include <asm/asm.h>
+#include <asm/processor.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/uaccess.h>
+#include <asm/xsave.h>
+
+extern unsigned int sig_xstate_size;
+extern void fpu_init(void);
+extern void mxcsr_feature_mask_init(void);
+extern int init_fpu(struct task_struct *child);
+extern asmlinkage void math_state_restore(void);
+extern void init_thread_xstate(void);
+extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
+
+extern user_regset_active_fn fpregs_active, xfpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
+
+extern struct _fpx_sw_bytes fx_sw_reserved;
+#ifdef CONFIG_IA32_EMULATION
+extern unsigned int sig_xstate_ia32_size;
+extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
+struct _fpstate_ia32;
+struct _xstate_ia32;
+extern int save_i387_xstate_ia32(void __user *buf);
+extern int restore_i387_xstate_ia32(void __user *buf);
+#endif
+
+#define X87_FSW_ES (1 << 7)	/* Exception Summary */
+
+#ifdef CONFIG_X86_64
+
+/* Ignore delayed exceptions from user space */
+static inline void tolerant_fwait(void)
+{
+	asm volatile("1: fwait\n"
+		     "2:\n"
+		     _ASM_EXTABLE(1b, 2b));
+}
+
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
+{
+	int err;
+
+	asm volatile("1: rex64/fxrstor (%[fx])\n\t"
+		     "2:\n"
+
".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) +#if 0 /* See comment in __save_init_fpu() below. */ + : [fx] "r" (fx), "m" (*fx), "0" (0)); +#else + : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); +#endif + return err; +} + +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) + return xrstor_checking(&tsk->thread.xstate->xsave); + else + return fxrstor_checking(&tsk->thread.xstate->fxsave); +} + +/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. The kernel data segment can be sometimes 0 and sometimes + new user value. Both should be ok. + Use the PDA as safe address because it should be already in L1. */ +static inline void clear_fpu_state(struct task_struct *tsk) +{ + struct xsave_struct *xstate = &tsk->thread.xstate->xsave; + struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + + /* + * xsave header may indicate the init state of the FP. + */ + if ((task_thread_info(tsk)->status & TS_XSAVE) && + !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) + return; + + if (unlikely(fx->swd & X87_FSW_ES)) + asm volatile("fnclex"); + alternative_input(ASM_NOP8 ASM_NOP2, + " emms\n" /* clear stack tags */ + " fildl %%gs:0", /* load to clear state */ + X86_FEATURE_FXSAVE_LEAK); +} + +static inline int fxsave_user(struct i387_fxsave_struct __user *fx) +{ + int err; + + asm volatile("1: rex64/fxsave (%[fx])\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), "=m" (*fx) +#if 0 /* See comment in __fxsave_clear() below. */ + : [fx] "r" (fx), "0" (0)); +#else + : [fx] "cdaSDb" (fx), "0" (0)); +#endif + if (unlikely(err) && + __clear_user(fx, sizeof(struct i387_fxsave_struct))) + err = -EFAULT; + /* No need to clear here because the caller clears USED_MATH */ + return err; +} + +static inline void fxsave(struct task_struct *tsk) +{ + /* Using "rex64; fxsave %0" is broken because, if the memory operand + uses any extended registers for addressing, a second REX prefix + will be generated (to the assembler, rex64 followed by semicolon + is a separate instruction), and hence the 64-bitness is lost. */ +#if 0 + /* Using "fxsaveq %0" would be the ideal choice, but is only supported + starting with gas 2.16. */ + __asm__ __volatile__("fxsaveq %0" + : "=m" (tsk->thread.xstate->fxsave)); +#elif 0 + /* Using, as a workaround, the properly prefixed form below isn't + accepted by any binutils version so far released, complaining that + the same type of prefix is used twice if an extended register is + needed for addressing (fix submitted to mainline 2005-11-21). */ + __asm__ __volatile__("rex64/fxsave %0" + : "=m" (tsk->thread.xstate->fxsave)); +#else + /* This, however, we can work around by forcing the compiler to select + an addressing mode that doesn't require extended registers. 
*/ + __asm__ __volatile__("rex64/fxsave (%1)" + : "=m" (tsk->thread.xstate->fxsave) + : "cdaSDb" (&tsk->thread.xstate->fxsave)); +#endif +} + +static inline void __save_init_fpu(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) + xsave(tsk); + else + fxsave(tsk); + + clear_fpu_state(tsk); + task_thread_info(tsk)->status &= ~TS_USEDFPU; +} + +#else /* CONFIG_X86_32 */ + +extern void finit(void); + +static inline void tolerant_fwait(void) +{ + asm volatile("fnclex ; fwait"); +} + +static inline void restore_fpu(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) { + xrstor_checking(&tsk->thread.xstate->xsave); + return; + } + /* + * The "nop" is needed to make the instructions the same + * length. + */ + alternative_input( + "nop ; frstor %1", + "fxrstor %1", + X86_FEATURE_FXSR, + "m" (tsk->thread.xstate->fxsave)); +} + +/* We need a safe address that is cheap to find and that is already + in L1 during context switch. The best choices are unfortunately + different for UP and SMP */ +#ifdef CONFIG_SMP +#define safe_address (__per_cpu_offset[0]) +#else +#define safe_address (kstat_cpu(0).cpustat.user) +#endif + +/* + * These must be called with preempt disabled + */ +static inline void __save_init_fpu(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) { + struct xsave_struct *xstate = &tsk->thread.xstate->xsave; + struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + + xsave(tsk); + + /* + * xsave header may indicate the init state of the FP. + */ + if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) + goto end; + + if (unlikely(fx->swd & X87_FSW_ES)) + asm volatile("fnclex"); + + /* + * we can do a simple return here or be paranoid :) + */ + goto clear_state; + } + + /* Use more nops than strictly needed in case the compiler + varies code */ + alternative_input( + "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, + "fxsave %[fx]\n" + "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", + X86_FEATURE_FXSR, + [fx] "m" (tsk->thread.xstate->fxsave), + [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory"); +clear_state: + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. safe_address is a random variable that should be in L1 */ + alternative_input( + GENERIC_NOP8 GENERIC_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (safe_address)); +end: + task_thread_info(tsk)->status &= ~TS_USEDFPU; +} + +#endif /* CONFIG_X86_64 */ + +/* + * Signal frame handlers... + */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + +static inline void __unlazy_fpu(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + __save_init_fpu(tsk); + stts(); + } else + tsk->fpu_counter = 0; +} + +static inline void __clear_fpu(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + tolerant_fwait(); + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } +} + +static inline void kernel_fpu_begin(void) +{ + struct thread_info *me = current_thread_info(); + preempt_disable(); + if (me->status & TS_USEDFPU) + __save_init_fpu(me->task); + else + clts(); +} + +static inline void kernel_fpu_end(void) +{ + stts(); + preempt_enable(); +} + +/* + * Some instructions like VIA's padlock instructions generate a spurious + * DNA fault but don't modify SSE registers. 
And these instructions + * get used from interrupt context aswell. To prevent these kernel instructions + * in interrupt context interact wrongly with other user/kernel fpu usage, we + * should use them only in the context of irq_ts_save/restore() + */ +static inline int irq_ts_save(void) +{ + /* + * If we are in process context, we are ok to take a spurious DNA fault. + * Otherwise, doing clts() in process context require pre-emption to + * be disabled or some heavy lifting like kernel_fpu_begin() + */ + if (!in_interrupt()) + return 0; + + if (read_cr0() & X86_CR0_TS) { + clts(); + return 1; + } + + return 0; +} + +static inline void irq_ts_restore(int TS_state) +{ + if (TS_state) + stts(); +} + +#ifdef CONFIG_X86_64 + +static inline void save_init_fpu(struct task_struct *tsk) +{ + __save_init_fpu(tsk); + stts(); +} + +#define unlazy_fpu __unlazy_fpu +#define clear_fpu __clear_fpu + +#else /* CONFIG_X86_32 */ + +/* + * These disable preemption on their own and are safe + */ +static inline void save_init_fpu(struct task_struct *tsk) +{ + preempt_disable(); + __save_init_fpu(tsk); + stts(); + preempt_enable(); +} + +static inline void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + __unlazy_fpu(tsk); + preempt_enable(); +} + +static inline void clear_fpu(struct task_struct *tsk) +{ + preempt_disable(); + __clear_fpu(tsk); + preempt_enable(); +} + +#endif /* CONFIG_X86_64 */ + +/* + * i387 state interaction + */ +static inline unsigned short get_fpu_cwd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.xstate->fxsave.cwd; + } else { + return (unsigned short)tsk->thread.xstate->fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.xstate->fxsave.swd; + } else { + return (unsigned short)tsk->thread.xstate->fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) +{ + if (cpu_has_xmm) { + return tsk->thread.xstate->fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +#endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h new file mode 100644 index 000000000000..1edbf89680fd --- /dev/null +++ b/arch/x86/include/asm/i8253.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_I8253_H +#define _ASM_X86_I8253_H + +/* i8253A PIT registers */ +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 +#define PIT_CH2 0x42 + +extern spinlock_t i8253_lock; + +extern struct clock_event_device *global_clock_event; + +extern void setup_pit_timer(void); + +#define inb_pit inb_p +#define outb_pit outb_p + +#endif /* _ASM_X86_I8253_H */ diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h new file mode 100644 index 000000000000..58d7091eeb1f --- /dev/null +++ b/arch/x86/include/asm/i8259.h @@ -0,0 +1,63 @@ +#ifndef _ASM_X86_I8259_H +#define _ASM_X86_I8259_H + +#include <linux/delay.h> + +extern unsigned int cached_irq_mask; + +#define __byte(x, y) (((unsigned char *)&(y))[x]) +#define cached_master_mask (__byte(0, cached_irq_mask)) +#define cached_slave_mask (__byte(1, cached_irq_mask)) + +/* i8259A PIC registers */ +#define PIC_MASTER_CMD 0x20 +#define PIC_MASTER_IMR 0x21 +#define PIC_MASTER_ISR PIC_MASTER_CMD +#define PIC_MASTER_POLL PIC_MASTER_ISR +#define PIC_MASTER_OCW3 PIC_MASTER_ISR +#define PIC_SLAVE_CMD 0xa0 +#define PIC_SLAVE_IMR 0xa1 + +/* i8259A PIC related value */ +#define PIC_CASCADE_IR 2 +#define MASTER_ICW4_DEFAULT 0x01 +#define SLAVE_ICW4_DEFAULT 0x01 +#define PIC_ICW4_AEOI 2 + +extern spinlock_t 
i8259A_lock; + +extern void init_8259A(int auto_eoi); +extern void enable_8259A_irq(unsigned int irq); +extern void disable_8259A_irq(unsigned int irq); +extern unsigned int startup_8259A_irq(unsigned int irq); + +/* the PIC may need a careful delay on some platforms, hence specific calls */ +static inline unsigned char inb_pic(unsigned int port) +{ + unsigned char value = inb(port); + + /* + * delay for some accesses to PIC on motherboard or in chipset + * must be at least one microsecond, so be safe here: + */ + udelay(2); + + return value; +} + +static inline void outb_pic(unsigned char value, unsigned int port) +{ + outb(value, port); + /* + * delay for some accesses to PIC on motherboard or in chipset + * must be at least one microsecond, so be safe here: + */ + udelay(2); +} + +extern struct irq_chip i8259A_chip; + +extern void mask_8259A(void); +extern void unmask_8259A(void); + +#endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h new file mode 100644 index 000000000000..97989c0e534c --- /dev/null +++ b/arch/x86/include/asm/ia32.h @@ -0,0 +1,170 @@ +#ifndef _ASM_X86_IA32_H +#define _ASM_X86_IA32_H + + +#ifdef CONFIG_IA32_EMULATION + +#include <linux/compat.h> + +/* + * 32 bit structures for IA32 support. + */ + +#include <asm/sigcontext32.h> + +/* signal.h */ +struct sigaction32 { + unsigned int sa_handler; /* Really a pointer, but need to deal + with 32 bits */ + unsigned int sa_flags; + unsigned int sa_restorer; /* Another 32 bit pointer */ + compat_sigset_t sa_mask; /* A 32 bit mask */ +}; + +struct old_sigaction32 { + unsigned int sa_handler; /* Really a pointer, but need to deal + with 32 bits */ + compat_old_sigset_t sa_mask; /* A 32 bit mask */ + unsigned int sa_flags; + unsigned int sa_restorer; /* Another 32 bit pointer */ +}; + +typedef struct sigaltstack_ia32 { + unsigned int ss_sp; + int ss_flags; + unsigned int ss_size; +} stack_ia32_t; + +struct ucontext_ia32 { + unsigned int uc_flags; + unsigned int uc_link; + stack_ia32_t uc_stack; + struct sigcontext_ia32 uc_mcontext; + compat_sigset_t uc_sigmask; /* mask last for extensibility */ +}; + +/* This matches struct stat64 in glibc2.2, hence the absolutely + * insane amounts of padding around dev_t's. 
+ */ +struct stat64 { + unsigned long long st_dev; + unsigned char __pad0[4]; + +#define STAT64_HAS_BROKEN_ST_INO 1 + unsigned int __st_ino; + + unsigned int st_mode; + unsigned int st_nlink; + + unsigned int st_uid; + unsigned int st_gid; + + unsigned long long st_rdev; + unsigned char __pad3[4]; + + long long st_size; + unsigned int st_blksize; + + long long st_blocks;/* Number 512-byte blocks allocated */ + + unsigned st_atime; + unsigned st_atime_nsec; + unsigned st_mtime; + unsigned st_mtime_nsec; + unsigned st_ctime; + unsigned st_ctime_nsec; + + unsigned long long st_ino; +} __attribute__((packed)); + +typedef struct compat_siginfo { + int si_signo; + int si_errno; + int si_code; + + union { + int _pad[((128 / sizeof(int)) - 3)]; + + /* kill() */ + struct { + unsigned int _pid; /* sender's pid */ + unsigned int _uid; /* sender's uid */ + } _kill; + + /* POSIX.1b timers */ + struct { + compat_timer_t _tid; /* timer id */ + int _overrun; /* overrun count */ + compat_sigval_t _sigval; /* same as below */ + int _sys_private; /* not to be passed to user */ + int _overrun_incr; /* amount to add to overrun */ + } _timer; + + /* POSIX.1b signals */ + struct { + unsigned int _pid; /* sender's pid */ + unsigned int _uid; /* sender's uid */ + compat_sigval_t _sigval; + } _rt; + + /* SIGCHLD */ + struct { + unsigned int _pid; /* which child */ + unsigned int _uid; /* sender's uid */ + int _status; /* exit code */ + compat_clock_t _utime; + compat_clock_t _stime; + } _sigchld; + + /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ + struct { + unsigned int _addr; /* faulting insn/memory ref. */ + } _sigfault; + + /* SIGPOLL */ + struct { + int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + int _fd; + } _sigpoll; + } _sifields; +} compat_siginfo_t; + +struct sigframe32 { + u32 pretcode; + int sig; + struct sigcontext_ia32 sc; + struct _fpstate_ia32 fpstate; + unsigned int extramask[_COMPAT_NSIG_WORDS-1]; +}; + +struct rt_sigframe32 { + u32 pretcode; + int sig; + u32 pinfo; + u32 puc; + compat_siginfo_t info; + struct ucontext_ia32 uc; + struct _fpstate_ia32 fpstate; +}; + +struct ustat32 { + __u32 f_tfree; + compat_ino_t f_tinode; + char f_fname[6]; + char f_fpack[6]; +}; + +#define IA32_STACK_TOP IA32_PAGE_OFFSET + +#ifdef __KERNEL__ +struct linux_binprm; +extern int ia32_setup_arg_pages(struct linux_binprm *bprm, + unsigned long stack_top, int exec_stack); +struct mm_struct; +extern void ia32_pick_mmap_layout(struct mm_struct *mm); + +#endif + +#endif /* !CONFIG_IA32_SUPPORT */ + +#endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h new file mode 100644 index 000000000000..976f6ecd2ce6 --- /dev/null +++ b/arch/x86/include/asm/ia32_unistd.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_IA32_UNISTD_H +#define _ASM_X86_IA32_UNISTD_H + +/* + * This file contains the system call numbers of the ia32 port, + * this is for the kernel only. + * Only add syscalls here where some part of the kernel needs to know + * the number. This should be otherwise in sync with asm-x86/unistd_32.h. 
-AK + */ + +#define __NR_ia32_restart_syscall 0 +#define __NR_ia32_exit 1 +#define __NR_ia32_read 3 +#define __NR_ia32_write 4 +#define __NR_ia32_sigreturn 119 +#define __NR_ia32_rt_sigreturn 173 + +#endif /* _ASM_X86_IA32_UNISTD_H */ diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h new file mode 100644 index 000000000000..44c89c3a23e9 --- /dev/null +++ b/arch/x86/include/asm/idle.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_IDLE_H +#define _ASM_X86_IDLE_H + +#define IDLE_START 1 +#define IDLE_END 2 + +struct notifier_block; +void idle_notifier_register(struct notifier_block *n); +void idle_notifier_unregister(struct notifier_block *n); + +void enter_idle(void); +void exit_idle(void); + +void c1e_remove_cpu(int cpu); + +#endif /* _ASM_X86_IDLE_H */ diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h new file mode 100644 index 000000000000..fa0fd068bc2e --- /dev/null +++ b/arch/x86/include/asm/intel_arch_perfmon.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H +#define _ASM_X86_INTEL_ARCH_PERFMON_H + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h new file mode 100644 index 000000000000..5618a103f395 --- /dev/null +++ b/arch/x86/include/asm/io.h @@ -0,0 +1,91 @@ +#ifndef _ASM_X86_IO_H +#define _ASM_X86_IO_H + +#define ARCH_HAS_IOREMAP_WC + +#include <linux/compiler.h> + +#define build_mmio_read(name, size, type, reg, barrier) \ +static inline type name(const volatile void __iomem *addr) \ +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +:"m" (*(volatile type __force *)addr) barrier); return ret; } + +#define build_mmio_write(name, size, type, reg, barrier) \ +static inline void name(type val, volatile void __iomem *addr) \ +{ asm volatile("mov" size " %0,%1": :reg (val), \ +"m" (*(volatile type __force *)addr) barrier); } + +build_mmio_read(readb, "b", unsigned char, "=q", :"memory") +build_mmio_read(readw, "w", unsigned short, "=r", :"memory") +build_mmio_read(readl, "l", unsigned int, "=r", :"memory") + +build_mmio_read(__readb, "b", unsigned char, "=q", ) +build_mmio_read(__readw, "w", unsigned short, "=r", ) +build_mmio_read(__readl, "l", unsigned int, "=r", ) + +build_mmio_write(writeb, "b", unsigned char, "q", :"memory") +build_mmio_write(writew, "w", unsigned short, "r", :"memory") +build_mmio_write(writel, "l", unsigned int, "r", :"memory") + +build_mmio_write(__writeb, "b", unsigned char, "q", ) +build_mmio_write(__writew, "w", unsigned short, "r", ) +build_mmio_write(__writel, "l", unsigned int, "r", ) + +#define readb_relaxed(a) __readb(a) +#define readw_relaxed(a) __readw(a) +#define readl_relaxed(a) __readl(a) +#define __raw_readb __readb 
+#define __raw_readw __readw +#define __raw_readl __readl + +#define __raw_writeb __writeb +#define __raw_writew __writew +#define __raw_writel __writel + +#define mmiowb() barrier() + +#ifdef CONFIG_X86_64 +build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) +build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) + +#define readq_relaxed(a) __readq(a) +#define __raw_readq __readq +#define __raw_writeq writeq + +/* Let people know we have them */ +#define readq readq +#define writeq writeq +#endif + +extern int iommu_bio_merge; + +#ifdef CONFIG_X86_32 +# include "io_32.h" +#else +# include "io_64.h" +#endif + +extern void *xlate_dev_mem_ptr(unsigned long phys); +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); + +extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val); +extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); + +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + * A boot-time mapping is currently limited to at most 16 pages. + */ +extern void early_ioremap_init(void); +extern void early_ioremap_clear(void); +extern void early_ioremap_reset(void); +extern void *early_ioremap(unsigned long offset, unsigned long size); +extern void *early_memremap(unsigned long offset, unsigned long size); +extern void early_iounmap(void *addr, unsigned long size); +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); + + +#endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h new file mode 100644 index 000000000000..d8e242e1b396 --- /dev/null +++ b/arch/x86/include/asm/io_32.h @@ -0,0 +1,284 @@ +#ifndef _ASM_X86_IO_32_H +#define _ASM_X86_IO_32_H + +#include <linux/string.h> +#include <linux/compiler.h> + +/* + * This file contains the definitions for the x86 IO instructions + * inb/inw/inl/outb/outw/outl and the "string versions" of the same + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" + * versions of the single-IO instructions (inb_p/inw_p/..). + * + * This file is not meant to be obfuscating: it's just complicated + * to (a) handle it all in a way that makes gcc able to optimize it + * as well as possible and (b) trying to avoid writing the same thing + * over and over again with slight variations and possibly making a + * mistake somewhere. + */ + +/* + * Thanks to James van Artsdalen for a better timing-fix than + * the two short jumps: using outb's to a nonexistent port seems + * to guarantee better timings even on fast machines. + * + * On the other hand, I'd like to be sure of a non-existent port: + * I feel a bit unsafe about using 0x80 (should be safe, though) + * + * Linus + */ + + /* + * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. + * + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, + * isa_read[wl] and isa_write[wl] fixed + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> + */ + +#define IO_SPACE_LIMIT 0xffff + +#define XQUAD_PORTIO_BASE 0xfe400000 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. 
*/ + +#ifdef __KERNEL__ + +#include <asm-generic/iomap.h> + +#include <linux/vmalloc.h> + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +/** + * virt_to_phys - map virtual addresses to physical + * @address: address to remap + * + * The returned physical address is the physical (CPU) mapping for + * the memory address given. It is only valid to use this function on + * addresses directly mapped or allocated via kmalloc. + * + * This function does not give bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline unsigned long virt_to_phys(volatile void *address) +{ + return __pa(address); +} + +/** + * phys_to_virt - map physical address to virtual + * @address: address to remap + * + * The returned virtual address is a current CPU mapping for + * the memory address given. It is only valid to use this function on + * addresses that have a kernel mapping + * + * This function does not handle bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline void *phys_to_virt(unsigned long address) +{ + return __va(address); +} + +/* + * Change "struct page" to physical address. + */ +#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) + +/** + * ioremap - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * If the area you are trying to map is a PCI BAR you should have a + * look at pci_iomap(). + */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); + +/* + * The default ioremap() behavior is non-cached: + */ +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap_nocache(offset, size); +} + +extern void iounmap(volatile void __iomem *addr); + +/* + * ISA I/O bus memory addresses are 1:1 with the physical address. + */ +#define isa_virt_to_bus virt_to_phys +#define isa_page_to_bus page_to_phys +#define isa_bus_to_virt phys_to_virt + +/* + * However PCI ones are not necessarily 1:1 and therefore these interfaces + * are forbidden in portable PCI drivers. + * + * Allow them on x86 for legacy drivers, though. + */ +#define virt_to_bus virt_to_phys +#define bus_to_virt phys_to_virt + +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, int count) +{ + memset((void __force *)addr, val, count); +} + +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, int count) +{ + __memcpy(dst, (const void __force *)src, count); +} + +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, int count) +{ + __memcpy((void __force *)dst, src, count); +} + +/* + * ISA space is 'always mapped' on a typical x86 system, no need to + * explicitly ioremap() it. The fact that the ISA IO space is mapped + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values + * are physical addresses. 
The following constant pointer can be + * used as the IO-area pointer (it can be iounmapped as well, so the + * analogy with PCI is quite large): + */ +#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) + +/* + * Cache management + * + * This needed for two cases + * 1. Out of order aware processors + * 2. Accidentally out of order processors (PPro errata #51) + */ + +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + +static inline void flush_write_buffers(void) +{ + asm volatile("lock; addl $0,0(%%esp)": : :"memory"); +} + +#else + +#define flush_write_buffers() do { } while (0) + +#endif + +#endif /* __KERNEL__ */ + +extern void native_io_delay(void); + +extern int io_delay_type; +extern void io_delay_init(void); + +#if defined(CONFIG_PARAVIRT) +#include <asm/paravirt.h> +#else + +static inline void slow_down_io(void) +{ + native_io_delay(); +#ifdef REALLY_SLOW_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); +#endif +} + +#endif + +#define __BUILDIO(bwl, bw, type) \ +static inline void out##bwl(unsigned type value, int port) \ +{ \ + out##bwl##_local(value, port); \ +} \ + \ +static inline unsigned type in##bwl(int port) \ +{ \ + return in##bwl##_local(port); \ +} + +#define BUILDIO(bwl, bw, type) \ +static inline void out##bwl##_local(unsigned type value, int port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline unsigned type in##bwl##_local(int port) \ +{ \ + unsigned type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} \ + \ +static inline void out##bwl##_local_p(unsigned type value, int port) \ +{ \ + out##bwl##_local(value, port); \ + slow_down_io(); \ +} \ + \ +static inline unsigned type in##bwl##_local_p(int port) \ +{ \ + unsigned type value = in##bwl##_local(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +__BUILDIO(bwl, bw, type) \ + \ +static inline void out##bwl##_p(unsigned type value, int port) \ +{ \ + out##bwl(value, port); \ + slow_down_io(); \ +} \ + \ +static inline unsigned type in##bwl##_p(int port) \ +{ \ + unsigned type value = in##bwl(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +{ \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) : "d"(port)); \ +} \ + \ +static inline void ins##bwl(int port, void *addr, unsigned long count) \ +{ \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) : "d"(port)); \ +} + +BUILDIO(b, b, char) +BUILDIO(w, w, short) +BUILDIO(l, , int) + +#endif /* _ASM_X86_IO_32_H */ diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h new file mode 100644 index 000000000000..fea325a1122f --- /dev/null +++ b/arch/x86/include/asm/io_64.h @@ -0,0 +1,244 @@ +#ifndef _ASM_X86_IO_64_H +#define _ASM_X86_IO_64_H + + +/* + * This file contains the definitions for the x86 IO instructions + * inb/inw/inl/outb/outw/outl and the "string versions" of the same + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" + * versions of the single-IO instructions (inb_p/inw_p/..). + * + * This file is not meant to be obfuscating: it's just complicated + * to (a) handle it all in a way that makes gcc able to optimize it + * as well as possible and (b) trying to avoid writing the same thing + * over and over again with slight variations and possibly making a + * mistake somewhere. 
+ */ + +/* + * Thanks to James van Artsdalen for a better timing-fix than + * the two short jumps: using outb's to a nonexistent port seems + * to guarantee better timings even on fast machines. + * + * On the other hand, I'd like to be sure of a non-existent port: + * I feel a bit unsafe about using 0x80 (should be safe, though) + * + * Linus + */ + + /* + * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. + * + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, + * isa_read[wl] and isa_write[wl] fixed + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> + */ + +extern void native_io_delay(void); + +extern int io_delay_type; +extern void io_delay_init(void); + +#if defined(CONFIG_PARAVIRT) +#include <asm/paravirt.h> +#else + +static inline void slow_down_io(void) +{ + native_io_delay(); +#ifdef REALLY_SLOW_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); +#endif +} +#endif + +/* + * Talk about misusing macros.. + */ +#define __OUT1(s, x) \ +static inline void out##s(unsigned x value, unsigned short port) { + +#define __OUT2(s, s1, s2) \ +asm volatile ("out" #s " %" s1 "0,%" s2 "1" + +#ifndef REALLY_SLOW_IO +#define REALLY_SLOW_IO +#define UNSET_REALLY_SLOW_IO +#endif + +#define __OUT(s, s1, x) \ + __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ + } \ + __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ + slow_down_io(); \ +} + +#define __IN1(s) \ +static inline RETURN_TYPE in##s(unsigned short port) \ +{ \ + RETURN_TYPE _v; + +#define __IN2(s, s1, s2) \ + asm volatile ("in" #s " %" s2 "1,%" s1 "0" + +#define __IN(s, s1, i...) \ + __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ + return _v; \ + } \ + __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ + slow_down_io(); \ + return _v; } + +#ifdef UNSET_REALLY_SLOW_IO +#undef REALLY_SLOW_IO +#endif + +#define __INS(s) \ +static inline void ins##s(unsigned short port, void *addr, \ + unsigned long count) \ +{ \ + asm volatile ("rep ; ins" #s \ + : "=D" (addr), "=c" (count) \ + : "d" (port), "0" (addr), "1" (count)); \ +} + +#define __OUTS(s) \ +static inline void outs##s(unsigned short port, const void *addr, \ + unsigned long count) \ +{ \ + asm volatile ("rep ; outs" #s \ + : "=S" (addr), "=c" (count) \ + : "d" (port), "0" (addr), "1" (count)); \ +} + +#define RETURN_TYPE unsigned char +__IN(b, "") +#undef RETURN_TYPE +#define RETURN_TYPE unsigned short +__IN(w, "") +#undef RETURN_TYPE +#define RETURN_TYPE unsigned int +__IN(l, "") +#undef RETURN_TYPE + +__OUT(b, "b", char) +__OUT(w, "w", short) +__OUT(l, , int) + +__INS(b) +__INS(w) +__INS(l) + +__OUTS(b) +__OUTS(w) +__OUTS(l) + +#define IO_SPACE_LIMIT 0xffff + +#if defined(__KERNEL__) && defined(__x86_64__) + +#include <linux/vmalloc.h> + +#ifndef __i386__ +/* + * Change virtual addresses to physical addresses and vv. + * These are pretty trivial + */ +static inline unsigned long virt_to_phys(volatile void *address) +{ + return __pa(address); +} + +static inline void *phys_to_virt(unsigned long address) +{ + return __va(address); +} +#endif + +/* + * Change "struct page" to physical address. + */ +#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) + +#include <asm-generic/iomap.h> + +/* + * This one maps high address device memory and turns off caching for that area. 
+ * it's useful if some control registers are in such an area and write combining + * or read caching is not desirable: + */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); + +/* + * The default ioremap() behavior is non-cached: + */ +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap_nocache(offset, size); +} + +extern void iounmap(volatile void __iomem *addr); + +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); + +/* + * ISA I/O bus memory addresses are 1:1 with the physical address. + */ +#define isa_virt_to_bus virt_to_phys +#define isa_page_to_bus page_to_phys +#define isa_bus_to_virt phys_to_virt + +/* + * However PCI ones are not necessarily 1:1 and therefore these interfaces + * are forbidden in portable PCI drivers. + * + * Allow them on x86 for legacy drivers, though. + */ +#define virt_to_bus virt_to_phys +#define bus_to_virt phys_to_virt + +void __memcpy_fromio(void *, unsigned long, unsigned); +void __memcpy_toio(unsigned long, const void *, unsigned); + +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, + unsigned len) +{ + __memcpy_fromio(to, (unsigned long)from, len); +} + +static inline void memcpy_toio(volatile void __iomem *to, const void *from, + unsigned len) +{ + __memcpy_toio((unsigned long)to, from, len); +} + +void memset_io(volatile void __iomem *a, int b, size_t c); + +/* + * ISA space is 'always mapped' on a typical x86 system, no need to + * explicitly ioremap() it. The fact that the ISA IO space is mapped + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values + * are physical addresses. The following constant pointer can be + * used as the IO-area pointer (it can be iounmapped as well, so the + * analogy with PCI is quite large): + */ +#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) + +#define flush_write_buffers() + +#define BIO_VMERGE_BOUNDARY iommu_bio_merge + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_IO_64_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h new file mode 100644 index 000000000000..e475e009ae5d --- /dev/null +++ b/arch/x86/include/asm/io_apic.h @@ -0,0 +1,214 @@ +#ifndef _ASM_X86_IO_APIC_H +#define _ASM_X86_IO_APIC_H + +#include <linux/types.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/irq_vectors.h> + +/* + * Intel IO-APIC support for SMP and UP systems. 
+ * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar + */ + +/* I/O Unit Redirection Table */ +#define IO_APIC_REDIR_VECTOR_MASK 0x000FF +#define IO_APIC_REDIR_DEST_LOGICAL 0x00800 +#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 +#define IO_APIC_REDIR_SEND_PENDING (1 << 12) +#define IO_APIC_REDIR_REMOTE_IRR (1 << 14) +#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) +#define IO_APIC_REDIR_MASKED (1 << 16) + +/* + * The structure of the IO-APIC: + */ +union IO_APIC_reg_00 { + u32 raw; + struct { + u32 __reserved_2 : 14, + LTS : 1, + delivery_type : 1, + __reserved_1 : 8, + ID : 8; + } __attribute__ ((packed)) bits; +}; + +union IO_APIC_reg_01 { + u32 raw; + struct { + u32 version : 8, + __reserved_2 : 7, + PRQ : 1, + entries : 8, + __reserved_1 : 8; + } __attribute__ ((packed)) bits; +}; + +union IO_APIC_reg_02 { + u32 raw; + struct { + u32 __reserved_2 : 24, + arbitration : 4, + __reserved_1 : 4; + } __attribute__ ((packed)) bits; +}; + +union IO_APIC_reg_03 { + u32 raw; + struct { + u32 boot_DT : 1, + __reserved_1 : 31; + } __attribute__ ((packed)) bits; +}; + +enum ioapic_irq_destination_types { + dest_Fixed = 0, + dest_LowestPrio = 1, + dest_SMI = 2, + dest__reserved_1 = 3, + dest_NMI = 4, + dest_INIT = 5, + dest__reserved_2 = 6, + dest_ExtINT = 7 +}; + +struct IO_APIC_route_entry { + __u32 vector : 8, + delivery_mode : 3, /* 000: FIXED + * 001: lowest prio + * 111: ExtINT + */ + dest_mode : 1, /* 0: physical, 1: logical */ + delivery_status : 1, + polarity : 1, + irr : 1, + trigger : 1, /* 0: edge, 1: level */ + mask : 1, /* 0: enabled, 1: disabled */ + __reserved_2 : 15; + + __u32 __reserved_3 : 24, + dest : 8; +} __attribute__ ((packed)); + +struct IR_IO_APIC_route_entry { + __u64 vector : 8, + zero : 3, + index2 : 1, + delivery_status : 1, + polarity : 1, + irr : 1, + trigger : 1, + mask : 1, + reserved : 31, + format : 1, + index : 15; +} __attribute__ ((packed)); + +#ifdef CONFIG_X86_IO_APIC + +/* + * # of IO-APICs and # of IRQ routing registers + */ +extern int nr_ioapics; +extern int nr_ioapic_registers[MAX_IO_APICS]; + +/* + * MP-BIOS irq configuration table structures: + */ + +#define MP_MAX_IOAPIC_PIN 127 + +struct mp_config_ioapic { + unsigned long mp_apicaddr; + unsigned int mp_apicid; + unsigned char mp_type; + unsigned char mp_apicver; + unsigned char mp_flags; +}; + +struct mp_config_intsrc { + unsigned int mp_dstapic; + unsigned char mp_type; + unsigned char mp_irqtype; + unsigned short mp_irqflag; + unsigned char mp_srcbus; + unsigned char mp_srcbusirq; + unsigned char mp_dstirq; +}; + +/* I/O APIC entries */ +extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +extern int mp_irq_entries; + +/* MP IRQ source entries */ +extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* non-0 if default (table-less) MP configuration */ +extern int mpc_default_type; + +/* Older SiS APIC requires we rewrite the index register */ +extern int sis_apic_bug; + +/* 1 if "noapic" boot option passed */ +extern int skip_ioapic_setup; + +/* 1 if "noapic" boot option passed */ +extern int noioapicquirk; + +/* -1 if "noapic" boot option passed */ +extern int noioapicreroute; + +/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ +extern int timer_through_8259; + +static inline void disable_ioapic_setup(void) +{ +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif + skip_ioapic_setup = 1; +} + +/* + * If we use the IO-APIC for IRQ routing, disable automatic + * assignment of PCI IRQ's. 
+ */ +#define io_apic_assign_pci_irqs \ + (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) + +#ifdef CONFIG_ACPI +extern int io_apic_get_unique_id(int ioapic, int apic_id); +extern int io_apic_get_version(int ioapic); +extern int io_apic_get_redir_entries(int ioapic); +extern int io_apic_set_pci_routing(int ioapic, int pin, int irq, + int edge_level, int active_high_low); +#endif /* CONFIG_ACPI */ + +extern int (*ioapic_renumber_irq)(int ioapic, int irq); +extern void ioapic_init_mappings(void); + +#ifdef CONFIG_X86_64 +extern int save_mask_IO_APIC_setup(void); +extern void restore_IO_APIC_setup(void); +extern void reinit_intr_remapped_IO_APIC(int); +#endif + +extern int probe_nr_irqs(void); + +#else /* !CONFIG_X86_IO_APIC */ +#define io_apic_assign_pci_irqs 0 +static const int timer_through_8259 = 0; +static inline void ioapic_init_mappings(void) { } + +static inline int probe_nr_irqs(void) +{ + return NR_IRQS; +} +#endif + +#endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/ioctl.h b/arch/x86/include/asm/ioctl.h new file mode 100644 index 000000000000..b279fe06dfe5 --- /dev/null +++ b/arch/x86/include/asm/ioctl.h @@ -0,0 +1 @@ +#include <asm-generic/ioctl.h> diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h new file mode 100644 index 000000000000..0d5b23b7b06e --- /dev/null +++ b/arch/x86/include/asm/ioctls.h @@ -0,0 +1,94 @@ +#ifndef _ASM_X86_IOCTLS_H +#define _ASM_X86_IOCTLS_H + +#include <asm/ioctl.h> + +/* 0x54 is just a magic number to make these relatively unique ('T') */ + +#define TCGETS 0x5401 +#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */ +#define TCSETSW 0x5403 +#define TCSETSF 0x5404 +#define TCGETA 0x5405 +#define TCSETA 0x5406 +#define TCSETAW 0x5407 +#define TCSETAF 0x5408 +#define TCSBRK 0x5409 +#define TCXONC 0x540A +#define TCFLSH 0x540B +#define TIOCEXCL 0x540C +#define TIOCNXCL 0x540D +#define TIOCSCTTY 0x540E +#define TIOCGPGRP 0x540F +#define TIOCSPGRP 0x5410 +#define TIOCOUTQ 0x5411 +#define TIOCSTI 0x5412 +#define TIOCGWINSZ 0x5413 +#define TIOCSWINSZ 0x5414 +#define TIOCMGET 0x5415 +#define TIOCMBIS 0x5416 +#define TIOCMBIC 0x5417 +#define TIOCMSET 0x5418 +#define TIOCGSOFTCAR 0x5419 +#define TIOCSSOFTCAR 0x541A +#define FIONREAD 0x541B +#define TIOCINQ FIONREAD +#define TIOCLINUX 0x541C +#define TIOCCONS 0x541D +#define TIOCGSERIAL 0x541E +#define TIOCSSERIAL 0x541F +#define TIOCPKT 0x5420 +#define FIONBIO 0x5421 +#define TIOCNOTTY 0x5422 +#define TIOCSETD 0x5423 +#define TIOCGETD 0x5424 +#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ +/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */ +#define TIOCSBRK 0x5427 /* BSD compatibility */ +#define TIOCCBRK 0x5428 /* BSD compatibility */ +#define TIOCGSID 0x5429 /* Return the session ID of FD */ +#define TCGETS2 _IOR('T', 0x2A, struct termios2) +#define TCSETS2 _IOW('T', 0x2B, struct termios2) +#define TCSETSW2 _IOW('T', 0x2C, struct termios2) +#define TCSETSF2 _IOW('T', 0x2D, struct termios2) +#define TIOCGRS485 0x542E +#define TIOCSRS485 0x542F +#define TIOCGPTN _IOR('T', 0x30, unsigned int) + /* Get Pty Number (of pty-mux device) */ +#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ +#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ +#define TCSETX 0x5433 +#define TCSETXF 0x5434 +#define TCSETXW 0x5435 + +#define FIONCLEX 0x5450 +#define FIOCLEX 0x5451 +#define FIOASYNC 0x5452 +#define TIOCSERCONFIG 0x5453 +#define TIOCSERGWILD 0x5454 +#define TIOCSERSWILD 0x5455 +#define TIOCGLCKTRMIOS 0x5456 +#define 
TIOCSLCKTRMIOS 0x5457 +#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ +#define TIOCSERGETLSR 0x5459 /* Get line status register */ +#define TIOCSERGETMULTI 0x545A /* Get multiport config */ +#define TIOCSERSETMULTI 0x545B /* Set multiport config */ + +#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ +#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ +#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ +#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ +#define FIOQSIZE 0x5460 + +/* Used for packet mode */ +#define TIOCPKT_DATA 0 +#define TIOCPKT_FLUSHREAD 1 +#define TIOCPKT_FLUSHWRITE 2 +#define TIOCPKT_STOP 4 +#define TIOCPKT_START 8 +#define TIOCPKT_NOSTOP 16 +#define TIOCPKT_DOSTOP 32 + +#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ + +#endif /* _ASM_X86_IOCTLS_H */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h new file mode 100644 index 000000000000..98e28ea8cd16 --- /dev/null +++ b/arch/x86/include/asm/iommu.h @@ -0,0 +1,50 @@ +#ifndef _ASM_X86_IOMMU_H +#define _ASM_X86_IOMMU_H + +extern void pci_iommu_shutdown(void); +extern void no_iommu_init(void); +extern struct dma_mapping_ops nommu_dma_ops; +extern int force_iommu, no_iommu; +extern int iommu_detected; +extern int dmar_disabled; +extern int forbid_dac; + +extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); + +/* 10 seconds */ +#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) + +#ifdef CONFIG_GART_IOMMU +extern int gart_iommu_aperture; +extern int gart_iommu_aperture_allowed; +extern int gart_iommu_aperture_disabled; + +extern void early_gart_iommu_check(void); +extern void gart_iommu_init(void); +extern void gart_iommu_shutdown(void); +extern void __init gart_parse_options(char *); +extern void gart_iommu_hole_init(void); + +#else +#define gart_iommu_aperture 0 +#define gart_iommu_aperture_allowed 0 +#define gart_iommu_aperture_disabled 1 + +static inline void early_gart_iommu_check(void) +{ +} +static inline void gart_iommu_init(void) +{ +} +static inline void gart_iommu_shutdown(void) +{ +} +static inline void gart_parse_options(char *options) +{ +} +static inline void gart_iommu_hole_init(void) +{ +} +#endif + +#endif /* _ASM_X86_IOMMU_H */ diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h new file mode 100644 index 000000000000..ee678fd51594 --- /dev/null +++ b/arch/x86/include/asm/ipcbuf.h @@ -0,0 +1,28 @@ +#ifndef _ASM_X86_IPCBUF_H +#define _ASM_X86_IPCBUF_H + +/* + * The ipc64_perm structure for x86 architecture. + * Note extra padding because this structure is passed back and forth + * between kernel and user space. + * + * Pad space is left for: + * - 32-bit mode_t and seq + * - 2 miscellaneous 32-bit values + */ + +struct ipc64_perm { + __kernel_key_t key; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_uid32_t cuid; + __kernel_gid32_t cgid; + __kernel_mode_t mode; + unsigned short __pad1; + unsigned short seq; + unsigned short __pad2; + unsigned long __unused1; + unsigned long __unused2; +}; + +#endif /* _ASM_X86_IPCBUF_H */ diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h new file mode 100644 index 000000000000..f89dffb28aa9 --- /dev/null +++ b/arch/x86/include/asm/ipi.h @@ -0,0 +1,138 @@ +#ifndef _ASM_X86_IPI_H +#define _ASM_X86_IPI_H + +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC InterProcessor Interrupt code. 
+ * + * Moved to include file by James Cleverdon from + * arch/x86-64/kernel/smp.c + * + * Copyrights from kernel/smp.c: + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, v.2 + */ + +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/smp.h> + +/* + * the following functions deal with sending IPIs between CPUs. + * + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. + */ + +static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, + unsigned int dest) +{ + unsigned int icr = shortcut | dest; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; +} + +static inline int __prepare_ICR2(unsigned int mask) +{ + return SET_APIC_DEST_FIELD(mask); +} + +static inline void __xapic_wait_icr_idle(void) +{ + while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, + unsigned int dest) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +static inline void __send_IPI_dest_field(unsigned int mask, int vector, + unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned long query_cpu; + + /* + * Hack. The clustered APIC addressing mode doesn't allow us to send + * to an arbitrary mask, so I do a unicast to each CPU instead. + * - mbligh + */ + local_irq_save(flags); + for_each_cpu_mask_nr(query_cpu, mask) { + __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +#endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h new file mode 100644 index 000000000000..bae0eda95486 --- /dev/null +++ b/arch/x86/include/asm/irq.h @@ -0,0 +1,50 @@ +#ifndef _ASM_X86_IRQ_H +#define _ASM_X86_IRQ_H +/* + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar + * + * IRQ/IPI changes taken from work by Thomas Radke + * <tomsoft@informatik.tu-chemnitz.de> + */ + +#include <asm/apicdef.h> +#include <asm/irq_vectors.h> + +static inline int irq_canonicalize(int irq) +{ + return ((irq == 2) ? 
9 : irq); +} + +#ifdef CONFIG_X86_LOCAL_APIC +# define ARCH_HAS_NMI_WATCHDOG +#endif + +#ifdef CONFIG_4KSTACKS + extern void irq_ctx_init(int cpu); + extern void irq_ctx_exit(int cpu); +# define __ARCH_HAS_DO_SOFTIRQ +#else +# define irq_ctx_init(cpu) do { } while (0) +# define irq_ctx_exit(cpu) do { } while (0) +# ifdef CONFIG_X86_64 +# define __ARCH_HAS_DO_SOFTIRQ +# endif +#endif + +#ifdef CONFIG_IRQBALANCE +extern int irqbalance_disable(char *str); +#endif + +#ifdef CONFIG_HOTPLUG_CPU +#include <linux/cpumask.h> +extern void fixup_irqs(cpumask_t map); +#endif + +extern unsigned int do_IRQ(struct pt_regs *regs); +extern void init_IRQ(void); +extern void native_init_IRQ(void); + +/* Interrupt vector management */ +extern DECLARE_BITMAP(used_vectors, NR_VECTORS); + +#endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h new file mode 100644 index 000000000000..89c898ab298b --- /dev/null +++ b/arch/x86/include/asm/irq_regs.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "irq_regs_32.h" +#else +# include "irq_regs_64.h" +#endif diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h new file mode 100644 index 000000000000..af2f02d27fc7 --- /dev/null +++ b/arch/x86/include/asm/irq_regs_32.h @@ -0,0 +1,29 @@ +/* + * Per-cpu current frame pointer - the location of the last exception frame on + * the stack, stored in the per-cpu area. + * + * Jeremy Fitzhardinge <jeremy@goop.org> + */ +#ifndef _ASM_X86_IRQ_REGS_32_H +#define _ASM_X86_IRQ_REGS_32_H + +#include <asm/percpu.h> + +DECLARE_PER_CPU(struct pt_regs *, irq_regs); + +static inline struct pt_regs *get_irq_regs(void) +{ + return x86_read_percpu(irq_regs); +} + +static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) +{ + struct pt_regs *old_regs; + + old_regs = get_irq_regs(); + x86_write_percpu(irq_regs, new_regs); + + return old_regs; +} + +#endif /* _ASM_X86_IRQ_REGS_32_H */ diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h new file mode 100644 index 000000000000..3dd9c0b70270 --- /dev/null +++ b/arch/x86/include/asm/irq_regs_64.h @@ -0,0 +1 @@ +#include <asm-generic/irq_regs.h> diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h new file mode 100644 index 000000000000..20e1fd588dbf --- /dev/null +++ b/arch/x86/include/asm/irq_remapping.h @@ -0,0 +1,8 @@ +#ifndef _ASM_X86_IRQ_REMAPPING_H +#define _ASM_X86_IRQ_REMAPPING_H + +extern int x2apic; + +#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) + +#endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h new file mode 100644 index 000000000000..d843ed0e9b2e --- /dev/null +++ b/arch/x86/include/asm/irq_vectors.h @@ -0,0 +1,164 @@ +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H + +#include <linux/threads.h> + +#define NMI_VECTOR 0x02 + +/* + * IDT vectors usable for external interrupt sources start + * at 0x20: + */ +#define FIRST_EXTERNAL_VECTOR 0x20 + +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +#else +# define IA32_SYSCALL_VECTOR 0x80 +#endif + +/* + * Reserve the lowest usable priority level 0x20 - 0x2f for triggering + * cleanup after irq migration. + */ +#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR + +/* + * Vectors 0x30-0x3f are used for ISA interrupts. 
+ */ +#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#define IRQ1_VECTOR (IRQ0_VECTOR + 1) +#define IRQ2_VECTOR (IRQ0_VECTOR + 2) +#define IRQ3_VECTOR (IRQ0_VECTOR + 3) +#define IRQ4_VECTOR (IRQ0_VECTOR + 4) +#define IRQ5_VECTOR (IRQ0_VECTOR + 5) +#define IRQ6_VECTOR (IRQ0_VECTOR + 6) +#define IRQ7_VECTOR (IRQ0_VECTOR + 7) +#define IRQ8_VECTOR (IRQ0_VECTOR + 8) +#define IRQ9_VECTOR (IRQ0_VECTOR + 9) +#define IRQ10_VECTOR (IRQ0_VECTOR + 10) +#define IRQ11_VECTOR (IRQ0_VECTOR + 11) +#define IRQ12_VECTOR (IRQ0_VECTOR + 12) +#define IRQ13_VECTOR (IRQ0_VECTOR + 13) +#define IRQ14_VECTOR (IRQ0_VECTOR + 14) +#define IRQ15_VECTOR (IRQ0_VECTOR + 15) + +/* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff + * + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. + * + * Vectors 0xf0-0xfa are free (reserved for future Linux use). + */ +#ifdef CONFIG_X86_32 + +# define SPURIOUS_APIC_VECTOR 0xff +# define ERROR_APIC_VECTOR 0xfe +# define INVALIDATE_TLB_VECTOR 0xfd +# define RESCHEDULE_VECTOR 0xfc +# define CALL_FUNCTION_VECTOR 0xfb +# define CALL_FUNCTION_SINGLE_VECTOR 0xfa +# define THERMAL_APIC_VECTOR 0xf0 + +#else + +#define SPURIOUS_APIC_VECTOR 0xff +#define ERROR_APIC_VECTOR 0xfe +#define RESCHEDULE_VECTOR 0xfd +#define CALL_FUNCTION_VECTOR 0xfc +#define CALL_FUNCTION_SINGLE_VECTOR 0xfb +#define THERMAL_APIC_VECTOR 0xfa +#define THRESHOLD_APIC_VECTOR 0xf9 +#define UV_BAU_MESSAGE 0xf8 +#define INVALIDATE_TLB_VECTOR_END 0xf7 +#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ + +#define NUM_INVALIDATE_TLB_VECTORS 8 + +#endif + +/* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +#define LOCAL_TIMER_VECTOR 0xef + +/* + * First APIC vector available to drivers: (vectors 0x30-0xee) we + * start at 0x31(0x41) to spread out vectors evenly between priority + * levels. 
(0x80 is the syscall vector) + */ +#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) + +#define NR_VECTORS 256 + +#define FPU_IRQ 13 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) + +#ifdef CONFIG_X86_64 +# if NR_CPUS < MAX_IO_APICS +# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) +# else +# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +# endif + +#elif !defined(CONFIG_X86_VOYAGER) + +# if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) || defined(CONFIG_X86_VISWS) + +# define NR_IRQS 224 + +# else /* IO_APIC || PARAVIRT */ + +# define NR_IRQS 16 + +# endif + +#else /* !VISWS && !VOYAGER */ + +# define NR_IRQS 224 + +#endif /* VISWS */ + +/* Voyager specific defines */ +/* These define the CPIs we use in linux */ +#define VIC_CPI_LEVEL0 0 +#define VIC_CPI_LEVEL1 1 +/* now the fake CPIs */ +#define VIC_TIMER_CPI 2 +#define VIC_INVALIDATE_CPI 3 +#define VIC_RESCHEDULE_CPI 4 +#define VIC_ENABLE_IRQ_CPI 5 +#define VIC_CALL_FUNCTION_CPI 6 +#define VIC_CALL_FUNCTION_SINGLE_CPI 7 + +/* Now the QIC CPIs: Since we don't need the two initial levels, + * these are 2 less than the VIC CPIs */ +#define QIC_CPI_OFFSET 1 +#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET) +#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET) +#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) +#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) +#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) +#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET) + +#define VIC_START_FAKE_CPI VIC_TIMER_CPI +#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI + +/* this is the SYS_INT CPI. */ +#define VIC_SYS_INT 8 +#define VIC_CMN_INT 15 + +/* This is the boot CPI for alternate processors. 
It gets overwritten + * by the above once the system has activated all available processors */ +#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0 +#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8) + + +#endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h new file mode 100644 index 000000000000..2bdab21f0898 --- /dev/null +++ b/arch/x86/include/asm/irqflags.h @@ -0,0 +1,211 @@ +#ifndef _X86_IRQFLAGS_H_ +#define _X86_IRQFLAGS_H_ + +#include <asm/processor-flags.h> + +#ifndef __ASSEMBLY__ +/* + * Interrupt control: + */ + +static inline unsigned long native_save_fl(void) +{ + unsigned long flags; + + asm volatile("# __raw_save_flags\n\t" + "pushf ; pop %0" + : "=g" (flags) + : /* no input */ + : "memory"); + + return flags; +} + +static inline void native_restore_fl(unsigned long flags) +{ + asm volatile("push %0 ; popf" + : /* no output */ + :"g" (flags) + :"memory", "cc"); +} + +static inline void native_irq_disable(void) +{ + asm volatile("cli": : :"memory"); +} + +static inline void native_irq_enable(void) +{ + asm volatile("sti": : :"memory"); +} + +static inline void native_safe_halt(void) +{ + asm volatile("sti; hlt": : :"memory"); +} + +static inline void native_halt(void) +{ + asm volatile("hlt": : :"memory"); +} + +#endif + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#ifndef __ASSEMBLY__ + +static inline unsigned long __raw_local_save_flags(void) +{ + return native_save_fl(); +} + +static inline void raw_local_irq_restore(unsigned long flags) +{ + native_restore_fl(flags); +} + +static inline void raw_local_irq_disable(void) +{ + native_irq_disable(); +} + +static inline void raw_local_irq_enable(void) +{ + native_irq_enable(); +} + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +static inline void raw_safe_halt(void) +{ + native_safe_halt(); +} + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +static inline void halt(void) +{ + native_halt(); +} + +/* + * For spinlocks, etc: + */ +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long flags = __raw_local_save_flags(); + + raw_local_irq_disable(); + + return flags; +} +#else + +#define ENABLE_INTERRUPTS(x) sti +#define DISABLE_INTERRUPTS(x) cli + +#ifdef CONFIG_X86_64 +#define SWAPGS swapgs +/* + * Currently paravirt can't handle swapgs nicely when we + * don't have a stack we can rely on (such as a user space + * stack). So we either find a way around these or just fault + * and emulate if a guest tries to call swapgs directly. + * + * Either way, this is a good way to document that we don't + * have a reliable stack. x86_64 only. 
+ */ +#define SWAPGS_UNSAFE_STACK swapgs + +#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ + +#define INTERRUPT_RETURN iretq +#define USERGS_SYSRET64 \ + swapgs; \ + sysretq; +#define USERGS_SYSRET32 \ + swapgs; \ + sysretl +#define ENABLE_INTERRUPTS_SYSEXIT32 \ + swapgs; \ + sti; \ + sysexit + +#else +#define INTERRUPT_RETURN iret +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define GET_CR0_INTO_EAX movl %cr0, %eax +#endif + + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & X86_EFLAGS_IF); +} + +static inline int raw_irqs_disabled(void) +{ + unsigned long flags = __raw_local_save_flags(); + + return raw_irqs_disabled_flags(flags); +} + +#else + +#ifdef CONFIG_X86_64 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + sti; \ + SAVE_REST; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_REST; \ + cli; \ + TRACE_IRQS_OFF; + +#else +#define ARCH_LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_LOCKDEP_SYS_EXIT_IRQ +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ +# else +# define LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ +# endif + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h new file mode 100644 index 000000000000..7e5dff1de0e9 --- /dev/null +++ b/arch/x86/include/asm/ist.h @@ -0,0 +1,34 @@ +#ifndef _ASM_X86_IST_H +#define _ASM_X86_IST_H + +/* + * Include file for the interface to IST BIOS + * Copyright 2002 Andy Grover <andrew.grover@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + + +#include <linux/types.h> + +struct ist_info { + __u32 signature; + __u32 command; + __u32 event; + __u32 perf_level; +}; + +#ifdef __KERNEL__ + +extern struct ist_info ist_info; + +#endif /* __KERNEL__ */ +#endif /* _ASM_X86_IST_H */ diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h new file mode 100644 index 000000000000..54c8cc53b24d --- /dev/null +++ b/arch/x86/include/asm/k8.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_K8_H +#define _ASM_X86_K8_H + +#include <linux/pci.h> + +extern struct pci_device_id k8_nb_ids[]; + +extern int early_is_k8_nb(u32 value); +extern struct pci_dev **k8_northbridges; +extern int num_k8_northbridges; +extern int cache_k8_northbridges(void); +extern void k8_flush_garts(void); +extern int k8_scan_nodes(unsigned long start, unsigned long end); + +#endif /* _ASM_X86_K8_H */ diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h new file mode 100644 index 000000000000..fa7c0b974761 --- /dev/null +++ b/arch/x86/include/asm/kdebug.h @@ -0,0 +1,37 @@ +#ifndef _ASM_X86_KDEBUG_H +#define _ASM_X86_KDEBUG_H + +#include <linux/notifier.h> + +struct pt_regs; + +/* Grossly misnamed. */ +enum die_val { + DIE_OOPS = 1, + DIE_INT3, + DIE_DEBUG, + DIE_PANIC, + DIE_NMI, + DIE_DIE, + DIE_NMIWATCHDOG, + DIE_KERNELDEBUG, + DIE_TRAP, + DIE_GPF, + DIE_CALL, + DIE_NMI_IPI, + DIE_PAGE_FAULT, + DIE_NMIUNKNOWN, +}; + +extern void printk_address(unsigned long address, int reliable); +extern void die(const char *, struct pt_regs *,long); +extern int __must_check __die(const char *, struct pt_regs *, long); +extern void show_registers(struct pt_regs *regs); +extern void show_trace(struct task_struct *t, struct pt_regs *regs, + unsigned long *sp, unsigned long bp); +extern void __show_regs(struct pt_regs *regs, int all); +extern void show_regs(struct pt_regs *regs); +extern unsigned long oops_begin(void); +extern void oops_end(unsigned long, struct pt_regs *, int signr); + +#endif /* _ASM_X86_KDEBUG_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h new file mode 100644 index 000000000000..a1f22771a15a --- /dev/null +++ b/arch/x86/include/asm/kexec.h @@ -0,0 +1,175 @@ +#ifndef _ASM_X86_KEXEC_H +#define _ASM_X86_KEXEC_H + +#ifdef CONFIG_X86_32 +# define PA_CONTROL_PAGE 0 +# define VA_CONTROL_PAGE 1 +# define PA_PGD 2 +# define VA_PGD 3 +# define PA_PTE_0 4 +# define VA_PTE_0 5 +# define PA_PTE_1 6 +# define VA_PTE_1 7 +# define PA_SWAP_PAGE 8 +# ifdef CONFIG_X86_PAE +# define PA_PMD_0 9 +# define VA_PMD_0 10 +# define PA_PMD_1 11 +# define VA_PMD_1 12 +# define PAGES_NR 13 +# else +# define PAGES_NR 9 +# endif +#else +# define PA_CONTROL_PAGE 0 +# define VA_CONTROL_PAGE 1 +# define PA_PGD 2 +# define VA_PGD 3 +# define PA_PUD_0 4 +# define VA_PUD_0 5 +# define PA_PMD_0 6 +# define VA_PMD_0 7 +# define PA_PTE_0 8 +# define VA_PTE_0 9 +# define PA_PUD_1 10 +# define VA_PUD_1 11 +# define PA_PMD_1 12 +# define VA_PMD_1 13 +# define PA_PTE_1 14 +# define VA_PTE_1 15 +# define PA_TABLE_PAGE 16 +# define PAGES_NR 17 +#endif + +#ifdef CONFIG_X86_32 +# define KEXEC_CONTROL_CODE_MAX_SIZE 2048 +#endif + +#ifndef __ASSEMBLY__ + +#include <linux/string.h> + +#include <asm/page.h> +#include <asm/ptrace.h> + +/* + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. + * I.e. Maximum page that is mapped directly into kernel memory, + * and kmap is not required. + * + * So far x86_64 is limited to 40 physical address bits. 
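+ *
+ * (2^40 - 1 = 0xFFFFFFFFFF, which is where the 64-bit limit masks below
+ * come from; on 32-bit, (-1UL) means any directly mapped page is usable.)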
+ */ +#ifdef CONFIG_X86_32 +/* Maximum physical address we can use pages from */ +# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) +/* Maximum address we can reach in physical address mode */ +# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) +/* Maximum address we can use for the control code buffer */ +# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE + +# define KEXEC_CONTROL_PAGE_SIZE 4096 + +/* The native architecture */ +# define KEXEC_ARCH KEXEC_ARCH_386 + +/* We can also handle crash dumps from 64 bit kernel. */ +# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) +#else +/* Maximum physical address we can use pages from */ +# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) +/* Maximum address we can reach in physical address mode */ +# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) +/* Maximum address we can use for the control pages */ +# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) + +/* Allocate one page for the pdp and the second for the code */ +# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) + +/* The native architecture */ +# define KEXEC_ARCH KEXEC_ARCH_X86_64 +#endif + +/* + * CPU does not save ss and sp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code + * fixes it. + */ +static inline void crash_fixup_ss_esp(struct pt_regs *newregs, + struct pt_regs *oldregs) +{ +#ifdef CONFIG_X86_32 + newregs->sp = (unsigned long)&(oldregs->sp); + asm volatile("xorl %%eax, %%eax\n\t" + "movw %%ss, %%ax\n\t" + :"=a"(newregs->ss)); +#endif +} + +/* + * This function is responsible for capturing register states if coming + * via panic otherwise just fix up the ss and sp if coming via kernel + * mode exception. + */ +static inline void crash_setup_regs(struct pt_regs *newregs, + struct pt_regs *oldregs) +{ + if (oldregs) { + memcpy(newregs, oldregs, sizeof(*newregs)); + crash_fixup_ss_esp(newregs, oldregs); + } else { +#ifdef CONFIG_X86_32 + asm volatile("movl %%ebx,%0" : "=m"(newregs->bx)); + asm volatile("movl %%ecx,%0" : "=m"(newregs->cx)); + asm volatile("movl %%edx,%0" : "=m"(newregs->dx)); + asm volatile("movl %%esi,%0" : "=m"(newregs->si)); + asm volatile("movl %%edi,%0" : "=m"(newregs->di)); + asm volatile("movl %%ebp,%0" : "=m"(newregs->bp)); + asm volatile("movl %%eax,%0" : "=m"(newregs->ax)); + asm volatile("movl %%esp,%0" : "=m"(newregs->sp)); + asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss)); + asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); + asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds)); + asm volatile("movl %%es, %%eax;" :"=a"(newregs->es)); + asm volatile("pushfl; popl %0" :"=m"(newregs->flags)); +#else + asm volatile("movq %%rbx,%0" : "=m"(newregs->bx)); + asm volatile("movq %%rcx,%0" : "=m"(newregs->cx)); + asm volatile("movq %%rdx,%0" : "=m"(newregs->dx)); + asm volatile("movq %%rsi,%0" : "=m"(newregs->si)); + asm volatile("movq %%rdi,%0" : "=m"(newregs->di)); + asm volatile("movq %%rbp,%0" : "=m"(newregs->bp)); + asm volatile("movq %%rax,%0" : "=m"(newregs->ax)); + asm volatile("movq %%rsp,%0" : "=m"(newregs->sp)); + asm volatile("movq %%r8,%0" : "=m"(newregs->r8)); + asm volatile("movq %%r9,%0" : "=m"(newregs->r9)); + asm volatile("movq %%r10,%0" : "=m"(newregs->r10)); + asm volatile("movq %%r11,%0" : "=m"(newregs->r11)); + asm volatile("movq %%r12,%0" : "=m"(newregs->r12)); + asm volatile("movq %%r13,%0" : "=m"(newregs->r13)); + asm volatile("movq %%r14,%0" : "=m"(newregs->r14)); + asm volatile("movq %%r15,%0" : "=m"(newregs->r15)); + asm volatile("movl %%ss, %%eax;" 
:"=a"(newregs->ss)); + asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); + asm volatile("pushfq; popq %0" :"=m"(newregs->flags)); +#endif + newregs->ip = (unsigned long)current_text_addr(); + } +} + +#ifdef CONFIG_X86_32 +asmlinkage unsigned long +relocate_kernel(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); +#else +NORET_TYPE void +relocate_kernel(unsigned long indirection_page, + unsigned long page_list, + unsigned long start_address) ATTRIB_NORET; +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h new file mode 100644 index 000000000000..e6c6c808489f --- /dev/null +++ b/arch/x86/include/asm/kgdb.h @@ -0,0 +1,79 @@ +#ifndef _ASM_X86_KGDB_H +#define _ASM_X86_KGDB_H + +/* + * Copyright (C) 2001-2004 Amit S. Kale + * Copyright (C) 2008 Wind River Systems, Inc. + */ + +/* + * BUFMAX defines the maximum number of characters in inbound/outbound + * buffers at least NUMREGBYTES*2 are needed for register packets + * Longer buffer is needed to list all threads + */ +#define BUFMAX 1024 + +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +#ifdef CONFIG_X86_32 +enum regnames { + GDB_AX, /* 0 */ + GDB_CX, /* 1 */ + GDB_DX, /* 2 */ + GDB_BX, /* 3 */ + GDB_SP, /* 4 */ + GDB_BP, /* 5 */ + GDB_SI, /* 6 */ + GDB_DI, /* 7 */ + GDB_PC, /* 8 also known as eip */ + GDB_PS, /* 9 also known as eflags */ + GDB_CS, /* 10 */ + GDB_SS, /* 11 */ + GDB_DS, /* 12 */ + GDB_ES, /* 13 */ + GDB_FS, /* 14 */ + GDB_GS, /* 15 */ +}; +#define NUMREGBYTES ((GDB_GS+1)*4) +#else /* ! 
CONFIG_X86_32 */ +enum regnames64 { + GDB_AX, /* 0 */ + GDB_BX, /* 1 */ + GDB_CX, /* 2 */ + GDB_DX, /* 3 */ + GDB_SI, /* 4 */ + GDB_DI, /* 5 */ + GDB_BP, /* 6 */ + GDB_SP, /* 7 */ + GDB_R8, /* 8 */ + GDB_R9, /* 9 */ + GDB_R10, /* 10 */ + GDB_R11, /* 11 */ + GDB_R12, /* 12 */ + GDB_R13, /* 13 */ + GDB_R14, /* 14 */ + GDB_R15, /* 15 */ + GDB_PC, /* 16 */ +}; + +enum regnames32 { + GDB_PS = 34, + GDB_CS, + GDB_SS, +}; +#define NUMREGBYTES ((GDB_SS+1)*4) +#endif /* CONFIG_X86_32 */ + +static inline void arch_kgdb_breakpoint(void) +{ + asm(" int $3"); +} +#define BREAK_INSTR_SIZE 1 +#define CACHE_FLUSH_IS_SAFE 1 + +#endif /* _ASM_X86_KGDB_H */ diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h new file mode 100644 index 000000000000..5759c165a5cf --- /dev/null +++ b/arch/x86/include/asm/kmap_types.h @@ -0,0 +1,29 @@ +#ifndef _ASM_X86_KMAP_TYPES_H +#define _ASM_X86_KMAP_TYPES_H + +#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) +# define D(n) __KM_FENCE_##n , +#else +# define D(n) +#endif + +enum km_type { +D(0) KM_BOUNCE_READ, +D(1) KM_SKB_SUNRPC_DATA, +D(2) KM_SKB_DATA_SOFTIRQ, +D(3) KM_USER0, +D(4) KM_USER1, +D(5) KM_BIO_SRC_IRQ, +D(6) KM_BIO_DST_IRQ, +D(7) KM_PTE0, +D(8) KM_PTE1, +D(9) KM_IRQ0, +D(10) KM_IRQ1, +D(11) KM_SOFTIRQ0, +D(12) KM_SOFTIRQ1, +D(13) KM_TYPE_NR +}; + +#undef D + +#endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h new file mode 100644 index 000000000000..4fe681de1e76 --- /dev/null +++ b/arch/x86/include/asm/kprobes.h @@ -0,0 +1,88 @@ +#ifndef _ASM_X86_KPROBES_H +#define _ASM_X86_KPROBES_H +/* + * Kernel Probes (KProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * See arch/x86/kernel/kprobes.c for x86 kprobes history. + */ +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/percpu.h> + +#define __ARCH_WANT_KPROBES_INSN_SLOT + +struct pt_regs; +struct kprobe; + +typedef u8 kprobe_opcode_t; +#define BREAKPOINT_INSTRUCTION 0xcc +#define RELATIVEJUMP_INSTRUCTION 0xe9 +#define MAX_INSN_SIZE 16 +#define MAX_STACK_SIZE 64 +#define MIN_STACK_SIZE(ADDR) \ + (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \ + THREAD_SIZE - (unsigned long)(ADDR))) \ + ? (MAX_STACK_SIZE) \ + : (((unsigned long)current_thread_info()) + \ + THREAD_SIZE - (unsigned long)(ADDR))) + +#define flush_insn_slot(p) do { } while (0) + +extern const int kretprobe_blacklist_size; + +void arch_remove_kprobe(struct kprobe *p); +void kretprobe_trampoline(void); + +/* Architecture specific copy of original instruction*/ +struct arch_specific_insn { + /* copy of the original instruction */ + kprobe_opcode_t *insn; + /* + * boostable = -1: This instruction type is not boostable. + * boostable = 0: This instruction type is boostable. 
+ * boostable = 1: This instruction has been boosted: we have + * added a relative jump after the instruction copy in insn, + * so no single-step and fixup are needed (unless there's + * a post_handler or break_handler). + */ + int boostable; +}; + +struct prev_kprobe { + struct kprobe *kp; + unsigned long status; + unsigned long old_flags; + unsigned long saved_flags; +}; + +/* per-cpu kprobe control block */ +struct kprobe_ctlblk { + unsigned long kprobe_status; + unsigned long kprobe_old_flags; + unsigned long kprobe_saved_flags; + unsigned long *jprobe_saved_sp; + struct pt_regs jprobe_saved_regs; + kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; + struct prev_kprobe prev_kprobe; +}; + +extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); +extern int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data); +#endif /* _ASM_X86_KPROBES_H */ diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h new file mode 100644 index 000000000000..b95162af0bf6 --- /dev/null +++ b/arch/x86/include/asm/kvm.h @@ -0,0 +1,211 @@ +#ifndef _ASM_X86_KVM_H +#define _ASM_X86_KVM_H + +/* + * KVM x86 specific structures and definitions + * + */ + +#include <asm/types.h> +#include <linux/ioctl.h> + +/* Architectural interrupt line count. */ +#define KVM_NR_INTERRUPTS 256 + +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; + +/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define KVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 
interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 +#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 +#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 + +/* for KVM_SET_CPUID2 */ +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +/* for KVM_GET_PIT and KVM_SET_PIT */ +struct kvm_pit_channel_state { + __u32 count; /* can be 65536 */ + __u16 latched_count; + __u8 count_latched; + __u8 status_latched; + __u8 status; + __u8 read_state; + __u8 write_state; + __u8 write_latch; + __u8 rw_mode; + __u8 mode; + __u8 bcd; + __u8 gate; + __s64 count_load_time; +}; + +struct kvm_pit_state { + struct kvm_pit_channel_state channels[3]; +}; +#endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h new file mode 100644 index 000000000000..65679d006337 --- /dev/null +++ b/arch/x86/include/asm/kvm_host.h @@ -0,0 +1,752 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This header defines architecture specific interfaces, x86 version + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#ifndef _ASM_X86_KVM_HOST_H +#define _ASM_X86_KVM_HOST_H + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/mmu_notifier.h> + +#include <linux/kvm.h> +#include <linux/kvm_para.h> +#include <linux/kvm_types.h> + +#include <asm/pvclock-abi.h> +#include <asm/desc.h> + +#define KVM_MAX_VCPUS 16 +#define KVM_MEMORY_SLOTS 32 +/* memory slots that does not exposed to userspace */ +#define KVM_PRIVATE_MEM_SLOTS 4 + +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + +#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) +#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ + 0xFFFFFF0000000000ULL) + +#define KVM_GUEST_CR0_MASK \ + (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ + | X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_ON \ + (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ + | X86_CR0_MP) +#define KVM_GUEST_CR4_MASK \ + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define INVALID_PAGE (~(hpa_t)0) +#define UNMAPPED_GVA (~(gpa_t)0) + +/* shadow tables are PAE even on non-PAE hosts */ +#define KVM_HPAGE_SHIFT 21 +#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) +#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) + +#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) + +#define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 +#define UD_VECTOR 6 +#define NM_VECTOR 7 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 +#define MF_VECTOR 16 +#define MC_VECTOR 18 + +#define SELECTOR_TI_MASK (1 << 2) +#define SELECTOR_RPL_MASK 0x03 + +#define IOPL_SHIFT 12 + +#define KVM_ALIAS_SLOTS 4 + +#define KVM_PERMILLE_MMU_PAGES 20 +#define KVM_MIN_ALLOC_MMU_PAGES 64 +#define KVM_MMU_HASH_SHIFT 10 +#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) +#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_NR_VAR_MTRR 8 + +extern spinlock_t kvm_lock; +extern struct list_head vm_list; + +struct kvm_vcpu; +struct kvm; + +enum kvm_reg { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, + VCPU_REGS_RDX = 2, + VCPU_REGS_RBX = 3, + VCPU_REGS_RSP = 4, + VCPU_REGS_RBP = 5, + VCPU_REGS_RSI = 6, + VCPU_REGS_RDI = 7, +#ifdef CONFIG_X86_64 + VCPU_REGS_R8 = 8, + VCPU_REGS_R9 = 9, + VCPU_REGS_R10 = 10, + VCPU_REGS_R11 = 11, + VCPU_REGS_R12 = 12, + VCPU_REGS_R13 = 13, + VCPU_REGS_R14 = 14, + VCPU_REGS_R15 = 15, +#endif + VCPU_REGS_RIP, + NR_VCPU_REGS +}; + +enum { + VCPU_SREG_ES, + VCPU_SREG_CS, + VCPU_SREG_SS, + VCPU_SREG_DS, + VCPU_SREG_FS, + VCPU_SREG_GS, + VCPU_SREG_TR, + VCPU_SREG_LDTR, +}; + +#include <asm/kvm_x86_emulate.h> + +#define KVM_NR_MEM_OBJS 40 + +struct kvm_guest_debug { + int enabled; + unsigned long bp[4]; + int singlestep; +}; + +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. 
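+ *
+ * (The usual pattern: each cache is topped up with ordinary GFP_KERNEL
+ * allocations before fault handling starts, and objects are then popped
+ * off without any possibility of failure while the mmu lock is held.)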
+ */ +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; + +#define NR_PTE_CHAIN_ENTRIES 5 + +struct kvm_pte_chain { + u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; + struct hlist_node link; +}; + +/* + * kvm_mmu_page_role, below, is defined as: + * + * bits 0:3 - total guest paging levels (2-4, or zero for real mode) + * bits 4:7 - page table level for this shadow (1-4) + * bits 8:9 - page table quadrant for 2-level guests + * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) + * bits 17:19 - common access permissions for all ptes in this shadow page + */ +union kvm_mmu_page_role { + unsigned word; + struct { + unsigned glevels:4; + unsigned level:4; + unsigned quadrant:2; + unsigned pad_for_nice_hex_output:6; + unsigned metaphysical:1; + unsigned access:3; + unsigned invalid:1; + }; +}; + +struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; + + /* + * The following two entries are used to key the shadow page in the + * hash table. + */ + gfn_t gfn; + union kvm_mmu_page_role role; + + u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + unsigned long slot_bitmap; /* One bit set per slot which has memory + * in this shadow page. + */ + int multimapped; /* More than one parent_pte? */ + int root_count; /* Currently serving as active root */ + bool unsync; + bool unsync_children; + union { + u64 *parent_pte; /* !multimapped */ + struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ + }; + DECLARE_BITMAP(unsync_child_bitmap, 512); +}; + +struct kvm_pv_mmu_op_buffer { + void *ptr; + unsigned len; + unsigned processed; + char buf[512] __aligned(sizeof(long)); +}; + +/* + * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level + * 32-bit). The kvm_mmu structure abstracts the details of the current mmu + * mode. + */ +struct kvm_mmu { + void (*new_cr3)(struct kvm_vcpu *vcpu); + int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); + void (*free)(struct kvm_vcpu *vcpu); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + void (*prefetch_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page); + int (*sync_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + hpa_t root_hpa; + int root_level; + int shadow_root_level; + + u64 *pae_root; +}; + +struct kvm_vcpu_arch { + u64 host_tsc; + int interrupt_window_open; + unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ + DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); + /* + * rip and regs accesses must go through + * kvm_{register,rip}_{read,write} functions. 
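+	 *
+	 * A minimal sketch of the intended pattern (the accessors are
+	 * expected to live in arch/x86/kvm/kvm_cache_regs.h; insn_len is
+	 * a hypothetical local):
+	 *
+	 *	unsigned long rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	 *	kvm_register_write(vcpu, VCPU_REGS_RAX, rax + 1);
+	 *	kvm_rip_write(vcpu, kvm_rip_read(vcpu) + insn_len);
+	 *
+	 * The accessors keep regs_avail and regs_dirty below coherent.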
+ */ + unsigned long regs[NR_VCPU_REGS]; + u32 regs_avail; + u32 regs_dirty; + + unsigned long cr0; + unsigned long cr2; + unsigned long cr3; + unsigned long cr4; + unsigned long cr8; + u64 pdptrs[4]; /* pae */ + u64 shadow_efer; + u64 apic_base; + struct kvm_lapic *apic; /* kernel irqchip context */ + int mp_state; + int sipi_vector; + u64 ia32_misc_enable_msr; + bool tpr_access_reporting; + + struct kvm_mmu mmu; + /* only needed in kvm_pv_mmu_op() path, but it's hot so + * put it here to avoid allocation */ + struct kvm_pv_mmu_op_buffer mmu_op_buffer; + + struct kvm_mmu_memory_cache mmu_pte_chain_cache; + struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_page_header_cache; + + gfn_t last_pt_write_gfn; + int last_pt_write_count; + u64 *last_pte_updated; + gfn_t last_pte_gfn; + + struct { + gfn_t gfn; /* presumed gfn during guest pte update */ + pfn_t pfn; /* pfn corresponding to that gfn */ + int largepage; + unsigned long mmu_seq; + } update_pte; + + struct i387_fxsave_struct host_fx_image; + struct i387_fxsave_struct guest_fx_image; + + gva_t mmio_fault_cr2; + struct kvm_pio_request pio; + void *pio_data; + + struct kvm_queued_exception { + bool pending; + bool has_error_code; + u8 nr; + u32 error_code; + } exception; + + struct kvm_queued_interrupt { + bool pending; + u8 nr; + } interrupt; + + struct { + int active; + u8 save_iopl; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; + } rmode; + int halt_request; /* real mode on Intel only */ + + int cpuid_nent; + struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + /* emulate context */ + + struct x86_emulate_ctxt emulate_ctxt; + + gpa_t time; + struct pvclock_vcpu_time_info hv_clock; + unsigned int hv_clock_tsc_khz; + unsigned int time_offset; + struct page *time_page; + + bool nmi_pending; + bool nmi_injected; + + u64 mtrr[0x100]; +}; + +struct kvm_mem_alias { + gfn_t base_gfn; + unsigned long npages; + gfn_t target_gfn; +}; + +struct kvm_arch{ + int naliases; + struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + + unsigned int n_free_mmu_pages; + unsigned int n_requested_mmu_pages; + unsigned int n_alloc_mmu_pages; + struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + /* + * Hash table of struct kvm_mmu_page. 
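+	 * (This describes mmu_page_hash above; active_mmu_pages below is
+	 * the list of all live shadow pages, walked when pages have to be
+	 * reclaimed or zapped.)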
+ */ + struct list_head active_mmu_pages; + struct list_head assigned_dev_head; + struct dmar_domain *intel_iommu_domain; + struct kvm_pic *vpic; + struct kvm_ioapic *vioapic; + struct kvm_pit *vpit; + struct hlist_head irq_ack_notifier_list; + + int round_robin_prev_vcpu; + unsigned int tss_addr; + struct page *apic_access_page; + + gpa_t wall_clock; + + struct page *ept_identity_pagetable; + bool ept_identity_pagetable_done; +}; + +struct kvm_vm_stat { + u32 mmu_shadow_zapped; + u32 mmu_pte_write; + u32 mmu_pte_updated; + u32 mmu_pde_zapped; + u32 mmu_flooded; + u32 mmu_recycled; + u32 mmu_cache_miss; + u32 mmu_unsync; + u32 remote_tlb_flush; + u32 lpages; +}; + +struct kvm_vcpu_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; + + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_window_exits; + u32 nmi_window_exits; + u32 halt_exits; + u32 halt_wakeup; + u32 request_irq_exits; + u32 irq_exits; + u32 host_state_reload; + u32 efer_reload; + u32 fpu_reload; + u32 insn_emulation; + u32 insn_emulation_fail; + u32 hypercalls; + u32 irq_injections; +}; + +struct descriptor_table { + u16 limit; + unsigned long base; +} __attribute__((packed)); + +struct kvm_x86_ops { + int (*cpu_has_kvm_support)(void); /* __init */ + int (*disabled_by_bios)(void); /* __init */ + void (*hardware_enable)(void *dummy); /* __init */ + void (*hardware_disable)(void *dummy); + void (*check_processor_compatibility)(void *rtn); + int (*hardware_setup)(void); /* __init */ + void (*hardware_unsetup)(void); /* __exit */ + bool (*cpu_has_accelerated_tpr)(void); + + /* Create, but do not attach this VCPU */ + struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); + void (*vcpu_free)(struct kvm_vcpu *vcpu); + int (*vcpu_reset)(struct kvm_vcpu *vcpu); + + void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + + int (*set_guest_debug)(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg); + void (*guest_debug_pre)(struct kvm_vcpu *vcpu); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); + void (*get_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + int (*get_cpl)(struct kvm_vcpu *vcpu); + void (*set_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); + void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); + void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, + int *exception); + void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); + unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); + void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + + void (*tlb_flush)(struct kvm_vcpu *vcpu); + + void (*run)(struct kvm_vcpu *vcpu, struct 
kvm_run *run); + int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); + void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*patch_hypercall)(struct kvm_vcpu *vcpu, + unsigned char *hypercall_addr); + int (*get_irq)(struct kvm_vcpu *vcpu); + void (*set_irq)(struct kvm_vcpu *vcpu, int vec); + void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code); + bool (*exception_injected)(struct kvm_vcpu *vcpu); + void (*inject_pending_irq)(struct kvm_vcpu *vcpu); + void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, + struct kvm_run *run); + + int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); + int (*get_tdp_level)(void); +}; + +extern struct kvm_x86_ops *kvm_x86_ops; + +int kvm_mmu_module_init(void); +void kvm_mmu_module_exit(void); + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu); +int kvm_mmu_create(struct kvm_vcpu *vcpu); +int kvm_mmu_setup(struct kvm_vcpu *vcpu); +void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_mmu_set_base_ptes(u64 base_pte); +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask); + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_zap_all(struct kvm *kvm); +unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); + +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); + +int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes); +int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, + gpa_t addr, unsigned long *ret); + +extern bool tdp_enabled; + +enum emulation_result { + EMULATE_DONE, /* no further processing */ + EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_FAIL, /* can't emulate this instruction */ +}; + +#define EMULTYPE_NO_DECODE (1 << 0) +#define EMULTYPE_TRAP_UD (1 << 1) +int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long cr2, u16 error_code, int emulation_type); +void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags); + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, + unsigned long *rflags); +void kvm_enable_efer_bits(u64); +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + +struct x86_emulate_ctxt; + +int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned port); +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int down, + gva_t address, int rep, unsigned port); +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_emulate_halt(struct kvm_vcpu *vcpu); +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); +int emulate_clts(struct kvm_vcpu *vcpu); +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long *dest); +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long value); + +void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +int 
kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + int type_bits, int seg); + +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); + +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); +unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, + u32 error_code); + +void kvm_pic_set_irq(void *opaque, int irq, int level); + +void kvm_inject_nmi(struct kvm_vcpu *vcpu); + +void fx_init(struct kvm_vcpu *vcpu); + +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); + +unsigned long segment_base(u16 selector); + +void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes); +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); + +int kvm_fix_hypercall(struct kvm_vcpu *vcpu); + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); + +void kvm_enable_tdp(void); +void kvm_disable_tdp(void); + +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); +int complete_pio(struct kvm_vcpu *vcpu); + +static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline u16 kvm_read_fs(void) +{ + u16 seg; + asm("mov %%fs, %0" : "=g"(seg)); + return seg; +} + +static inline u16 kvm_read_gs(void) +{ + u16 seg; + asm("mov %%gs, %0" : "=g"(seg)); + return seg; +} + +static inline u16 kvm_read_ldt(void) +{ + u16 ldt; + asm("sldt %0" : "=g"(ldt)); + return ldt; +} + +static inline void kvm_load_fs(u16 sel) +{ + asm("mov %0, %%fs" : : "rm"(sel)); +} + +static inline void kvm_load_gs(u16 sel) +{ + asm("mov %0, %%gs" : : "rm"(sel)); +} + +static inline void kvm_load_ldt(u16 sel) +{ + asm("lldt %0" : : "rm"(sel)); +} + +static inline void kvm_get_idt(struct descriptor_table *table) +{ + asm("sidt %0" : "=m"(*table)); +} + +static inline void kvm_get_gdt(struct descriptor_table *table) +{ + asm("sgdt %0" : "=m"(*table)); +} + +static inline unsigned long kvm_read_tr_base(void) +{ + u16 tr; + asm("str %0" : "=g"(tr)); + return segment_base(tr); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long read_msr(unsigned long msr) +{ + u64 value; + + rdmsrl(msr, value); + return value; +} +#endif + +static inline void kvm_fx_save(struct i387_fxsave_struct *image) +{ + asm("fxsave (%0)":: "r" (image)); +} + +static inline void 
kvm_fx_restore(struct i387_fxsave_struct *image) +{ + asm("fxrstor (%0)":: "r" (image)); +} + +static inline void kvm_fx_finit(void) +{ + asm("finit"); +} + +static inline u32 get_rdx_init_val(void) +{ + return 0x600; /* P6 family */ +} + +static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) +{ + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); +} + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" +#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" + +#define MSR_IA32_TIME_STAMP_COUNTER 0x010 + +#define TSS_IOPB_BASE_OFFSET 0x66 +#define TSS_BASE_SIZE 0x68 +#define TSS_IOPB_SIZE (65536 / 8) +#define TSS_REDIRECTION_SIZE (256 / 8) +#define RMODE_TSS_SIZE \ + (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) + +enum { + TASK_SWITCH_CALL = 0, + TASK_SWITCH_IRET = 1, + TASK_SWITCH_JMP = 2, + TASK_SWITCH_GATE = 3, +}; + +/* + * Hardware virtualization extension instructions may fault if a + * reboot turns off virtualization while processes are running. + * Trap the fault and ignore the instruction if that happens. + */ +asmlinkage void kvm_handle_fault_on_reboot(void); + +#define __kvm_handle_fault_on_reboot(insn) \ + "666: " insn "\n\t" \ + ".pushsection .fixup, \"ax\" \n" \ + "667: \n\t" \ + __ASM_SIZE(push) " $666b \n\t" \ + "jmp kvm_handle_fault_on_reboot \n\t" \ + ".popsection \n\t" \ + ".pushsection __ex_table, \"a\" \n\t" \ + _ASM_PTR " 666b, 667b \n\t" \ + ".popsection" + +#define KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_age_hva(struct kvm *kvm, unsigned long hva); + +#endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h new file mode 100644 index 000000000000..b8a3305ae093 --- /dev/null +++ b/arch/x86/include/asm/kvm_para.h @@ -0,0 +1,147 @@ +#ifndef _ASM_X86_KVM_PARA_H +#define _ASM_X86_KVM_PARA_H + +/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It + * should be used to determine that a VM is running under KVM. + */ +#define KVM_CPUID_SIGNATURE 0x40000000 + +/* This CPUID returns a feature bitmap in eax. Before enabling a particular + * paravirtualization, the appropriate feature bit should be checked. 
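+ *
+ * For example, a guest would gate kvmclock roughly like this
+ * (kvm_para_has_feature() is assumed to be the generic wrapper from
+ * include/linux/kvm_para.h, built on kvm_arch_para_features() below):
+ *
+ *	if (kvm_para_available() &&
+ *	    kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
+ *		kvmclock_init();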
+ */
+#define KVM_CPUID_FEATURES	0x40000001
+#define KVM_FEATURE_CLOCKSOURCE	0
+#define KVM_FEATURE_NOP_IO_DELAY	1
+#define KVM_FEATURE_MMU_OP	2
+
+#define MSR_KVM_WALL_CLOCK	0x11
+#define MSR_KVM_SYSTEM_TIME	0x12
+
+#define KVM_MAX_MMU_OP_BATCH	32
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE	1
+#define KVM_MMU_OP_FLUSH_TLB	2
+#define KVM_MMU_OP_RELEASE_PT	3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+	__u32 op;
+	__u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+	struct kvm_mmu_op_header header;
+	__u64 pte_phys;
+	__u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+	struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+	struct kvm_mmu_op_header header;
+	__u64 pt_phys;
+};
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+
+extern void kvmclock_init(void);
+
+
+/* This instruction is vmcall. On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+ */
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the
+ * vmmcall instruction. The hypervisor may replace it with something else but
+ * only the instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax. No other registers will be clobbered unless explicitly
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+	long ret;
+	asm volatile(KVM_HYPERCALL
+		     : "=a"(ret)
+		     : "a"(nr)
+		     : "memory");
+	return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+	long ret;
+	asm volatile(KVM_HYPERCALL
+		     : "=a"(ret)
+		     : "a"(nr), "b"(p1)
+		     : "memory");
+	return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+				  unsigned long p2)
+{
+	long ret;
+	asm volatile(KVM_HYPERCALL
+		     : "=a"(ret)
+		     : "a"(nr), "b"(p1), "c"(p2)
+		     : "memory");
+	return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3)
+{
+	long ret;
+	asm volatile(KVM_HYPERCALL
+		     : "=a"(ret)
+		     : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+		     : "memory");
+	return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3,
+				  unsigned long p4)
+{
+	long ret;
+	asm volatile(KVM_HYPERCALL
+		     : "=a"(ret)
+		     : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
+		     : "memory");
+	return ret;
+}
+
+static inline int kvm_para_available(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	char signature[13];
+
+	cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+	memcpy(signature + 0, &ebx, 4);
+	memcpy(signature + 4, &ecx, 4);
+	memcpy(signature + 8, &edx, 4);
+	signature[12] = 0;
+
+	if (strcmp(signature, "KVMKVMKVM") == 0)
+		return 1;
+
+	return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+	return cpuid_eax(KVM_CPUID_FEATURES);
+}
+
+#endif
+
+#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
new file mode 100644
index 000000000000..25179a29f208
--- /dev/null
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+struct x86_emulate_ctxt;
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ *  1. The emulator isn't very smart about emulated vs. standard memory.
+ *     'Emulated memory' access addresses should be checked for sanity.
+ *     'Normal memory' accesses may fault, and the caller must arrange to
+ *     detect and handle reentrancy into the emulator via recursive faults.
+ *     Accesses may be unaligned and may cross page boundaries.
+ *  2. If the access fails (cannot emulate, or a standard access faults) then
+ *     it is up to the memop to propagate the fault to the guest VM via
+ *     some out-of-band mechanism, unknown to the emulator. The memop signals
+ *     failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ *     then immediately bail.
+ *  3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ *     cmpxchg8b_emulated need support 8-byte accesses.
+ *  4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE	0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE	1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT	2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR	2 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED	2 /* cmpxchg did not see expected value */
+struct x86_emulate_ops {
+	/*
+	 * read_std: Read bytes of standard (non-emulated/special) memory.
+	 *           Used for instruction fetch, stack operations, and others.
+	 *  @addr:  [IN ] Linear address from which to read.
+	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to read from memory.
+	 */
+	int (*read_std)(unsigned long addr, void *val,
+			unsigned int bytes, struct kvm_vcpu *vcpu);
+
+	/*
+	 * read_emulated: Read bytes from emulated/special memory area.
+	 *  @addr:  [IN ] Linear address from which to read.
+	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to read from memory.
+	 */
+	int (*read_emulated)(unsigned long addr,
+			     void *val,
+			     unsigned int bytes,
+			     struct kvm_vcpu *vcpu);
+
+	/*
+	 * write_emulated: Write bytes to emulated/special memory area.
+	 *  @addr:  [IN ] Linear address to which to write.
+	 *  @val:   [IN ] Value to write to memory (low-order bytes used as
+	 *                required).
+	 *  @bytes: [IN ] Number of bytes to write to memory.
+ */ + int (*write_emulated)(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); + + /* + * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * @bytes: [IN ] Number of bytes to access using CMPXCHG. + */ + int (*cmpxchg_emulated)(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu); + +}; + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +struct fetch_cache { + u8 data[15]; + unsigned long start; + unsigned long end; +}; + +struct decode_cache { + u8 twobyte; + u8 b; + u8 lock_prefix; + u8 rep_prefix; + u8 op_bytes; + u8 ad_bytes; + u8 rex_prefix; + struct operand src; + struct operand dst; + bool has_seg_override; + u8 seg_override; + unsigned int d; + unsigned long regs[NR_VCPU_REGS]; + unsigned long eip; + /* modrm */ + u8 modrm; + u8 modrm_mod; + u8 modrm_reg; + u8 modrm_rm; + u8 use_modrm_ea; + bool rip_relative; + unsigned long modrm_ea; + void *modrm_ptr; + unsigned long modrm_val; + struct fetch_cache fetch; +}; + +struct x86_emulate_ctxt { + /* Register state before/after emulation. */ + struct kvm_vcpu *vcpu; + + /* Linear faulting address (if emulating a page-faulting instruction) */ + unsigned long eflags; + + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + + u32 cs_base; + + /* decode cache */ + + struct decode_cache decode; +}; + +/* Repeat String Operation Prefix */ +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 + +/* Execution mode, passed to the emulator. */ +#define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ +#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ +#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ + +/* Host execution mode. */ +#if defined(__i386__) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 +#elif defined(CONFIG_X86_64) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 +#endif + +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); + +#endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h new file mode 100644 index 000000000000..46727eb37bfe --- /dev/null +++ b/arch/x86/include/asm/ldt.h @@ -0,0 +1,40 @@ +/* + * ldt.h + * + * Definitions of structures used with the modify_ldt system call. + */ +#ifndef _ASM_X86_LDT_H +#define _ASM_X86_LDT_H + +/* Maximum number of LDT entries supported. */ +#define LDT_ENTRIES 8192 +/* The size of each LDT entry. */ +#define LDT_ENTRY_SIZE 8 + +#ifndef __ASSEMBLY__ +/* + * Note on 64bit base and limit is ignored and you cannot set DS/ES/CS + * not to the default values if you still want to do syscalls. This + * call is more for 32bit mode therefore. 
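+ *
+ * (That is: on a 64-bit kernel the base and limit set here are ignored,
+ * and DS/ES/CS must keep their default values for syscalls to keep
+ * working.)
+ *
+ * A hypothetical 32-bit flat data entry, for illustration only:
+ *
+ *	struct user_desc d = {
+ *		.entry_number	= -1,	(set_thread_area picks a free slot)
+ *		.limit		= 0xfffff,
+ *		.seg_32bit	= 1,
+ *		.limit_in_pages	= 1,
+ *		.useable	= 1,
+ *	};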
+ */ +struct user_desc { + unsigned int entry_number; + unsigned int base_addr; + unsigned int limit; + unsigned int seg_32bit:1; + unsigned int contents:2; + unsigned int read_exec_only:1; + unsigned int limit_in_pages:1; + unsigned int seg_not_present:1; + unsigned int useable:1; +#ifdef __x86_64__ + unsigned int lm:1; +#endif +}; + +#define MODIFY_LDT_CONTENTS_DATA 0 +#define MODIFY_LDT_CONTENTS_STACK 1 +#define MODIFY_LDT_CONTENTS_CODE 2 + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_LDT_H */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h new file mode 100644 index 000000000000..d28a507cef39 --- /dev/null +++ b/arch/x86/include/asm/lguest.h @@ -0,0 +1,94 @@ +#ifndef _ASM_X86_LGUEST_H +#define _ASM_X86_LGUEST_H + +#define GDT_ENTRY_LGUEST_CS 10 +#define GDT_ENTRY_LGUEST_DS 11 +#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) +#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) + +#ifndef __ASSEMBLY__ +#include <asm/desc.h> + +#define GUEST_PL 1 + +/* Every guest maps the core switcher code. */ +#define SHARED_SWITCHER_PAGES \ + DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) +/* Pages for switcher itself, then two pages per cpu */ +#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) + +/* We map at -4M for ease of mapping into the guest (one PTE page). */ +#define SWITCHER_ADDR 0xFFC00000 + +/* Found in switcher.S */ +extern unsigned long default_idt_entries[]; + +/* Declarations for definitions in lguest_guest.S */ +extern char lguest_noirq_start[], lguest_noirq_end[]; +extern const char lgstart_cli[], lgend_cli[]; +extern const char lgstart_sti[], lgend_sti[]; +extern const char lgstart_popf[], lgend_popf[]; +extern const char lgstart_pushf[], lgend_pushf[]; +extern const char lgstart_iret[], lgend_iret[]; + +extern void lguest_iret(void); +extern void lguest_init(void); + +struct lguest_regs { + /* Manually saved part. */ + unsigned long eax, ebx, ecx, edx; + unsigned long esi, edi, ebp; + unsigned long gs; + unsigned long fs, ds, es; + unsigned long trapnum, errcode; + /* Trap pushed part */ + unsigned long eip; + unsigned long cs; + unsigned long eflags; + unsigned long esp; + unsigned long ss; +}; + +/* This is a guest-specific page (mapped ro) into the guest. */ +struct lguest_ro_state { + /* Host information we need to restore when we switch back. */ + u32 host_cr3; + struct desc_ptr host_idt_desc; + struct desc_ptr host_gdt_desc; + u32 host_sp; + + /* Fields which are used when guest is running. */ + struct desc_ptr guest_idt_desc; + struct desc_ptr guest_gdt_desc; + struct x86_hw_tss guest_tss; + struct desc_struct guest_idt[IDT_ENTRIES]; + struct desc_struct guest_gdt[GDT_ENTRIES]; +}; + +struct lg_cpu_arch { + /* The GDT entries copied into lguest_ro_state when running. */ + struct desc_struct gdt[GDT_ENTRIES]; + + /* The IDT entries: some copied into lguest_ro_state when running. */ + struct desc_struct idt[IDT_ENTRIES]; + + /* The address of the last guest-visible pagefault (ie. cr2). */ + unsigned long last_pagefault; +}; + +static inline void lguest_set_ts(void) +{ + u32 cr0; + + cr0 = read_cr0(); + if (!(cr0 & 8)) + write_cr0(cr0 | 8); +} + +/* Full 4G segment descriptors, suitable for CS and DS. 
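+ * (0x0000ffff/0x00cf9b00 decodes as base 0, limit 0xfffff in 4K pages,
+ * i.e. 4GB, present, DPL 0, 32-bit, execute/read code; 0x00cf9300 is
+ * the matching read/write data descriptor.)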
*/ +#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } }) +#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } }) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h new file mode 100644 index 000000000000..43894428c3c2 --- /dev/null +++ b/arch/x86/include/asm/lguest_hcall.h @@ -0,0 +1,71 @@ +/* Architecture specific portion of the lguest hypercalls */ +#ifndef _ASM_X86_LGUEST_HCALL_H +#define _ASM_X86_LGUEST_HCALL_H + +#define LHCALL_FLUSH_ASYNC 0 +#define LHCALL_LGUEST_INIT 1 +#define LHCALL_SHUTDOWN 2 +#define LHCALL_LOAD_GDT 3 +#define LHCALL_NEW_PGTABLE 4 +#define LHCALL_FLUSH_TLB 5 +#define LHCALL_LOAD_IDT_ENTRY 6 +#define LHCALL_SET_STACK 7 +#define LHCALL_TS 8 +#define LHCALL_SET_CLOCKEVENT 9 +#define LHCALL_HALT 10 +#define LHCALL_SET_PTE 14 +#define LHCALL_SET_PMD 15 +#define LHCALL_LOAD_TLS 16 +#define LHCALL_NOTIFY 17 + +#define LGUEST_TRAP_ENTRY 0x1F + +/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ +#define LGUEST_SHUTDOWN_POWEROFF 1 +#define LGUEST_SHUTDOWN_RESTART 2 + +#ifndef __ASSEMBLY__ +#include <asm/hw_irq.h> + +/*G:031 But first, how does our Guest contact the Host to ask for privileged + * operations? There are two ways: the direct way is to make a "hypercall", + * to make requests of the Host Itself. + * + * Our hypercall mechanism uses the highest unused trap code (traps 32 and + * above are used by real hardware interrupts). Fifteen hypercalls are + * available: the hypercall number is put in the %eax register, and the + * arguments (when required) are placed in %edx, %ebx and %ecx. If a return + * value makes sense, it's returned in %eax. + * + * Grossly invalid calls result in Sudden Death at the hands of the vengeful + * Host, rather than returning failure. This reflects Winston Churchill's + * definition of a gentleman: "someone who is only rude intentionally". */ +static inline unsigned long +hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + /* "int" is the Intel instruction to trigger a trap. */ + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) + /* The call in %eax (aka "a") might be overwritten */ + : "=a"(call) + /* The arguments are in %eax, %edx, %ebx & %ecx */ + : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) + /* "memory" means this might write somewhere in memory. + * This isn't true for all calls, but it's safe to tell + * gcc that it might happen so it doesn't get clever. */ + : "memory"); + return call; +} +/*:*/ + +/* Can't use our min() macro here: needs to be a constant */ +#define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) + +#define LHCALL_RING_SIZE 64 +struct hcall_args { + /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ + unsigned long arg0, arg2, arg3, arg1; +}; + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h new file mode 100644 index 000000000000..f61ee8f937e4 --- /dev/null +++ b/arch/x86/include/asm/linkage.h @@ -0,0 +1,61 @@ +#ifndef _ASM_X86_LINKAGE_H +#define _ASM_X86_LINKAGE_H + +#undef notrace +#define notrace __attribute__((no_instrument_function)) + +#ifdef CONFIG_X86_64 +#define __ALIGN .p2align 4,,15 +#define __ALIGN_STR ".p2align 4,,15" +#endif + +#ifdef CONFIG_X86_32 +#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) +/* + * For 32-bit UML - mark functions implemented in assembly that use + * regparm input parameters: + */ +#define asmregparm __attribute__((regparm(3))) + +/* + * Make sure the compiler doesn't do anything stupid with the + * arguments on the stack - they are owned by the *caller*, not + * the callee. This just fools gcc into not spilling into them, + * and keeps it from doing tailcall recursion and/or using the + * stack slots for temporaries, since they are live and "used" + * all the way to the end of the function. + * + * NOTE! On x86-64, all the arguments are in registers, so this + * only matters on a 32-bit kernel. + */ +#define asmlinkage_protect(n, ret, args...) \ + __asmlinkage_protect##n(ret, ##args) +#define __asmlinkage_protect_n(ret, args...) \ + __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args) +#define __asmlinkage_protect0(ret) \ + __asmlinkage_protect_n(ret) +#define __asmlinkage_protect1(ret, arg1) \ + __asmlinkage_protect_n(ret, "g" (arg1)) +#define __asmlinkage_protect2(ret, arg1, arg2) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2)) +#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3)) +#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ + "g" (arg4)) +#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ + "g" (arg4), "g" (arg5)) +#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ + "g" (arg4), "g" (arg5), "g" (arg6)) + +#endif + +#ifdef CONFIG_X86_ALIGNMENT_16 +#define __ALIGN .align 16,0x90 +#define __ALIGN_STR ".align 16,0x90" +#endif + +#endif /* _ASM_X86_LINKAGE_H */ + diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h new file mode 100644 index 000000000000..47b9b6f19057 --- /dev/null +++ b/arch/x86/include/asm/local.h @@ -0,0 +1,235 @@ +#ifndef _ASM_X86_LOCAL_H +#define _ASM_X86_LOCAL_H + +#include <linux/percpu.h> + +#include <asm/system.h> +#include <asm/atomic.h> +#include <asm/asm.h> + +typedef struct { + atomic_long_t a; +} local_t; + +#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) } + +#define local_read(l) atomic_long_read(&(l)->a) +#define local_set(l, i) atomic_long_set(&(l)->a, (i)) + +static inline void local_inc(local_t *l) +{ + asm volatile(_ASM_INC "%0" + : "+m" (l->a.counter)); +} + +static inline void local_dec(local_t *l) +{ + asm volatile(_ASM_DEC "%0" + : "+m" (l->a.counter)); +} + +static inline void local_add(long i, local_t *l) +{ + asm volatile(_ASM_ADD "%1,%0" + : "+m" (l->a.counter) + : "ir" (i)); +} + +static inline void 
local_sub(long i, local_t *l) +{ + asm volatile(_ASM_SUB "%1,%0" + : "+m" (l->a.counter) + : "ir" (i)); +} + +/** + * local_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @l: pointer to type local_t + * + * Atomically subtracts @i from @l and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int local_sub_and_test(long i, local_t *l) +{ + unsigned char c; + + asm volatile(_ASM_SUB "%2,%0; sete %1" + : "+m" (l->a.counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * local_dec_and_test - decrement and test + * @l: pointer to type local_t + * + * Atomically decrements @l by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int local_dec_and_test(local_t *l) +{ + unsigned char c; + + asm volatile(_ASM_DEC "%0; sete %1" + : "+m" (l->a.counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * local_inc_and_test - increment and test + * @l: pointer to type local_t + * + * Atomically increments @l by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int local_inc_and_test(local_t *l) +{ + unsigned char c; + + asm volatile(_ASM_INC "%0; sete %1" + : "+m" (l->a.counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * local_add_negative - add and test if negative + * @i: integer value to add + * @l: pointer to type local_t + * + * Atomically adds @i to @l and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int local_add_negative(long i, local_t *l) +{ + unsigned char c; + + asm volatile(_ASM_ADD "%2,%0; sets %1" + : "+m" (l->a.counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * local_add_return - add and return + * @i: integer value to add + * @l: pointer to type local_t + * + * Atomically adds @i to @l and returns @i + @l + */ +static inline long local_add_return(long i, local_t *l) +{ + long __i; +#ifdef CONFIG_M386 + unsigned long flags; + if (unlikely(boot_cpu_data.x86 <= 3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + asm volatile(_ASM_XADD "%0, %1;" + : "+r" (i), "+m" (l->a.counter) + : : "memory"); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = local_read(l); + local_set(l, i + __i); + local_irq_restore(flags); + return i + __i; +#endif +} + +static inline long local_sub_return(long i, local_t *l) +{ + return local_add_return(-i, l); +} + +#define local_inc_return(l) (local_add_return(1, l)) +#define local_dec_return(l) (local_sub_return(1, l)) + +#define local_cmpxchg(l, o, n) \ + (cmpxchg_local(&((l)->a.counter), (o), (n))) +/* Always has a lock prefix */ +#define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) + +/** + * local_add_unless - add unless the number is a given value + * @l: pointer of type local_t + * @a: the amount to add to l... + * @u: ...unless l is equal to u. + * + * Atomically adds @a to @l, so long as it was not @u. + * Returns non-zero if @l was not @u, and zero otherwise. + */ +#define local_add_unless(l, a, u) \ +({ \ + long c, old; \ + c = local_read((l)); \ + for (;;) { \ + if (unlikely(c == (u))) \ + break; \ + old = local_cmpxchg((l), c, c + (a)); \ + if (likely(old == c)) \ + break; \ + c = old; \ + } \ + c != (u); \ +}) +#define local_inc_not_zero(l) local_add_unless((l), 1, 0) + +/* On x86_32, these are no better than the atomic variants. 
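[Editor's aside: the local_add_unless() macro above is the classic compare-and-swap retry loop. A minimal user-space model of the same pattern, with GCC's __sync_val_compare_and_swap standing in for cmpxchg_local() — an illustrative sketch, not the kernel code, and the names add_unless/v are invented for the demo:]

        #include <stdio.h>

        /* Add 'a' to *v unless *v == u; returns 1 if the add happened. */
        static int add_unless(long *v, long a, long u)
        {
                long c = *v;

                for (;;) {
                        long old;

                        if (c == u)             /* hit the forbidden value: give up */
                                return 0;
                        old = __sync_val_compare_and_swap(v, c, c + a);
                        if (old == c)           /* nobody raced us: done */
                                return 1;
                        c = old;                /* lost the race: retry with new value */
                }
        }

        int main(void)
        {
                long v = 0;

                printf("%d\n", add_unless(&v, 1, 0));   /* 0: v held the forbidden 0 */
                v = 5;
                printf("%d\n", add_unless(&v, 1, 0));   /* 1: v is now 6 */
                return 0;
        }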
+ * On x86-64 these are better than the atomic variants on SMP kernels + * because they dont use a lock prefix. + */ +#define __local_inc(l) local_inc(l) +#define __local_dec(l) local_dec(l) +#define __local_add(i, l) local_add((i), (l)) +#define __local_sub(i, l) local_sub((i), (l)) + +/* Use these for per-cpu local_t variables: on some archs they are + * much more efficient than these naive implementations. Note they take + * a variable, not an address. + * + * X86_64: This could be done better if we moved the per cpu data directly + * after GS. + */ + +/* Need to disable preemption for the cpu local counters otherwise we could + still access a variable of a previous CPU in a non atomic way. */ +#define cpu_local_wrap_v(l) \ +({ \ + local_t res__; \ + preempt_disable(); \ + res__ = (l); \ + preempt_enable(); \ + res__; \ +}) +#define cpu_local_wrap(l) \ +({ \ + preempt_disable(); \ + (l); \ + preempt_enable(); \ +}) \ + +#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l)))) +#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i))) +#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l)))) +#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l)))) +#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l)))) +#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l)))) + +#define __cpu_local_inc(l) cpu_local_inc((l)) +#define __cpu_local_dec(l) cpu_local_dec((l)) +#define __cpu_local_add(i, l) cpu_local_add((i), (l)) +#define __cpu_local_sub(i, l) cpu_local_sub((i), (l)) + +#endif /* _ASM_X86_LOCAL_H */ diff --git a/arch/x86/include/asm/mach-default/apm.h b/arch/x86/include/asm/mach-default/apm.h new file mode 100644 index 000000000000..20370c6db74b --- /dev/null +++ b/arch/x86/include/asm/mach-default/apm.h @@ -0,0 +1,73 @@ +/* + * Machine specific APM BIOS functions for generic. + * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp> + */ + +#ifndef _ASM_X86_MACH_DEFAULT_APM_H +#define _ASM_X86_MACH_DEFAULT_APM_H + +#ifdef APM_ZERO_SEGS +# define APM_DO_ZERO_SEGS \ + "pushl %%ds\n\t" \ + "pushl %%es\n\t" \ + "xorl %%edx, %%edx\n\t" \ + "mov %%dx, %%ds\n\t" \ + "mov %%dx, %%es\n\t" \ + "mov %%dx, %%fs\n\t" \ + "mov %%dx, %%gs\n\t" +# define APM_DO_POP_SEGS \ + "popl %%es\n\t" \ + "popl %%ds\n\t" +#else +# define APM_DO_ZERO_SEGS +# define APM_DO_POP_SEGS +#endif + +static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, + u32 *eax, u32 *ebx, u32 *ecx, + u32 *edx, u32 *esi) +{ + /* + * N.B. We do NOT need a cld after the BIOS call + * because we always save and restore the flags. + */ + __asm__ __volatile__(APM_DO_ZERO_SEGS + "pushl %%edi\n\t" + "pushl %%ebp\n\t" + "lcall *%%cs:apm_bios_entry\n\t" + "setc %%al\n\t" + "popl %%ebp\n\t" + "popl %%edi\n\t" + APM_DO_POP_SEGS + : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx), + "=S" (*esi) + : "a" (func), "b" (ebx_in), "c" (ecx_in) + : "memory", "cc"); +} + +static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, + u32 ecx_in, u32 *eax) +{ + int cx, dx, si; + u8 error; + + /* + * N.B. We do NOT need a cld after the BIOS call + * because we always save and restore the flags. 
+ */ + __asm__ __volatile__(APM_DO_ZERO_SEGS + "pushl %%edi\n\t" + "pushl %%ebp\n\t" + "lcall *%%cs:apm_bios_entry\n\t" + "setc %%bl\n\t" + "popl %%ebp\n\t" + "popl %%edi\n\t" + APM_DO_POP_SEGS + : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx), + "=S" (si) + : "a" (func), "b" (ebx_in), "c" (ecx_in) + : "memory", "cc"); + return error; +} + +#endif /* _ASM_X86_MACH_DEFAULT_APM_H */ diff --git a/arch/x86/include/asm/mach-default/do_timer.h b/arch/x86/include/asm/mach-default/do_timer.h new file mode 100644 index 000000000000..23ecda0b28a0 --- /dev/null +++ b/arch/x86/include/asm/mach-default/do_timer.h @@ -0,0 +1,16 @@ +/* defines for inline arch setup functions */ +#include <linux/clockchips.h> + +#include <asm/i8259.h> +#include <asm/i8253.h> + +/** + * do_timer_interrupt_hook - hook into timer tick + * + * Call the pit clock event handler. see asm/i8253.h + **/ + +static inline void do_timer_interrupt_hook(void) +{ + global_clock_event->event_handler(global_clock_event); +} diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h new file mode 100644 index 000000000000..6b1add8e31dd --- /dev/null +++ b/arch/x86/include/asm/mach-default/entry_arch.h @@ -0,0 +1,36 @@ +/* + * This file is designed to contain the BUILD_INTERRUPT specifications for + * all of the extra named interrupt vectors used by the architecture. + * Usually this is the Inter Process Interrupts (IPIs) + */ + +/* + * The following vectors are part of the Linux architecture, there + * is no hardware IRQ pin equivalent for them, they are triggered + * through the ICC by us (IPIs) + */ +#ifdef CONFIG_X86_SMP +BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) +BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) +BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) +BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) +BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) +#endif + +/* + * every pentium local APIC has two 'local interrupts', with a + * soft-definable vector attached to both interrupts, one of + * which is a timer interrupt, the other one is error counter + * overflow. 
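[Editor's aside: to make the LVT wiring concrete, a hedged sketch — not part of this patch — of how the timer entry is typically armed so that LOCAL_TIMER_VECTOR fires periodically. Register names come from <asm/apicdef.h>; the divisor and count are placeholders and the example_ function name is invented:]

        static void example_setup_lapic_timer(unsigned int clocks)
        {
                /* route the LVT timer entry to our vector, periodic mode */
                apic_write(APIC_LVTT, APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR);
                /* divide the bus clock by 16 */
                apic_write(APIC_TDCR, APIC_TDR_DIV_16);
                /* load the initial count; the vector fires when it reaches zero */
                apic_write(APIC_TMICT, clocks);
        }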
Linux uses the local APIC timer interrupt to get + * a much simpler SMP time architecture: + */ +#ifdef CONFIG_X86_LOCAL_APIC +BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) +BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) +BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) + +#ifdef CONFIG_X86_MCE_P4THERMAL +BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) +#endif + +#endif diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h new file mode 100644 index 000000000000..ff3a6c236c00 --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -0,0 +1,156 @@ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H +#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H + +#ifdef CONFIG_X86_LOCAL_APIC + +#include <mach_apicdef.h> +#include <asm/smp.h> + +#define APIC_DFR_VALUE (APIC_DFR_FLAT) + +static inline cpumask_t target_cpus(void) +{ +#ifdef CONFIG_SMP + return cpu_online_map; +#else + return cpumask_of_cpu(0); +#endif +} + +#define NO_BALANCE_IRQ (0) +#define esr_disable (0) + +#ifdef CONFIG_X86_64 +#include <asm/genapic.h> +#define INT_DELIVERY_MODE (genapic->int_delivery_mode) +#define INT_DEST_MODE (genapic->int_dest_mode) +#define TARGET_CPUS (genapic->target_cpus()) +#define apic_id_registered (genapic->apic_id_registered) +#define init_apic_ldr (genapic->init_apic_ldr) +#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define phys_pkg_id (genapic->phys_pkg_id) +#define vector_allocation_domain (genapic->vector_allocation_domain) +#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) +#define send_IPI_self (genapic->send_IPI_self) +extern void setup_apic_routing(void); +#else +#define INT_DELIVERY_MODE dest_LowestPrio +#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ +#define TARGET_CPUS (target_cpus()) +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static inline void init_apic_ldr(void) +{ + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); +} + +static inline int apic_id_registered(void) +{ + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + return cpus_addr(cpumask)[0]; +} + +static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +static inline void setup_apic_routing(void) +{ +#ifdef CONFIG_X86_IO_APIC + printk("Enabling APIC mode: %s. Using %d I/O APICs\n", + "Flat", nr_ioapics); +#endif +} + +static inline int apicid_to_node(int logical_apicid) +{ +#ifdef CONFIG_SMP + return apicid_2_node[hard_smp_processor_id()]; +#else + return 0; +#endif +} + +static inline cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. 
+ */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} +#endif + +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return physid_isset(apicid, bitmap); +} + +static inline unsigned long check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} + +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +{ + return phys_map; +} + +static inline int multi_timer_check(int apic, int irq) +{ + return 0; +} + +/* Mapping from cpu number to logical apicid */ +static inline int cpu_to_logical_apicid(int cpu) +{ + return 1 << cpu; +} + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < NR_CPUS && cpu_present(mps_cpu)) + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; +} + +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +{ + return physid_mask_of_physid(phys_apicid); +} + +static inline void setup_portio_remap(void) +{ +} + +static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +{ + return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); +} + +static inline void enable_apic_mode(void) +{ +} +#endif /* CONFIG_X86_LOCAL_APIC */ +#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */ diff --git a/arch/x86/include/asm/mach-default/mach_apicdef.h b/arch/x86/include/asm/mach-default/mach_apicdef.h new file mode 100644 index 000000000000..53179936d6c6 --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_apicdef.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H +#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H + +#include <asm/apic.h> + +#ifdef CONFIG_X86_64 +#define APIC_ID_MASK (genapic->apic_id_mask) +#define GET_APIC_ID(x) (genapic->get_apic_id(x)) +#define SET_APIC_ID(x) (genapic->set_apic_id(x)) +#else +#define APIC_ID_MASK (0xF<<24) +static inline unsigned get_apic_id(unsigned long x) +{ + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (APIC_XAPIC(ver)) + return (((x)>>24)&0xFF); + else + return (((x)>>24)&0xF); +} + +#define GET_APIC_ID(x) get_apic_id(x) +#endif + +#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */ diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h new file mode 100644 index 000000000000..fabca01ebacf --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_ipi.h @@ -0,0 +1,64 @@ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H +#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H + +/* Avoid include hell */ +#define NMI_VECTOR 0x02 + +void send_IPI_mask_bitmask(cpumask_t mask, int vector); +void __send_IPI_shortcut(unsigned int shortcut, int vector); + +extern int no_broadcast; + +#ifdef CONFIG_X86_64 +#include <asm/genapic.h> +#define send_IPI_mask (genapic->send_IPI_mask) +#else +static inline void send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_bitmask(mask, vector); +} +#endif + +static inline void __local_send_IPI_allbutself(int vector) +{ + if (no_broadcast || vector == NMI_VECTOR) { + cpumask_t mask = cpu_online_map; + + cpu_clear(smp_processor_id(), mask); + send_IPI_mask(mask, vector); + } else + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); +} + +static inline void __local_send_IPI_all(int vector) +{ + if (no_broadcast || vector == NMI_VECTOR) + send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector); +} + +#ifdef CONFIG_X86_64 +#define send_IPI_allbutself (genapic->send_IPI_allbutself) +#define send_IPI_all (genapic->send_IPI_all) +#else +static inline void 
send_IPI_allbutself(int vector) +{ + /* + * if there are no other CPUs in the system then we get an APIC send + * error if we try to broadcast, thus avoid sending IPIs in this case. + */ + if (!(num_online_cpus() > 1)) + return; + + __local_send_IPI_allbutself(vector); + return; +} + +static inline void send_IPI_all(int vector) +{ + __local_send_IPI_all(vector); +} +#endif + +#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */ diff --git a/arch/x86/include/asm/mach-default/mach_mpparse.h b/arch/x86/include/asm/mach-default/mach_mpparse.h new file mode 100644 index 000000000000..8c1ea21238a7 --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_mpparse.h @@ -0,0 +1,17 @@ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H +#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H + +static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + return 0; +} + +/* Hook from generic ACPI tables.c */ +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 0; +} + + +#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */ diff --git a/arch/x86/include/asm/mach-default/mach_mpspec.h b/arch/x86/include/asm/mach-default/mach_mpspec.h new file mode 100644 index 000000000000..e85ede686be8 --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_mpspec.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H +#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H + +#define MAX_IRQ_SOURCES 256 + +#if CONFIG_BASE_SMALL == 0 +#define MAX_MP_BUSSES 256 +#else +#define MAX_MP_BUSSES 32 +#endif + +#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */ diff --git a/arch/x86/include/asm/mach-default/mach_timer.h b/arch/x86/include/asm/mach-default/mach_timer.h new file mode 100644 index 000000000000..853728519ae9 --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_timer.h @@ -0,0 +1,48 @@ +/* + * Machine specific calibrate_tsc() for generic. + * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp> + */ +/* ------ Calibrate the TSC ------- + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). + * Too much 64-bit arithmetic here to do this cleanly in C, and for + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) + * output busy loop as low as possible. We avoid reading the CTC registers + * directly because of the awkward 8-bit access mechanism of the 82C54 + * device. + */ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H +#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H + +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ +#define CALIBRATE_LATCH \ + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) + +static inline void mach_prepare_counter(void) +{ + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Now let's take care of CTC channel 2 + * + * Set the Gate high, program CTC channel 2 for mode 0, + * (interrupt on terminal count mode), binary count, + * load 5 * LATCH count, (LSB and MSB) to begin countdown. + * + * Some devices need a delay here. 
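[Editor's aside: a sketch of how a caller pairs the two helpers in this file, modelled on the old calibrate_tsc() usage — illustrative only; rdtscll() is from <asm/msr.h> and the example_ name is invented:]

        static unsigned long long example_calibrate_tsc(void)
        {
                unsigned long long start, end;
                unsigned long count;

                mach_prepare_counter();         /* arm CTC channel 2 with CALIBRATE_LATCH */
                rdtscll(start);
                mach_countup(&count);           /* spin until the terminal count is reached */
                rdtscll(end);

                return end - start;             /* TSC cycles per CALIBRATE_TIME_MSEC */
        }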
+ */ + outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */ + outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */ + outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */ +} + +static inline void mach_countup(unsigned long *count_p) +{ + unsigned long count = 0; + do { + count++; + } while ((inb_p(0x61) & 0x20) == 0); + *count_p = count; +} + +#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */ diff --git a/arch/x86/include/asm/mach-default/mach_traps.h b/arch/x86/include/asm/mach-default/mach_traps.h new file mode 100644 index 000000000000..f7920601e472 --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_traps.h @@ -0,0 +1,33 @@ +/* + * Machine specific NMI handling for generic. + * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp> + */ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H +#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H + +#include <asm/mc146818rtc.h> + +static inline unsigned char get_nmi_reason(void) +{ + return inb(0x61); +} + +static inline void reassert_nmi(void) +{ + int old_reg = -1; + + if (do_i_have_lock_cmos()) + old_reg = current_lock_cmos_reg(); + else + lock_cmos(0); /* register doesn't matter here */ + outb(0x8f, 0x70); + inb(0x71); /* dummy */ + outb(0x0f, 0x70); + inb(0x71); /* dummy */ + if (old_reg >= 0) + outb(old_reg, 0x70); + else + unlock_cmos(); +} + +#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */ diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h new file mode 100644 index 000000000000..d5c0b826a4ff --- /dev/null +++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h @@ -0,0 +1,42 @@ +#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H +#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H + +/* + * This file copes with machines that wakeup secondary CPUs by the + * INIT, INIT, STARTUP sequence. 
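[Editor's aside: for reference, a condensed, hedged outline of that INIT/INIT/STARTUP sequence as the smpboot path issues it through the ICR — status polling and most delays omitted, illustrative only, example_ name invented:]

        static void example_wakeup_secondary(int apicid, unsigned long start_eip)
        {
                /* assert INIT, level triggered, then de-assert it */
                apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
                apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
                udelay(200);
                apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);

                /* two STARTUP IPIs carrying the trampoline's page number */
                apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
                apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
                udelay(300);
                apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
                apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
        }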
+ */ + +#define WAKE_SECONDARY_VIA_INIT + +#define TRAMPOLINE_LOW phys_to_virt(0x467) +#define TRAMPOLINE_HIGH phys_to_virt(0x469) + +#define boot_cpu_apicid boot_cpu_physical_apicid + +static inline void wait_for_init_deassert(atomic_t *deassert) +{ + while (!atomic_read(deassert)) + cpu_relax(); + return; +} + +/* Nothing to do for most platforms, since cleared by the INIT cycle */ +static inline void smp_callin_clear_local_apic(void) +{ +} + +static inline void store_NMI_vector(unsigned short *high, unsigned short *low) +{ +} + +static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) +{ +} + +#if APIC_DEBUG + #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid) +#else + #define inquire_remote_apic(apicid) {} +#endif + +#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/mach-default/pci-functions.h b/arch/x86/include/asm/mach-default/pci-functions.h new file mode 100644 index 000000000000..ed0bab427354 --- /dev/null +++ b/arch/x86/include/asm/mach-default/pci-functions.h @@ -0,0 +1,19 @@ +/* + * PCI BIOS function numbering for conventional PCI BIOS + * systems + */ + +#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX +#define PCIBIOS_PCI_BIOS_PRESENT 0xb101 +#define PCIBIOS_FIND_PCI_DEVICE 0xb102 +#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103 +#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106 +#define PCIBIOS_READ_CONFIG_BYTE 0xb108 +#define PCIBIOS_READ_CONFIG_WORD 0xb109 +#define PCIBIOS_READ_CONFIG_DWORD 0xb10a +#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b +#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c +#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d +#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e +#define PCIBIOS_SET_PCI_HW_INT 0xb10f + diff --git a/arch/x86/include/asm/mach-default/setup_arch.h b/arch/x86/include/asm/mach-default/setup_arch.h new file mode 100644 index 000000000000..38846208b548 --- /dev/null +++ b/arch/x86/include/asm/mach-default/setup_arch.h @@ -0,0 +1,3 @@ +/* Hook to call BIOS initialisation function */ + +/* no action for generic */ diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h new file mode 100644 index 000000000000..dbab36d64d48 --- /dev/null +++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h @@ -0,0 +1,59 @@ +/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws + * which needs to alter them. */ + +static inline void smpboot_clear_io_apic_irqs(void) +{ +#ifdef CONFIG_X86_IO_APIC + io_apic_irqs = 0; +#endif +} + +static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) +{ + CMOS_WRITE(0xa, 0xf); + local_flush_tlb(); + pr_debug("1.\n"); + *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; + pr_debug("2.\n"); + *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; + pr_debug("3.\n"); +} + +static inline void smpboot_restore_warm_reset_vector(void) +{ + /* + * Install writable page 0 entry to set BIOS data area. + */ + local_flush_tlb(); + + /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ + CMOS_WRITE(0, 0xf); + + *((volatile long *) phys_to_virt(0x467)) = 0; +} + +static inline void __init smpboot_setup_io_apic(void) +{ +#ifdef CONFIG_X86_IO_APIC + /* + * Here we can be sure that there is an IO-APIC in the system. 
Let's + * go and set it up: + */ + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else { + nr_ioapics = 0; + localise_nmi_watchdog(); + } +#endif +} + +static inline void smpboot_clear_io_apic(void) +{ +#ifdef CONFIG_X86_IO_APIC + nr_ioapics = 0; +#endif +} diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h new file mode 100644 index 000000000000..995c45efdb33 --- /dev/null +++ b/arch/x86/include/asm/mach-generic/gpio.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_MACH_GENERIC_GPIO_H +#define _ASM_X86_MACH_GENERIC_GPIO_H + +int gpio_request(unsigned gpio, const char *label); +void gpio_free(unsigned gpio); +int gpio_direction_input(unsigned gpio); +int gpio_direction_output(unsigned gpio, int value); +int gpio_get_value(unsigned gpio); +void gpio_set_value(unsigned gpio, int value); +int gpio_to_irq(unsigned gpio); +int irq_to_gpio(unsigned irq); + +#include <asm-generic/gpio.h> /* cansleep wrappers */ + +#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */ diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h new file mode 100644 index 000000000000..5180bd7478fb --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_apic.h @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H +#define _ASM_X86_MACH_GENERIC_MACH_APIC_H + +#include <asm/genapic.h> + +#define esr_disable (genapic->ESR_DISABLE) +#define NO_BALANCE_IRQ (genapic->no_balance_irq) +#define INT_DELIVERY_MODE (genapic->int_delivery_mode) +#define INT_DEST_MODE (genapic->int_dest_mode) +#undef APIC_DEST_LOGICAL +#define APIC_DEST_LOGICAL (genapic->apic_destination_logical) +#define TARGET_CPUS (genapic->target_cpus()) +#define apic_id_registered (genapic->apic_id_registered) +#define init_apic_ldr (genapic->init_apic_ldr) +#define ioapic_phys_id_map (genapic->ioapic_phys_id_map) +#define setup_apic_routing (genapic->setup_apic_routing) +#define multi_timer_check (genapic->multi_timer_check) +#define apicid_to_node (genapic->apicid_to_node) +#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid) +#define cpu_present_to_apicid (genapic->cpu_present_to_apicid) +#define apicid_to_cpu_present (genapic->apicid_to_cpu_present) +#define setup_portio_remap (genapic->setup_portio_remap) +#define check_apicid_present (genapic->check_apicid_present) +#define check_phys_apicid_present (genapic->check_phys_apicid_present) +#define check_apicid_used (genapic->check_apicid_used) +#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define vector_allocation_domain (genapic->vector_allocation_domain) +#define enable_apic_mode (genapic->enable_apic_mode) +#define phys_pkg_id (genapic->phys_pkg_id) + +extern void generic_bigsmp_probe(void); + +#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ diff --git a/arch/x86/include/asm/mach-generic/mach_apicdef.h b/arch/x86/include/asm/mach-generic/mach_apicdef.h new file mode 100644 index 000000000000..68041f3802f4 --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_apicdef.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H +#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H + +#ifndef APIC_DEFINITION +#include <asm/genapic.h> + +#define GET_APIC_ID (genapic->get_apic_id) +#define APIC_ID_MASK (genapic->apic_id_mask) +#endif + +#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */ diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h new file mode 100644 index 000000000000..ffd637e3c3d9 --- /dev/null +++ 
b/arch/x86/include/asm/mach-generic/mach_ipi.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H +#define _ASM_X86_MACH_GENERIC_MACH_IPI_H + +#include <asm/genapic.h> + +#define send_IPI_mask (genapic->send_IPI_mask) +#define send_IPI_allbutself (genapic->send_IPI_allbutself) +#define send_IPI_all (genapic->send_IPI_all) + +#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */ diff --git a/arch/x86/include/asm/mach-generic/mach_mpparse.h b/arch/x86/include/asm/mach-generic/mach_mpparse.h new file mode 100644 index 000000000000..048f1d468535 --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_mpparse.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H +#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H + + +extern int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid); + +extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id); + +#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */ diff --git a/arch/x86/include/asm/mach-generic/mach_mpspec.h b/arch/x86/include/asm/mach-generic/mach_mpspec.h new file mode 100644 index 000000000000..bbab5ccfd4fe --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_mpspec.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H +#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H + +#define MAX_IRQ_SOURCES 256 + +/* Summit or generic (i.e. installer) kernels need lots of bus entries. */ +/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */ +#define MAX_MP_BUSSES 260 + +extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid); +#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */ diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h new file mode 100644 index 000000000000..c210ab5788b0 --- /dev/null +++ b/arch/x86/include/asm/mach-rdc321x/gpio.h @@ -0,0 +1,60 @@ +#ifndef _ASM_X86_MACH_RDC321X_GPIO_H +#define _ASM_X86_MACH_RDC321X_GPIO_H + +#include <linux/kernel.h> + +extern int rdc_gpio_get_value(unsigned gpio); +extern void rdc_gpio_set_value(unsigned gpio, int value); +extern int rdc_gpio_direction_input(unsigned gpio); +extern int rdc_gpio_direction_output(unsigned gpio, int value); +extern int rdc_gpio_request(unsigned gpio, const char *label); +extern void rdc_gpio_free(unsigned gpio); +extern void __init rdc321x_gpio_setup(void); + +/* Wrappers for the arch-neutral GPIO API */ + +static inline int gpio_request(unsigned gpio, const char *label) +{ + return rdc_gpio_request(gpio, label); +} + +static inline void gpio_free(unsigned gpio) +{ + might_sleep(); + rdc_gpio_free(gpio); +} + +static inline int gpio_direction_input(unsigned gpio) +{ + return rdc_gpio_direction_input(gpio); +} + +static inline int gpio_direction_output(unsigned gpio, int value) +{ + return rdc_gpio_direction_output(gpio, value); +} + +static inline int gpio_get_value(unsigned gpio) +{ + return rdc_gpio_get_value(gpio); +} + +static inline void gpio_set_value(unsigned gpio, int value) +{ + rdc_gpio_set_value(gpio, value); +} + +static inline int gpio_to_irq(unsigned gpio) +{ + return gpio; +} + +static inline int irq_to_gpio(unsigned irq) +{ + return irq; +} + +/* For cansleep */ +#include <asm-generic/gpio.h> + +#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */ diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h new file mode 100644 index 000000000000..c8e9c8bed3d0 --- /dev/null +++ b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h @@ -0,0 +1,12 @@ +#define PFX "rdc321x: " + +/* General purpose 
configuration and data registers */ +#define RDC3210_CFGREG_ADDR 0x0CF8 +#define RDC3210_CFGREG_DATA 0x0CFC + +#define RDC321X_GPIO_CTRL_REG1 0x48 +#define RDC321X_GPIO_CTRL_REG2 0x84 +#define RDC321X_GPIO_DATA_REG1 0x4c +#define RDC321X_GPIO_DATA_REG2 0x88 + +#define RDC321X_MAX_GPIO 58 diff --git a/arch/x86/include/asm/mach-voyager/do_timer.h b/arch/x86/include/asm/mach-voyager/do_timer.h new file mode 100644 index 000000000000..9e5a459fd15b --- /dev/null +++ b/arch/x86/include/asm/mach-voyager/do_timer.h @@ -0,0 +1,17 @@ +/* defines for inline arch setup functions */ +#include <linux/clockchips.h> + +#include <asm/voyager.h> +#include <asm/i8253.h> + +/** + * do_timer_interrupt_hook - hook into timer tick + * + * Call the pit clock event handler. see asm/i8253.h + **/ +static inline void do_timer_interrupt_hook(void) +{ + global_clock_event->event_handler(global_clock_event); + voyager_timer_interrupt(); +} + diff --git a/arch/x86/include/asm/mach-voyager/entry_arch.h b/arch/x86/include/asm/mach-voyager/entry_arch.h new file mode 100644 index 000000000000..ae52624b5937 --- /dev/null +++ b/arch/x86/include/asm/mach-voyager/entry_arch.h @@ -0,0 +1,26 @@ +/* -*- mode: c; c-basic-offset: 8 -*- */ + +/* Copyright (C) 2002 + * + * Author: James.Bottomley@HansenPartnership.com + * + * linux/arch/i386/voyager/entry_arch.h + * + * This file builds the VIC and QIC CPI gates + */ + +/* initialise the voyager interrupt gates + * + * This uses the macros in irq.h to set up assembly jump gates. The + * calls are then redirected to the same routine with smp_ prefixed */ +BUILD_INTERRUPT(vic_sys_interrupt, VIC_SYS_INT) +BUILD_INTERRUPT(vic_cmn_interrupt, VIC_CMN_INT) +BUILD_INTERRUPT(vic_cpi_interrupt, VIC_CPI_LEVEL0); + +/* do all the QIC interrupts */ +BUILD_INTERRUPT(qic_timer_interrupt, QIC_TIMER_CPI); +BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI); +BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI); +BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI); +BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI); +BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI); diff --git a/arch/x86/include/asm/mach-voyager/setup_arch.h b/arch/x86/include/asm/mach-voyager/setup_arch.h new file mode 100644 index 000000000000..71729ca05cd7 --- /dev/null +++ b/arch/x86/include/asm/mach-voyager/setup_arch.h @@ -0,0 +1,12 @@ +#include <asm/voyager.h> +#include <asm/setup.h> +#define VOYAGER_BIOS_INFO ((struct voyager_bios_info *) \ + (&boot_params.apm_bios_info)) + +/* Hook to call BIOS initialisation function */ + +/* for voyager, pass the voyager BIOS/SUS info area to the detection + * routines */ + +#define ARCH_SETUP voyager_detect(VOYAGER_BIOS_INFO); + diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h new file mode 100644 index 000000000000..5a65b107ad58 --- /dev/null +++ b/arch/x86/include/asm/math_emu.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_MATH_EMU_H +#define _ASM_X86_MATH_EMU_H + +/* This structure matches the layout of the data saved to the stack + following a device-not-present interrupt, part of it saved + automatically by the 80386/80486. 
+ */ +struct info { + long ___orig_eip; + long ___ebx; + long ___ecx; + long ___edx; + long ___esi; + long ___edi; + long ___ebp; + long ___eax; + long ___ds; + long ___es; + long ___fs; + long ___orig_eax; + long ___eip; + long ___cs; + long ___eflags; + long ___esp; + long ___ss; + long ___vm86_es; /* This and the following only in vm86 mode */ + long ___vm86_ds; + long ___vm86_fs; + long ___vm86_gs; +}; +#endif /* _ASM_X86_MATH_EMU_H */ diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h new file mode 100644 index 000000000000..01fdf5674e24 --- /dev/null +++ b/arch/x86/include/asm/mc146818rtc.h @@ -0,0 +1,104 @@ +/* + * Machine dependent access functions for RTC registers. + */ +#ifndef _ASM_X86_MC146818RTC_H +#define _ASM_X86_MC146818RTC_H + +#include <asm/io.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <linux/mc146818rtc.h> + +#ifndef RTC_PORT +#define RTC_PORT(x) (0x70 + (x)) +#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */ +#endif + +#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG) +/* + * This lock provides nmi access to the CMOS/RTC registers. It has some + * special properties. It is owned by a CPU and stores the index register + * currently being accessed (if owned). The idea here is that it works + * like a normal lock (normally). However, in an NMI, the NMI code will + * first check to see if its CPU owns the lock, meaning that the NMI + * interrupted during the read/write of the device. If it does, it goes ahead + * and performs the access and then restores the index register. If it does + * not, it locks normally. + * + * Note that since we are working with NMIs, we need this lock even in + * a non-SMP machine just to mark that the lock is owned. + * + * This only works with compare-and-swap. There is no other way to + * atomically claim the lock and set the owner. + */ +#include <linux/smp.h> +extern volatile unsigned long cmos_lock; + +/* + * All of these below must be called with interrupts off, preempt + * disabled, etc. + */ + +static inline void lock_cmos(unsigned char reg) +{ + unsigned long new; + new = ((smp_processor_id() + 1) << 8) | reg; + for (;;) { + if (cmos_lock) { + cpu_relax(); + continue; + } + if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0) + return; + } +} + +static inline void unlock_cmos(void) +{ + cmos_lock = 0; +} + +static inline int do_i_have_lock_cmos(void) +{ + return (cmos_lock >> 8) == (smp_processor_id() + 1); +} + +static inline unsigned char current_lock_cmos_reg(void) +{ + return cmos_lock & 0xff; +} + +#define lock_cmos_prefix(reg) \ + do { \ + unsigned long cmos_flags; \ + local_irq_save(cmos_flags); \ + lock_cmos(reg) + +#define lock_cmos_suffix(reg) \ + unlock_cmos(); \ + local_irq_restore(cmos_flags); \ + } while (0) +#else +#define lock_cmos_prefix(reg) do {} while (0) +#define lock_cmos_suffix(reg) do {} while (0) +#define lock_cmos(reg) +#define unlock_cmos() +#define do_i_have_lock_cmos() 0 +#define current_lock_cmos_reg() 0 +#endif + +/* + * The yet supported machines all access the RTC index register via + * an ISA port access but the way to access the date register differs ... 
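[Editor's aside: to show the prefix/suffix pairing in action, a sketch close to the real rtc_cmos_read() in arch/x86/kernel/rtc.c — shown only to illustrate how the NMI-aware lock above brackets the index/data access; the example_ name is not in the tree:]

        unsigned char example_cmos_read(unsigned char addr)
        {
                unsigned char val;

                lock_cmos_prefix(addr);         /* irqs off, take the NMI-aware lock */
                outb(addr, RTC_PORT(0));        /* select the index register */
                val = inb(RTC_PORT(1));         /* read the data register */
                lock_cmos_suffix(addr);         /* drop the lock, restore irqs */
                return val;
        }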
+ */ +#define CMOS_READ(addr) rtc_cmos_read(addr) +#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr) +unsigned char rtc_cmos_read(unsigned char addr); +void rtc_cmos_write(unsigned char val, unsigned char addr); + +extern int mach_set_rtc_mmss(unsigned long nowtime); +extern unsigned long mach_get_cmos_time(void); + +#define RTC_IRQ 8 + +#endif /* _ASM_X86_MC146818RTC_H */ diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h new file mode 100644 index 000000000000..eedbb6cc1efb --- /dev/null +++ b/arch/x86/include/asm/mca.h @@ -0,0 +1,43 @@ +/* -*- mode: c; c-basic-offset: 8 -*- */ + +/* Platform specific MCA defines */ +#ifndef _ASM_X86_MCA_H +#define _ASM_X86_MCA_H + +/* Maximal number of MCA slots - actually, some machines have less, but + * they all have sufficient number of POS registers to cover 8. + */ +#define MCA_MAX_SLOT_NR 8 + +/* Most machines have only one MCA bus. The only multiple bus machines + * I know have at most two */ +#define MAX_MCA_BUSSES 2 + +#define MCA_PRIMARY_BUS 0 +#define MCA_SECONDARY_BUS 1 + +/* Dummy slot numbers on primary MCA for integrated functions */ +#define MCA_INTEGSCSI (MCA_MAX_SLOT_NR) +#define MCA_INTEGVIDEO (MCA_MAX_SLOT_NR+1) +#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2) + +/* Dummy POS values for integrated functions */ +#define MCA_DUMMY_POS_START 0x10000 +#define MCA_INTEGSCSI_POS (MCA_DUMMY_POS_START+1) +#define MCA_INTEGVIDEO_POS (MCA_DUMMY_POS_START+2) +#define MCA_MOTHERBOARD_POS (MCA_DUMMY_POS_START+3) + +/* MCA registers */ + +#define MCA_MOTHERBOARD_SETUP_REG 0x94 +#define MCA_ADAPTER_SETUP_REG 0x96 +#define MCA_POS_REG(n) (0x100+(n)) + +#define MCA_ENABLED 0x01 /* POS 2, set if adapter enabled */ + +/* Max number of adapters, including both slots and various integrated + * things. + */ +#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3) + +#endif /* _ASM_X86_MCA_H */ diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h new file mode 100644 index 000000000000..45271aef82dd --- /dev/null +++ b/arch/x86/include/asm/mca_dma.h @@ -0,0 +1,201 @@ +#ifndef _ASM_X86_MCA_DMA_H +#define _ASM_X86_MCA_DMA_H + +#include <asm/io.h> +#include <linux/ioport.h> + +/* + * Microchannel specific DMA stuff. DMA on an MCA machine is fairly similar to + * standard PC dma, but it certainly has its quirks. DMA register addresses + * are in a different place and there are some added functions. Most of this + * should be pretty obvious on inspection. Note that the user must divide + * count by 2 when using 16-bit dma; that is not handled by these functions. + * + * Ramen Noodles are yummy. + * + * 1998 Tymm Twillman <tymm@computer.org> + */ + +/* + * Registers that are used by the DMA controller; FN is the function register + * (tell the controller what to do) and EXE is the execution register (how + * to do it) + */ + +#define MCA_DMA_REG_FN 0x18 +#define MCA_DMA_REG_EXE 0x1A + +/* + * Functions that the DMA controller can do + */ + +#define MCA_DMA_FN_SET_IO 0x00 +#define MCA_DMA_FN_SET_ADDR 0x20 +#define MCA_DMA_FN_GET_ADDR 0x30 +#define MCA_DMA_FN_SET_COUNT 0x40 +#define MCA_DMA_FN_GET_COUNT 0x50 +#define MCA_DMA_FN_GET_STATUS 0x60 +#define MCA_DMA_FN_SET_MODE 0x70 +#define MCA_DMA_FN_SET_ARBUS 0x80 +#define MCA_DMA_FN_MASK 0x90 +#define MCA_DMA_FN_RESET_MASK 0xA0 +#define MCA_DMA_FN_MASTER_CLEAR 0xD0 + +/* + * Modes (used by setting MCA_DMA_FN_MODE in the function register) + * + * Note that the MODE_READ is read from memory (write to device), and + * MODE_WRITE is vice-versa. 
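[Editor's aside: a hedged driver-side sketch — not part of this patch — of how the mode bits and helpers defined below compose for one 16-bit memory-to-device transfer. Per the comment above, MODE_XFER reads from memory, and the caller divides the byte count by two in 16-bit mode; the example_ name is invented:]

        static void example_mca_dma_to_device(unsigned int chan,
                                              unsigned int bus_addr,
                                              unsigned int bytes)
        {
                mca_disable_dma(chan);                  /* mask while reprogramming */
                mca_set_dma_mode(chan, MCA_DMA_MODE_XFER | MCA_DMA_MODE_16);
                mca_set_dma_addr(chan, bus_addr);       /* 24-bit bus address */
                mca_set_dma_count(chan, bytes / 2);     /* 16-bit mode: count in words */
                mca_enable_dma(chan);                   /* unmask: transfer may start */
        }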
+ */ + +#define MCA_DMA_MODE_XFER 0x04 /* read by default */ +#define MCA_DMA_MODE_READ 0x04 /* same as XFER */ +#define MCA_DMA_MODE_WRITE 0x08 /* OR with MODE_XFER to use */ +#define MCA_DMA_MODE_IO 0x01 /* DMA from IO register */ +#define MCA_DMA_MODE_16 0x40 /* 16 bit xfers */ + + +/** + * mca_enable_dma - channel to enable DMA on + * @dmanr: DMA channel + * + * Enable the MCA bus DMA on a channel. This can be called from + * IRQ context. + */ + +static inline void mca_enable_dma(unsigned int dmanr) +{ + outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN); +} + +/** + * mca_disble_dma - channel to disable DMA on + * @dmanr: DMA channel + * + * Enable the MCA bus DMA on a channel. This can be called from + * IRQ context. + */ + +static inline void mca_disable_dma(unsigned int dmanr) +{ + outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN); +} + +/** + * mca_set_dma_addr - load a 24bit DMA address + * @dmanr: DMA channel + * @a: 24bit bus address + * + * Load the address register in the DMA controller. This has a 24bit + * limitation (16Mb). + */ + +static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a) +{ + outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN); + outb(a & 0xff, MCA_DMA_REG_EXE); + outb((a >> 8) & 0xff, MCA_DMA_REG_EXE); + outb((a >> 16) & 0xff, MCA_DMA_REG_EXE); +} + +/** + * mca_get_dma_addr - load a 24bit DMA address + * @dmanr: DMA channel + * + * Read the address register in the DMA controller. This has a 24bit + * limitation (16Mb). The return is a bus address. + */ + +static inline unsigned int mca_get_dma_addr(unsigned int dmanr) +{ + unsigned int addr; + + outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN); + addr = inb(MCA_DMA_REG_EXE); + addr |= inb(MCA_DMA_REG_EXE) << 8; + addr |= inb(MCA_DMA_REG_EXE) << 16; + + return addr; +} + +/** + * mca_set_dma_count - load a 16bit transfer count + * @dmanr: DMA channel + * @count: count + * + * Set the DMA count for this channel. This can be up to 64Kbytes. + * Setting a count of zero will not do what you expect. + */ + +static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count) +{ + count--; /* transfers one more than count -- correct for this */ + + outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN); + outb(count & 0xff, MCA_DMA_REG_EXE); + outb((count >> 8) & 0xff, MCA_DMA_REG_EXE); +} + +/** + * mca_get_dma_residue - get the remaining bytes to transfer + * @dmanr: DMA channel + * + * This function returns the number of bytes left to transfer + * on this DMA channel. + */ + +static inline unsigned int mca_get_dma_residue(unsigned int dmanr) +{ + unsigned short count; + + outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN); + count = 1 + inb(MCA_DMA_REG_EXE); + count += inb(MCA_DMA_REG_EXE) << 8; + + return count; +} + +/** + * mca_set_dma_io - set the port for an I/O transfer + * @dmanr: DMA channel + * @io_addr: an I/O port number + * + * Unlike the ISA bus DMA controllers the DMA on MCA bus can transfer + * with an I/O port target. + */ + +static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr) +{ + /* + * DMA from a port address -- set the io address + */ + + outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN); + outb(io_addr & 0xff, MCA_DMA_REG_EXE); + outb((io_addr >> 8) & 0xff, MCA_DMA_REG_EXE); +} + +/** + * mca_set_dma_mode - set the DMA mode + * @dmanr: DMA channel + * @mode: mode to set + * + * The DMA controller supports several modes. The mode values you can + * set are- + * + * %MCA_DMA_MODE_READ when reading from the DMA device. 
+ * + * %MCA_DMA_MODE_WRITE to writing to the DMA device. + * + * %MCA_DMA_MODE_IO to do DMA to or from an I/O port. + * + * %MCA_DMA_MODE_16 to do 16bit transfers. + */ + +static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode) +{ + outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN); + outb(mode, MCA_DMA_REG_EXE); +} + +#endif /* _ASM_X86_MCA_DMA_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h new file mode 100644 index 000000000000..1d6e17c2f23a --- /dev/null +++ b/arch/x86/include/asm/mce.h @@ -0,0 +1,130 @@ +#ifndef _ASM_X86_MCE_H +#define _ASM_X86_MCE_H + +#ifdef __x86_64__ + +#include <asm/ioctls.h> +#include <asm/types.h> + +/* + * Machine Check support for x86 + */ + +#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ + +#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ +#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ +#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ + +#define MCI_STATUS_VAL (1UL<<63) /* valid error */ +#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ +#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ +#define MCI_STATUS_EN (1UL<<60) /* error enabled */ +#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */ +#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ + +/* Fields are zero when not available */ +struct mce { + __u64 status; + __u64 misc; + __u64 addr; + __u64 mcgstatus; + __u64 ip; + __u64 tsc; /* cpu time stamp counter */ + __u64 res1; /* for future extension */ + __u64 res2; /* dito. */ + __u8 cs; /* code segment */ + __u8 bank; /* machine check bank */ + __u8 cpu; /* cpu that raised the error */ + __u8 finished; /* entry is valid */ + __u32 pad; +}; + +/* + * This structure contains all data related to the MCE log. Also + * carries a signature to make it easier to find from external + * debugging tools. Each entry is only valid when its finished flag + * is set. 
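[Editor's aside: an illustrative consumer that honours the finished-flag rule, using the layout declared just below — a sketch, not the kernel's mcelog path, with an invented example_ name:]

        static void example_dump_mce_log(struct mce_log *log)
        {
                unsigned int i;

                for (i = 0; i < log->next && i < MCE_LOG_LEN; i++) {
                        struct mce *m = &log->entry[i];

                        if (!m->finished)       /* writer has not completed this slot */
                                continue;
                        printk(KERN_INFO "MCE bank %u cpu %u status %llx addr %llx\n",
                               m->bank, m->cpu,
                               (unsigned long long)m->status,
                               (unsigned long long)m->addr);
                }
        }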
+ */ + +#define MCE_LOG_LEN 32 + +struct mce_log { + char signature[12]; /* "MACHINECHECK" */ + unsigned len; /* = MCE_LOG_LEN */ + unsigned next; + unsigned flags; + unsigned pad0; + struct mce entry[MCE_LOG_LEN]; +}; + +#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ + +#define MCE_LOG_SIGNATURE "MACHINECHECK" + +#define MCE_GET_RECORD_LEN _IOR('M', 1, int) +#define MCE_GET_LOG_LEN _IOR('M', 2, int) +#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int) + +/* Software defined banks */ +#define MCE_EXTENDED_BANK 128 +#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 + +#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ +#define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9) +#define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9) +#define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9) +#define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9) +#define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9) +#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) +#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) + +#endif /* __x86_64__ */ + +#ifdef __KERNEL__ + +#ifdef CONFIG_X86_32 +extern int mce_disabled; +#else /* CONFIG_X86_32 */ + +#include <asm/atomic.h> + +void mce_log(struct mce *m); +DECLARE_PER_CPU(struct sys_device, device_mce); +extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + +#ifdef CONFIG_X86_MCE_INTEL +void mce_intel_feature_init(struct cpuinfo_x86 *c); +#else +static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +#endif + +#ifdef CONFIG_X86_MCE_AMD +void mce_amd_feature_init(struct cpuinfo_x86 *c); +#else +static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } +#endif + +void mce_log_therm_throt_event(unsigned int cpu, __u64 status); + +extern atomic_t mce_entry; + +extern void do_machine_check(struct pt_regs *, long); +extern int mce_notify_user(void); + +#endif /* !CONFIG_X86_32 */ + + + +#ifdef CONFIG_X86_MCE +extern void mcheck_init(struct cpuinfo_x86 *c); +#else +#define mcheck_init(c) do { } while (0) +#endif +extern void stop_mce(void); +extern void restart_mce(void); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h new file mode 100644 index 000000000000..c882664716c1 --- /dev/null +++ b/arch/x86/include/asm/microcode.h @@ -0,0 +1,47 @@ +#ifndef _ASM_X86_MICROCODE_H +#define _ASM_X86_MICROCODE_H + +struct cpu_signature { + unsigned int sig; + unsigned int pf; + unsigned int rev; +}; + +struct device; + +struct microcode_ops { + int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); + int (*request_microcode_fw) (int cpu, struct device *device); + + void (*apply_microcode) (int cpu); + + int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); + void (*microcode_fini_cpu) (int cpu); +}; + +struct ucode_cpu_info { + struct cpu_signature cpu_sig; + int valid; + void *mc; +}; +extern struct ucode_cpu_info ucode_cpu_info[]; + +#ifdef CONFIG_MICROCODE_INTEL +extern struct microcode_ops * __init init_intel_microcode(void); +#else +static inline struct microcode_ops * __init init_intel_microcode(void) +{ + return NULL; +} +#endif /* CONFIG_MICROCODE_INTEL */ + +#ifdef CONFIG_MICROCODE_AMD +extern struct microcode_ops * __init init_amd_microcode(void); +#else +static inline struct microcode_ops * __init init_amd_microcode(void) +{ + return NULL; +} +#endif + +#endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/mman.h 
b/arch/x86/include/asm/mman.h new file mode 100644 index 000000000000..90bc4108a4fd --- /dev/null +++ b/arch/x86/include/asm/mman.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_MMAN_H +#define _ASM_X86_MMAN_H + +#include <asm-generic/mman.h> + +#define MAP_32BIT 0x40 /* only give out 32bit addresses */ + +#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ +#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ +#define MAP_LOCKED 0x2000 /* pages are locked */ +#define MAP_NORESERVE 0x4000 /* don't check for reservations */ +#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ +#define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ + +#define MCL_CURRENT 1 /* lock all current mappings */ +#define MCL_FUTURE 2 /* lock all future mappings */ + +#endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h new file mode 100644 index 000000000000..9b119da1d105 --- /dev/null +++ b/arch/x86/include/asm/mmconfig.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MMCONFIG_H +#define _ASM_X86_MMCONFIG_H + +#ifdef CONFIG_PCI_MMCONFIG +extern void __cpuinit fam10h_check_enable_mmcfg(void); +extern void __cpuinit check_enable_amd_mmconf_dmi(void); +#else +static inline void fam10h_check_enable_mmcfg(void) { } +static inline void check_enable_amd_mmconf_dmi(void) { } +#endif + +#endif /* _ASM_X86_MMCONFIG_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h new file mode 100644 index 000000000000..80a1dee5bea5 --- /dev/null +++ b/arch/x86/include/asm/mmu.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_MMU_H +#define _ASM_X86_MMU_H + +#include <linux/spinlock.h> +#include <linux/mutex.h> + +/* + * The x86 doesn't have a mmu context, but + * we put the segment information here. + */ +typedef struct { + void *ldt; + int size; + struct mutex lock; + void *vdso; +} mm_context_t; + +#ifdef CONFIG_SMP +void leave_mm(int cpu); +#else +static inline void leave_mm(int cpu) +{ +} +#endif + +#endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h new file mode 100644 index 000000000000..8aeeb3fd73db --- /dev/null +++ b/arch/x86/include/asm/mmu_context.h @@ -0,0 +1,37 @@ +#ifndef _ASM_X86_MMU_CONTEXT_H +#define _ASM_X86_MMU_CONTEXT_H + +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/paravirt.h> +#ifndef CONFIG_PARAVIRT +#include <asm-generic/mm_hooks.h> + +static inline void paravirt_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ +} +#endif /* !CONFIG_PARAVIRT */ + +/* + * Used for LDT copy/destruction. 
+ */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + +#ifdef CONFIG_X86_32 +# include "mmu_context_32.h" +#else +# include "mmu_context_64.h" +#endif + +#define activate_mm(prev, next) \ +do { \ + paravirt_activate_mm((prev), (next)); \ + switch_mm((prev), (next), NULL); \ +} while (0); + + +#endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h new file mode 100644 index 000000000000..8e10015781fb --- /dev/null +++ b/arch/x86/include/asm/mmu_context_32.h @@ -0,0 +1,56 @@ +#ifndef _ASM_X86_MMU_CONTEXT_32_H +#define _ASM_X86_MMU_CONTEXT_32_H + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#ifdef CONFIG_SMP + unsigned cpu = smp_processor_id(); + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; +#endif +} + +static inline void switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + int cpu = smp_processor_id(); + + if (likely(prev != next)) { + /* stop flush ipis for the previous mm */ + cpu_clear(cpu, prev->cpu_vm_mask); +#ifdef CONFIG_SMP + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; + per_cpu(cpu_tlbstate, cpu).active_mm = next; +#endif + cpu_set(cpu, next->cpu_vm_mask); + + /* Re-load page tables */ + load_cr3(next->pgd); + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_LDT_nolock(&next->context); + } +#ifdef CONFIG_SMP + else { + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; + BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload %cr3. + */ + load_cr3(next->pgd); + load_LDT_nolock(&next->context); + } + } +#endif +} + +#define deactivate_mm(tsk, mm) \ + asm("movl %0,%%gs": :"r" (0)); + +#endif /* _ASM_X86_MMU_CONTEXT_32_H */ diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h new file mode 100644 index 000000000000..677d36e9540a --- /dev/null +++ b/arch/x86/include/asm/mmu_context_64.h @@ -0,0 +1,54 @@ +#ifndef _ASM_X86_MMU_CONTEXT_64_H +#define _ASM_X86_MMU_CONTEXT_64_H + +#include <asm/pda.h> + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#ifdef CONFIG_SMP + if (read_pda(mmu_state) == TLBSTATE_OK) + write_pda(mmu_state, TLBSTATE_LAZY); +#endif +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + if (likely(prev != next)) { + /* stop flush ipis for the previous mm */ + cpu_clear(cpu, prev->cpu_vm_mask); +#ifdef CONFIG_SMP + write_pda(mmu_state, TLBSTATE_OK); + write_pda(active_mm, next); +#endif + cpu_set(cpu, next->cpu_vm_mask); + load_cr3(next->pgd); + + if (unlikely(next->context.ldt != prev->context.ldt)) + load_LDT_nolock(&next->context); + } +#ifdef CONFIG_SMP + else { + write_pda(mmu_state, TLBSTATE_OK); + if (read_pda(active_mm) != next) + BUG(); + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. 
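[Editor's aside: a condensed sketch of the leave_mm() counterpart this comment is guarding against, close to the 64-bit tlb flush code but shown as an illustration only: a lazy CPU removes itself from cpu_vm_mask, stops receiving flush IPIs, and must therefore reload %cr3 before reusing the mm:]

        void example_leave_mm(int cpu)
        {
                if (read_pda(mmu_state) == TLBSTATE_OK)
                        BUG();                  /* only legal while in lazy mode */
                cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
                load_cr3(swapper_pg_dir);       /* switch to a pgd that cannot be freed */
        }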
+ */ + load_cr3(next->pgd); + load_LDT_nolock(&next->context); + } + } +#endif +} + +#define deactivate_mm(tsk, mm) \ +do { \ + load_gs_index(0); \ + asm volatile("movl %0,%%fs"::"r"(0)); \ +} while (0) + +#endif /* _ASM_X86_MMU_CONTEXT_64_H */ diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h new file mode 100644 index 000000000000..5cbf3135b971 --- /dev/null +++ b/arch/x86/include/asm/mmx.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_MMX_H +#define _ASM_X86_MMX_H + +/* + * MMX 3Dnow! helper operations + */ + +#include <linux/types.h> + +extern void *_mmx_memcpy(void *to, const void *from, size_t size); +extern void mmx_clear_page(void *page); +extern void mmx_copy_page(void *to, void *from); + +#endif /* _ASM_X86_MMX_H */ diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h new file mode 100644 index 000000000000..64217ea16a36 --- /dev/null +++ b/arch/x86/include/asm/mmzone.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "mmzone_32.h" +#else +# include "mmzone_64.h" +#endif diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h new file mode 100644 index 000000000000..485bdf059ffb --- /dev/null +++ b/arch/x86/include/asm/mmzone_32.h @@ -0,0 +1,134 @@ +/* + * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002 + * + */ + +#ifndef _ASM_X86_MMZONE_32_H +#define _ASM_X86_MMZONE_32_H + +#include <asm/smp.h> + +#ifdef CONFIG_NUMA +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +#include <asm/numaq.h> +/* summit or generic arch */ +#include <asm/srat.h> + +extern int get_memcfg_numa_flat(void); +/* + * This allows any one NUMA architecture to be compiled + * for, and still fall back to the flat function if it + * fails. + */ +static inline void get_memcfg_numa(void) +{ + + if (get_memcfg_numaq()) + return; + if (get_memcfg_from_srat()) + return; + get_memcfg_numa_flat(); +} + +extern int early_pfn_to_nid(unsigned long pfn); + +#else /* !CONFIG_NUMA */ + +#define get_memcfg_numa get_memcfg_numa_flat + +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_DISCONTIGMEM + +/* + * generic node memory support, the following assumptions apply: + * + * 1) memory comes in 64Mb contigious chunks which are either present or not + * 2) we will not have more than 64Gb in total + * + * for now assume that 64Gb is max amount of RAM for whole system + * 64Gb / 4096bytes/page = 16777216 pages + */ +#define MAX_NR_PAGES 16777216 +#define MAX_ELEMENTS 1024 +#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) + +extern s8 physnode_map[]; + +static inline int pfn_to_nid(unsigned long pfn) +{ +#ifdef CONFIG_NUMA + return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]); +#else + return 0; +#endif +} + +/* + * Following are macros that each numa implmentation must define. + */ + +#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) +#define node_end_pfn(nid) \ +({ \ + pg_data_t *__pgdat = NODE_DATA(nid); \ + __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ +}) + +static inline int pfn_valid(int pfn) +{ + int nid = pfn_to_nid(pfn); + + if (nid >= 0) + return (pfn < node_end_pfn(nid)); + return 0; +} + +#endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_NEED_MULTIPLE_NODES + +/* + * Following are macros that are specific to this numa platform. 
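[Editor's aside on the generic node-memory constants earlier in this file: the arithmetic checks out, as this standalone user-space sketch verifies — hypothetical demo code, not part of the patch:]

        #include <stdio.h>

        int main(void)
        {
                unsigned long long max_pages = (64ULL << 30) / 4096;   /* 16777216 */
                unsigned long long per_elem  = max_pages / 1024;       /* 16384 pages */

                printf("pages=%llu per_element=%llu (=%llu MB)\n",
                       max_pages, per_elem, per_elem * 4096 >> 20);    /* 64 MB chunks */
                /* e.g. pfn 1000000 is looked up in physnode_map[1000000 / 16384] = [61] */
                printf("pfn 1000000 -> element %llu\n", 1000000ULL / per_elem);
                return 0;
        }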
+ */ +#define reserve_bootmem(addr, size, flags) \ + reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) +#define alloc_bootmem(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_nopanic(x) \ + __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ + __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) +#define alloc_bootmem_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_nopanic(x) \ + __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ + __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) +#define alloc_bootmem_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ + __pa(MAX_DMA_ADDRESS)); \ +}) +#define alloc_bootmem_pages_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ + __pa(MAX_DMA_ADDRESS)); \ +}) +#define alloc_bootmem_low_pages_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ +}) +#endif /* CONFIG_NEED_MULTIPLE_NODES */ + +#endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h new file mode 100644 index 000000000000..a5b3817d4b9e --- /dev/null +++ b/arch/x86/include/asm/mmzone_64.h @@ -0,0 +1,51 @@ +/* K8 NUMA support */ +/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */ +/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. 
*/ +#ifndef _ASM_X86_MMZONE_64_H +#define _ASM_X86_MMZONE_64_H + + +#ifdef CONFIG_NUMA + +#include <linux/mmdebug.h> + +#include <asm/smp.h> + +/* Simple perfect hash to map physical addresses to node numbers */ +struct memnode { + int shift; + unsigned int mapsize; + s16 *map; + s16 embedded_map[64 - 8]; +} ____cacheline_aligned; /* total size = 128 bytes */ +extern struct memnode memnode; +#define memnode_shift memnode.shift +#define memnodemap memnode.map +#define memnodemapsize memnode.mapsize + +extern struct pglist_data *node_data[]; + +static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) +{ + unsigned nid; + VIRTUAL_BUG_ON(!memnodemap); + nid = memnodemap[addr >> memnode_shift]; + VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); + return nid; +} + +#define NODE_DATA(nid) (node_data[nid]) + +#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ + NODE_DATA(nid)->node_spanned_pages) + +extern int early_pfn_to_nid(unsigned long pfn); + +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +#endif + +#endif +#endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h new file mode 100644 index 000000000000..47d62743c4d5 --- /dev/null +++ b/arch/x86/include/asm/module.h @@ -0,0 +1,80 @@ +#ifndef _ASM_X86_MODULE_H +#define _ASM_X86_MODULE_H + +/* x86_32/64 are simple */ +struct mod_arch_specific {}; + +#ifdef CONFIG_X86_32 +# define Elf_Shdr Elf32_Shdr +# define Elf_Sym Elf32_Sym +# define Elf_Ehdr Elf32_Ehdr +#else +# define Elf_Shdr Elf64_Shdr +# define Elf_Sym Elf64_Sym +# define Elf_Ehdr Elf64_Ehdr +#endif + +#ifdef CONFIG_X86_64 +/* X86_64 does not define MODULE_PROC_FAMILY */ +#elif defined CONFIG_M386 +#define MODULE_PROC_FAMILY "386 " +#elif defined CONFIG_M486 +#define MODULE_PROC_FAMILY "486 " +#elif defined CONFIG_M586 +#define MODULE_PROC_FAMILY "586 " +#elif defined CONFIG_M586TSC +#define MODULE_PROC_FAMILY "586TSC " +#elif defined CONFIG_M586MMX +#define MODULE_PROC_FAMILY "586MMX " +#elif defined CONFIG_MCORE2 +#define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_M686 +#define MODULE_PROC_FAMILY "686 " +#elif defined CONFIG_MPENTIUMII +#define MODULE_PROC_FAMILY "PENTIUMII " +#elif defined CONFIG_MPENTIUMIII +#define MODULE_PROC_FAMILY "PENTIUMIII " +#elif defined CONFIG_MPENTIUMM +#define MODULE_PROC_FAMILY "PENTIUMM " +#elif defined CONFIG_MPENTIUM4 +#define MODULE_PROC_FAMILY "PENTIUM4 " +#elif defined CONFIG_MK6 +#define MODULE_PROC_FAMILY "K6 " +#elif defined CONFIG_MK7 +#define MODULE_PROC_FAMILY "K7 " +#elif defined CONFIG_MK8 +#define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_X86_ELAN +#define MODULE_PROC_FAMILY "ELAN " +#elif defined CONFIG_MCRUSOE +#define MODULE_PROC_FAMILY "CRUSOE " +#elif defined CONFIG_MEFFICEON +#define MODULE_PROC_FAMILY "EFFICEON " +#elif defined CONFIG_MWINCHIPC6 +#define MODULE_PROC_FAMILY "WINCHIPC6 " +#elif defined CONFIG_MWINCHIP3D +#define MODULE_PROC_FAMILY "WINCHIP3D " +#elif defined CONFIG_MCYRIXIII +#define MODULE_PROC_FAMILY "CYRIXIII " +#elif defined CONFIG_MVIAC3_2 +#define MODULE_PROC_FAMILY "VIAC3-2 " +#elif defined CONFIG_MVIAC7 +#define MODULE_PROC_FAMILY "VIAC7 " +#elif defined CONFIG_MGEODEGX1 +#define MODULE_PROC_FAMILY "GEODEGX1 " +#elif defined CONFIG_MGEODE_LX +#define MODULE_PROC_FAMILY "GEODE " +#else +#error unknown processor family +#endif + +#ifdef CONFIG_X86_32 +# ifdef 
CONFIG_4KSTACKS +# define MODULE_STACKSIZE "4KSTACKS " +# else +# define MODULE_STACKSIZE "" +# endif +# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE +#endif + +#endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h new file mode 100644 index 000000000000..91885c28f66b --- /dev/null +++ b/arch/x86/include/asm/mpspec.h @@ -0,0 +1,145 @@ +#ifndef _ASM_X86_MPSPEC_H +#define _ASM_X86_MPSPEC_H + +#include <linux/init.h> + +#include <asm/mpspec_def.h> + +extern int apic_version[MAX_APICS]; + +#ifdef CONFIG_X86_32 +#include <mach_mpspec.h> + +extern unsigned int def_to_bigsmp; +extern u8 apicid_2_node[]; +extern int pic_mode; + +#ifdef CONFIG_X86_NUMAQ +extern int mp_bus_id_to_node[MAX_MP_BUSSES]; +extern int mp_bus_id_to_local[MAX_MP_BUSSES]; +extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +#endif + +#define MAX_APICID 256 + +#else + +#define MAX_MP_BUSSES 256 +/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */ +#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) + +#endif + +extern void early_find_smp_config(void); +extern void early_get_smp_config(void); + +#if defined(CONFIG_MCA) || defined(CONFIG_EISA) +extern int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +extern unsigned int boot_cpu_physical_apicid; +extern unsigned int max_physical_apicid; +extern int smp_found_config; +extern int mpc_default_type; +extern unsigned long mp_lapic_addr; + +extern void find_smp_config(void); +extern void get_smp_config(void); +#ifdef CONFIG_X86_MPPARSE +extern void early_reserve_e820_mpc_new(void); +#else +static inline void early_reserve_e820_mpc_new(void) { } +#endif + +void __cpuinit generic_processor_info(int apicid, int version); +#ifdef CONFIG_ACPI +extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); +extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, + u32 gsi); +extern void mp_config_acpi_legacy_irqs(void); +extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +#ifdef CONFIG_X86_IO_APIC +extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, + u32 gsi, int triggering, int polarity); +#else +static inline int +mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, + u32 gsi, int triggering, int polarity) +{ + return 0; +} +#endif +#endif /* CONFIG_ACPI */ + +#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) + +struct physid_mask { + unsigned long mask[PHYSID_ARRAY_SIZE]; +}; + +typedef struct physid_mask physid_mask_t; + +#define physid_set(physid, map) set_bit(physid, (map).mask) +#define physid_clear(physid, map) clear_bit(physid, (map).mask) +#define physid_isset(physid, map) test_bit(physid, (map).mask) +#define physid_test_and_set(physid, map) \ + test_and_set_bit(physid, (map).mask) + +#define physids_and(dst, src1, src2) \ + bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + +#define physids_or(dst, src1, src2) \ + bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + +#define physids_clear(map) \ + bitmap_zero((map).mask, MAX_APICS) + +#define physids_complement(dst, src) \ + bitmap_complement((dst).mask, (src).mask, MAX_APICS) + +#define physids_empty(map) \ + bitmap_empty((map).mask, MAX_APICS) + +#define physids_equal(map1, map2) \ + bitmap_equal((map1).mask, (map2).mask, MAX_APICS) + +#define physids_weight(map) \ + bitmap_weight((map).mask, MAX_APICS) + +#define physids_shift_right(d, s, n) \ + bitmap_shift_right((d).mask, 
(s).mask, n, MAX_APICS) + +#define physids_shift_left(d, s, n) \ + bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) + +#define physids_coerce(map) ((map).mask[0]) + +#define physids_promote(physids) \ + ({ \ + physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ + __physid_mask.mask[0] = physids; \ + __physid_mask; \ + }) + +/* Note: will create very large stack frames if physid_mask_t is big */ +#define physid_mask_of_physid(physid) \ + ({ \ + physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ + physid_set(physid, __physid_mask); \ + __physid_mask; \ + }) + +static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) +{ + physids_clear(*map); + physid_set(physid, *map); +} + +#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } +#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } + +extern physid_mask_t phys_cpu_present_map; + +#endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h new file mode 100644 index 000000000000..e3ace7d1d35d --- /dev/null +++ b/arch/x86/include/asm/mpspec_def.h @@ -0,0 +1,180 @@ +#ifndef _ASM_X86_MPSPEC_DEF_H +#define _ASM_X86_MPSPEC_DEF_H + +/* + * Structure definitions for SMP machines following the + * Intel Multiprocessing Specification 1.1 and 1.4. + */ + +/* + * This tag identifies where the SMP configuration + * information is. + */ + +#define SMP_MAGIC_IDENT (('_'<<24) | ('P'<<16) | ('M'<<8) | '_') + +#ifdef CONFIG_X86_32 +# define MAX_MPC_ENTRY 1024 +# define MAX_APICS 256 +#else +# if NR_CPUS <= 255 +# define MAX_APICS 255 +# else +# define MAX_APICS 32768 +# endif +#endif + +struct intel_mp_floating { + char mpf_signature[4]; /* "_MP_" */ + unsigned int mpf_physptr; /* Configuration table address */ + unsigned char mpf_length; /* Our length (paragraphs) */ + unsigned char mpf_specification; /* Specification version */ + unsigned char mpf_checksum; /* Checksum (makes sum 0) */ + unsigned char mpf_feature1; /* Standard or configuration? */ + unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */ + unsigned char mpf_feature3; /* Unused (0) */ + unsigned char mpf_feature4; /* Unused (0) */ + unsigned char mpf_feature5; /* Unused (0) */ +}; + +#define MPC_SIGNATURE "PCMP" + +struct mp_config_table { + char mpc_signature[4]; + unsigned short mpc_length; /* Size of table */ + char mpc_spec; /* 0x01 */ + char mpc_checksum; + char mpc_oem[8]; + char mpc_productid[12]; + unsigned int mpc_oemptr; /* 0 if not present */ + unsigned short mpc_oemsize; /* 0 if not present */ + unsigned short mpc_oemcount; + unsigned int mpc_lapic; /* APIC address */ + unsigned int reserved; +}; + +/* Followed by entries */ + +#define MP_PROCESSOR 0 +#define MP_BUS 1 +#define MP_IOAPIC 2 +#define MP_INTSRC 3 +#define MP_LINTSRC 4 +/* Used by IBM NUMA-Q to describe node locality */ +#define MP_TRANSLATION 192 + +#define CPU_ENABLED 1 /* Processor is available */ +#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */ + +#define CPU_STEPPING_MASK 0x000F +#define CPU_MODEL_MASK 0x00F0 +#define CPU_FAMILY_MASK 0x0F00 + +struct mpc_config_processor { + unsigned char mpc_type; + unsigned char mpc_apicid; /* Local APIC number */ + unsigned char mpc_apicver; /* Its version */ + unsigned char mpc_cpuflag; + unsigned int mpc_cpufeature; + unsigned int mpc_featureflag; /* CPUID feature value */ + unsigned int mpc_reserved[2]; +}; + +struct mpc_config_bus { + unsigned char mpc_type; + unsigned char mpc_busid; + unsigned char mpc_bustype[6]; +}; + +/* List of Bus Type string values, Intel MP Spec.
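+ * (These are blank-padded 6-byte ASCII tags matched against the mpc_bustype[6] field of struct mpc_config_bus above; an illustrative check, not code from this file, would be memcmp(bus->mpc_bustype, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0.)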
*/ +#define BUSTYPE_EISA "EISA" +#define BUSTYPE_ISA "ISA" +#define BUSTYPE_INTERN "INTERN" /* Internal BUS */ +#define BUSTYPE_MCA "MCA" +#define BUSTYPE_VL "VL" /* Local bus */ +#define BUSTYPE_PCI "PCI" +#define BUSTYPE_PCMCIA "PCMCIA" +#define BUSTYPE_CBUS "CBUS" +#define BUSTYPE_CBUSII "CBUSII" +#define BUSTYPE_FUTURE "FUTURE" +#define BUSTYPE_MBI "MBI" +#define BUSTYPE_MBII "MBII" +#define BUSTYPE_MPI "MPI" +#define BUSTYPE_MPSA "MPSA" +#define BUSTYPE_NUBUS "NUBUS" +#define BUSTYPE_TC "TC" +#define BUSTYPE_VME "VME" +#define BUSTYPE_XPRESS "XPRESS" + +#define MPC_APIC_USABLE 0x01 + +struct mpc_config_ioapic { + unsigned char mpc_type; + unsigned char mpc_apicid; + unsigned char mpc_apicver; + unsigned char mpc_flags; + unsigned int mpc_apicaddr; +}; + +struct mpc_config_intsrc { + unsigned char mpc_type; + unsigned char mpc_irqtype; + unsigned short mpc_irqflag; + unsigned char mpc_srcbus; + unsigned char mpc_srcbusirq; + unsigned char mpc_dstapic; + unsigned char mpc_dstirq; +}; + +enum mp_irq_source_types { + mp_INT = 0, + mp_NMI = 1, + mp_SMI = 2, + mp_ExtINT = 3 +}; + +#define MP_IRQDIR_DEFAULT 0 +#define MP_IRQDIR_HIGH 1 +#define MP_IRQDIR_LOW 3 + +#define MP_APIC_ALL 0xFF + +struct mpc_config_lintsrc { + unsigned char mpc_type; + unsigned char mpc_irqtype; + unsigned short mpc_irqflag; + unsigned char mpc_srcbusid; + unsigned char mpc_srcbusirq; + unsigned char mpc_destapic; + unsigned char mpc_destapiclint; +}; + +#define MPC_OEM_SIGNATURE "_OEM" + +struct mp_config_oemtable { + char oem_signature[4]; + unsigned short oem_length; /* Size of table */ + char oem_rev; /* 0x01 */ + char oem_checksum; + char mpc_oem[8]; +}; + +/* + * Default configurations + * + * 1 2 CPU ISA 82489DX + * 2 2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining + * 3 2 CPU EISA 82489DX + * 4 2 CPU MCA 82489DX + * 5 2 CPU ISA+PCI + * 6 2 CPU EISA+PCI + * 7 2 CPU MCA+PCI + */ + +enum mp_bustype { + MP_BUS_ISA = 1, + MP_BUS_EISA, + MP_BUS_PCI, + MP_BUS_MCA, +}; +#endif /* _ASM_X86_MPSPEC_DEF_H */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h new file mode 100644 index 000000000000..7e4e9481f51c --- /dev/null +++ b/arch/x86/include/asm/msgbuf.h @@ -0,0 +1,39 @@ +#ifndef _ASM_X86_MSGBUF_H +#define _ASM_X86_MSGBUF_H + +/* + * The msqid64_ds structure for i386 architecture. + * Note extra padding because this structure is passed back and forth + * between kernel and user space. 
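+ * (The goal is ABI stability: the structure is sized for the future fields now, so later kernels can fill them in without changing sizeof(struct msqid64_ds).)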
+ * + * Pad space on i386 is left for: + * - 64-bit time_t to solve y2038 problem + * - 2 miscellaneous 32-bit values + * + * Pad space on x86-64 is left for: + * - 2 miscellaneous 64-bit values + */ +struct msqid64_ds { + struct ipc64_perm msg_perm; + __kernel_time_t msg_stime; /* last msgsnd time */ +#ifdef __i386__ + unsigned long __unused1; +#endif + __kernel_time_t msg_rtime; /* last msgrcv time */ +#ifdef __i386__ + unsigned long __unused2; +#endif + __kernel_time_t msg_ctime; /* last change time */ +#ifdef __i386__ + unsigned long __unused3; +#endif + unsigned long msg_cbytes; /* current number of bytes on queue */ + unsigned long msg_qnum; /* number of messages in queue */ + unsigned long msg_qbytes; /* max number of bytes on queue */ + __kernel_pid_t msg_lspid; /* pid of last msgsnd */ + __kernel_pid_t msg_lrpid; /* last receive pid */ + unsigned long __unused4; + unsigned long __unused5; +}; + +#endif /* _ASM_X86_MSGBUF_H */ diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h new file mode 100644 index 000000000000..6706b3006f13 --- /dev/null +++ b/arch/x86/include/asm/msidef.h @@ -0,0 +1,55 @@ +#ifndef _ASM_X86_MSIDEF_H +#define _ASM_X86_MSIDEF_H + +/* + * Constants for Intel APIC based MSI messages. + */ + +/* + * Shifts for MSI data + */ + +#define MSI_DATA_VECTOR_SHIFT 0 +#define MSI_DATA_VECTOR_MASK 0x000000ff +#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \ + MSI_DATA_VECTOR_MASK) + +#define MSI_DATA_DELIVERY_MODE_SHIFT 8 +#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) +#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) + +#define MSI_DATA_LEVEL_SHIFT 14 +#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) +#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) + +#define MSI_DATA_TRIGGER_SHIFT 15 +#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) +#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) + +/* + * Shift/mask fields for MSI address + */ + +#define MSI_ADDR_BASE_HI 0 +#define MSI_ADDR_BASE_LO 0xfee00000 + +#define MSI_ADDR_DEST_MODE_SHIFT 2 +#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT) +#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT) + +#define MSI_ADDR_REDIRECTION_SHIFT 3 +#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) + /* dedicated cpu */ +#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) + /* lowest priority */ + +#define MSI_ADDR_DEST_ID_SHIFT 12 +#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 +#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ + MSI_ADDR_DEST_ID_MASK) + +#define MSI_ADDR_IR_EXT_INT (1 << 4) +#define MSI_ADDR_IR_SHV (1 << 3) +#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13) +#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5) +#endif /* _ASM_X86_MSIDEF_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h new file mode 100644 index 000000000000..e38859d577a1 --- /dev/null +++ b/arch/x86/include/asm/msr-index.h @@ -0,0 +1,332 @@ +#ifndef _ASM_X86_MSR_INDEX_H +#define _ASM_X86_MSR_INDEX_H + +/* CPU model specific register (MSR) numbers */ + +/* x86-64 specific MSRs */ +#define MSR_EFER 0xc0000080 /* extended feature register */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ +#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ +#define MSR_FS_BASE
0xc0000100 /* 64bit FS base */ +#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ +#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ + +/* EFER bits: */ +#define _EFER_SCE 0 /* SYSCALL/SYSRET */ +#define _EFER_LME 8 /* Long mode enable */ +#define _EFER_LMA 10 /* Long mode active (read-only) */ +#define _EFER_NX 11 /* No execute enable */ + +#define EFER_SCE (1<<_EFER_SCE) +#define EFER_LME (1<<_EFER_LME) +#define EFER_LMA (1<<_EFER_LMA) +#define EFER_NX (1<<_EFER_NX) + +/* Intel MSRs. Some also available on other CPUs */ +#define MSR_IA32_PERFCTR0 0x000000c1 +#define MSR_IA32_PERFCTR1 0x000000c2 +#define MSR_FSB_FREQ 0x000000cd + +#define MSR_MTRRcap 0x000000fe +#define MSR_IA32_BBL_CR_CTL 0x00000119 + +#define MSR_IA32_SYSENTER_CS 0x00000174 +#define MSR_IA32_SYSENTER_ESP 0x00000175 +#define MSR_IA32_SYSENTER_EIP 0x00000176 + +#define MSR_IA32_MCG_CAP 0x00000179 +#define MSR_IA32_MCG_STATUS 0x0000017a +#define MSR_IA32_MCG_CTL 0x0000017b + +#define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_IA32_DS_AREA 0x00000600 +#define MSR_IA32_PERF_CAPABILITIES 0x00000345 + +#define MSR_MTRRfix64K_00000 0x00000250 +#define MSR_MTRRfix16K_80000 0x00000258 +#define MSR_MTRRfix16K_A0000 0x00000259 +#define MSR_MTRRfix4K_C0000 0x00000268 +#define MSR_MTRRfix4K_C8000 0x00000269 +#define MSR_MTRRfix4K_D0000 0x0000026a +#define MSR_MTRRfix4K_D8000 0x0000026b +#define MSR_MTRRfix4K_E0000 0x0000026c +#define MSR_MTRRfix4K_E8000 0x0000026d +#define MSR_MTRRfix4K_F0000 0x0000026e +#define MSR_MTRRfix4K_F8000 0x0000026f +#define MSR_MTRRdefType 0x000002ff + +#define MSR_IA32_CR_PAT 0x00000277 + +#define MSR_IA32_DEBUGCTLMSR 0x000001d9 +#define MSR_IA32_LASTBRANCHFROMIP 0x000001db +#define MSR_IA32_LASTBRANCHTOIP 0x000001dc +#define MSR_IA32_LASTINTFROMIP 0x000001dd +#define MSR_IA32_LASTINTTOIP 0x000001de + +/* DEBUGCTLMSR bits (others vary by model): */ +#define _DEBUGCTLMSR_LBR 0 /* last branch recording */ +#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */ + +#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR) +#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF) + +#define MSR_IA32_MC0_CTL 0x00000400 +#define MSR_IA32_MC0_STATUS 0x00000401 +#define MSR_IA32_MC0_ADDR 0x00000402 +#define MSR_IA32_MC0_MISC 0x00000403 + +#define MSR_P6_PERFCTR0 0x000000c1 +#define MSR_P6_PERFCTR1 0x000000c2 +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +/* AMD64 MSRs. Not complete. See the architecture manual for a more + complete list. 
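+ (As orientation: the 0xc0011030-0xc001103a block below is the Family 10h Instruction Based Sampling (IBS) register file, hence the IBSFETCH* and IBSOP* names.)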
*/ + +#define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_IBSFETCHCTL 0xc0011030 +#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 +#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSOPCTL 0xc0011033 +#define MSR_AMD64_IBSOPRIP 0xc0011034 +#define MSR_AMD64_IBSOPDATA 0xc0011035 +#define MSR_AMD64_IBSOPDATA2 0xc0011036 +#define MSR_AMD64_IBSOPDATA3 0xc0011037 +#define MSR_AMD64_IBSDCLINAD 0xc0011038 +#define MSR_AMD64_IBSDCPHYSAD 0xc0011039 +#define MSR_AMD64_IBSCTL 0xc001103a + +/* Fam 10h MSRs */ +#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 +#define FAM10H_MMIO_CONF_ENABLE (1<<0) +#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf +#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 +#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff +#define FAM10H_MMIO_CONF_BASE_SHIFT 20 + +/* K8 MSRs */ +#define MSR_K8_TOP_MEM1 0xc001001a +#define MSR_K8_TOP_MEM2 0xc001001d +#define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_HWCR 0xc0010015 +#define MSR_K8_INT_PENDING_MSG 0xc0010055 +/* C1E active bits in int pending message */ +#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 +#define MSR_K8_TSEG_ADDR 0xc0010112 +#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */ +#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */ +#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */ + +/* K7 MSRs */ +#define MSR_K7_EVNTSEL0 0xc0010000 +#define MSR_K7_PERFCTR0 0xc0010004 +#define MSR_K7_EVNTSEL1 0xc0010001 +#define MSR_K7_PERFCTR1 0xc0010005 +#define MSR_K7_EVNTSEL2 0xc0010002 +#define MSR_K7_PERFCTR2 0xc0010006 +#define MSR_K7_EVNTSEL3 0xc0010003 +#define MSR_K7_PERFCTR3 0xc0010007 +#define MSR_K7_CLK_CTL 0xc001001b +#define MSR_K7_HWCR 0xc0010015 +#define MSR_K7_FID_VID_CTL 0xc0010041 +#define MSR_K7_FID_VID_STATUS 0xc0010042 + +/* K6 MSRs */ +#define MSR_K6_EFER 0xc0000080 +#define MSR_K6_STAR 0xc0000081 +#define MSR_K6_WHCR 0xc0000082 +#define MSR_K6_UWCCR 0xc0000085 +#define MSR_K6_EPMR 0xc0000086 +#define MSR_K6_PSOR 0xc0000087 +#define MSR_K6_PFIR 0xc0000088 + +/* Centaur-Hauls/IDT defined MSRs. */ +#define MSR_IDT_FCR1 0x00000107 +#define MSR_IDT_FCR2 0x00000108 +#define MSR_IDT_FCR3 0x00000109 +#define MSR_IDT_FCR4 0x0000010a + +#define MSR_IDT_MCR0 0x00000110 +#define MSR_IDT_MCR1 0x00000111 +#define MSR_IDT_MCR2 0x00000112 +#define MSR_IDT_MCR3 0x00000113 +#define MSR_IDT_MCR4 0x00000114 +#define MSR_IDT_MCR5 0x00000115 +#define MSR_IDT_MCR6 0x00000116 +#define MSR_IDT_MCR7 0x00000117 +#define MSR_IDT_MCR_CTRL 0x00000120 + +/* VIA Cyrix defined MSRs*/ +#define MSR_VIA_FCR 0x00001107 +#define MSR_VIA_LONGHAUL 0x0000110a +#define MSR_VIA_RNG 0x0000110b +#define MSR_VIA_BCR2 0x00001147 + +/* Transmeta defined MSRs */ +#define MSR_TMTA_LONGRUN_CTRL 0x80868010 +#define MSR_TMTA_LONGRUN_FLAGS 0x80868011 +#define MSR_TMTA_LRTI_READOUT 0x80868018 +#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a + +/* Intel defined MSRs. 
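+ (Worked decode, as a sketch: the local APIC base is val & MSR_IA32_APICBASE_BASE, where val is read from MSR_IA32_APICBASE below; bit 11 is the global APIC enable and bit 8 marks the boot-strap processor.)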
*/ +#define MSR_IA32_P5_MC_ADDR 0x00000000 +#define MSR_IA32_P5_MC_TYPE 0x00000001 +#define MSR_IA32_TSC 0x00000010 +#define MSR_IA32_PLATFORM_ID 0x00000017 +#define MSR_IA32_EBL_CR_POWERON 0x0000002a +#define MSR_IA32_FEATURE_CONTROL 0x0000003a + +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) + +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) + +#define MSR_IA32_UCODE_WRITE 0x00000079 +#define MSR_IA32_UCODE_REV 0x0000008b + +#define MSR_IA32_PERF_STATUS 0x00000198 +#define MSR_IA32_PERF_CTL 0x00000199 + +#define MSR_IA32_MPERF 0x000000e7 +#define MSR_IA32_APERF 0x000000e8 + +#define MSR_IA32_THERM_CONTROL 0x0000019a +#define MSR_IA32_THERM_INTERRUPT 0x0000019b +#define MSR_IA32_THERM_STATUS 0x0000019c +#define MSR_IA32_MISC_ENABLE 0x000001a0 + +/* Intel Model 6 */ +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +/* P4/Xeon+ specific */ +#define MSR_IA32_MCG_EAX 0x00000180 +#define MSR_IA32_MCG_EBX 0x00000181 +#define MSR_IA32_MCG_ECX 0x00000182 +#define MSR_IA32_MCG_EDX 0x00000183 +#define MSR_IA32_MCG_ESI 0x00000184 +#define MSR_IA32_MCG_EDI 0x00000185 +#define MSR_IA32_MCG_EBP 0x00000186 +#define MSR_IA32_MCG_ESP 0x00000187 +#define MSR_IA32_MCG_EFLAGS 0x00000188 +#define MSR_IA32_MCG_EIP 0x00000189 +#define MSR_IA32_MCG_RESERVED 0x0000018a + +/* Pentium IV performance counter MSRs */ +#define MSR_P4_BPU_PERFCTR0 0x00000300 +#define MSR_P4_BPU_PERFCTR1 0x00000301 +#define MSR_P4_BPU_PERFCTR2 0x00000302 +#define MSR_P4_BPU_PERFCTR3 0x00000303 +#define MSR_P4_MS_PERFCTR0 0x00000304 +#define MSR_P4_MS_PERFCTR1 0x00000305 +#define MSR_P4_MS_PERFCTR2 0x00000306 +#define MSR_P4_MS_PERFCTR3 0x00000307 +#define MSR_P4_FLAME_PERFCTR0 0x00000308 +#define MSR_P4_FLAME_PERFCTR1 0x00000309 +#define MSR_P4_FLAME_PERFCTR2 0x0000030a +#define MSR_P4_FLAME_PERFCTR3 0x0000030b +#define MSR_P4_IQ_PERFCTR0 0x0000030c +#define MSR_P4_IQ_PERFCTR1 0x0000030d +#define MSR_P4_IQ_PERFCTR2 0x0000030e +#define MSR_P4_IQ_PERFCTR3 0x0000030f +#define MSR_P4_IQ_PERFCTR4 0x00000310 +#define MSR_P4_IQ_PERFCTR5 0x00000311 +#define MSR_P4_BPU_CCCR0 0x00000360 +#define MSR_P4_BPU_CCCR1 0x00000361 +#define MSR_P4_BPU_CCCR2 0x00000362 +#define MSR_P4_BPU_CCCR3 0x00000363 +#define MSR_P4_MS_CCCR0 0x00000364 +#define MSR_P4_MS_CCCR1 0x00000365 +#define MSR_P4_MS_CCCR2 0x00000366 +#define MSR_P4_MS_CCCR3 0x00000367 +#define MSR_P4_FLAME_CCCR0 0x00000368 +#define MSR_P4_FLAME_CCCR1 0x00000369 +#define MSR_P4_FLAME_CCCR2 0x0000036a +#define MSR_P4_FLAME_CCCR3 0x0000036b +#define MSR_P4_IQ_CCCR0 0x0000036c +#define MSR_P4_IQ_CCCR1 0x0000036d +#define MSR_P4_IQ_CCCR2 0x0000036e +#define MSR_P4_IQ_CCCR3 0x0000036f +#define MSR_P4_IQ_CCCR4 0x00000370 +#define MSR_P4_IQ_CCCR5 0x00000371 +#define MSR_P4_ALF_ESCR0 0x000003ca +#define MSR_P4_ALF_ESCR1 0x000003cb +#define MSR_P4_BPU_ESCR0 0x000003b2 +#define MSR_P4_BPU_ESCR1 0x000003b3 +#define MSR_P4_BSU_ESCR0 0x000003a0 +#define MSR_P4_BSU_ESCR1 0x000003a1 +#define MSR_P4_CRU_ESCR0 0x000003b8 +#define MSR_P4_CRU_ESCR1 0x000003b9 +#define MSR_P4_CRU_ESCR2 0x000003cc +#define MSR_P4_CRU_ESCR3 0x000003cd +#define MSR_P4_CRU_ESCR4 0x000003e0 +#define MSR_P4_CRU_ESCR5 0x000003e1 +#define MSR_P4_DAC_ESCR0 0x000003a8 +#define MSR_P4_DAC_ESCR1 0x000003a9 +#define MSR_P4_FIRM_ESCR0 0x000003a4 +#define MSR_P4_FIRM_ESCR1 0x000003a5 +#define MSR_P4_FLAME_ESCR0 0x000003a6 +#define MSR_P4_FLAME_ESCR1 0x000003a7 +#define 
MSR_P4_FSB_ESCR0 0x000003a2 +#define MSR_P4_FSB_ESCR1 0x000003a3 +#define MSR_P4_IQ_ESCR0 0x000003ba +#define MSR_P4_IQ_ESCR1 0x000003bb +#define MSR_P4_IS_ESCR0 0x000003b4 +#define MSR_P4_IS_ESCR1 0x000003b5 +#define MSR_P4_ITLB_ESCR0 0x000003b6 +#define MSR_P4_ITLB_ESCR1 0x000003b7 +#define MSR_P4_IX_ESCR0 0x000003c8 +#define MSR_P4_IX_ESCR1 0x000003c9 +#define MSR_P4_MOB_ESCR0 0x000003aa +#define MSR_P4_MOB_ESCR1 0x000003ab +#define MSR_P4_MS_ESCR0 0x000003c0 +#define MSR_P4_MS_ESCR1 0x000003c1 +#define MSR_P4_PMH_ESCR0 0x000003ac +#define MSR_P4_PMH_ESCR1 0x000003ad +#define MSR_P4_RAT_ESCR0 0x000003bc +#define MSR_P4_RAT_ESCR1 0x000003bd +#define MSR_P4_SAAT_ESCR0 0x000003ae +#define MSR_P4_SAAT_ESCR1 0x000003af +#define MSR_P4_SSU_ESCR0 0x000003be +#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */ + +#define MSR_P4_TBPU_ESCR0 0x000003c2 +#define MSR_P4_TBPU_ESCR1 0x000003c3 +#define MSR_P4_TC_ESCR0 0x000003c4 +#define MSR_P4_TC_ESCR1 0x000003c5 +#define MSR_P4_U2L_ESCR0 0x000003b0 +#define MSR_P4_U2L_ESCR1 0x000003b1 + +/* Intel Core-based CPU performance counters */ +#define MSR_CORE_PERF_FIXED_CTR0 0x00000309 +#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a +#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b +#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d +#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e +#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 + +/* Geode defined MSRs */ +#define MSR_GEODE_BUSCONT_CONF0 0x00001900 + +/* Intel VT MSRs */ +#define MSR_IA32_VMX_BASIC 0x00000480 +#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481 +#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482 +#define MSR_IA32_VMX_EXIT_CTLS 0x00000483 +#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484 +#define MSR_IA32_VMX_MISC 0x00000485 +#define MSR_IA32_VMX_CR0_FIXED0 0x00000486 +#define MSR_IA32_VMX_CR0_FIXED1 0x00000487 +#define MSR_IA32_VMX_CR4_FIXED0 0x00000488 +#define MSR_IA32_VMX_CR4_FIXED1 0x00000489 +#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a +#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b +#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c + +#endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h new file mode 100644 index 000000000000..46be2fa7ac26 --- /dev/null +++ b/arch/x86/include/asm/msr.h @@ -0,0 +1,247 @@ +#ifndef _ASM_X86_MSR_H +#define _ASM_X86_MSR_H + +#include <asm/msr-index.h> + +#ifndef __ASSEMBLY__ +# include <linux/types.h> +#endif + +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ + +#include <asm/asm.h> +#include <asm/errno.h> + +static inline unsigned long long native_read_tscp(unsigned int *aux) +{ + unsigned long low, high; + asm volatile(".byte 0x0f,0x01,0xf9" + : "=a" (low), "=d" (high), "=c" (*aux)); + return low | ((u64)high << 32); +} + +/* + * i386 calling convention returns 64-bit value in edx:eax, while + * x86_64 returns at rax. 
Also, the "A" constraint does not really + * mean rdx:rax in x86_64, so we need specialized behaviour for each + * architecture + */ +#ifdef CONFIG_X86_64 +#define DECLARE_ARGS(val, low, high) unsigned low, high +#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) +#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) +#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) +#else +#define DECLARE_ARGS(val, low, high) unsigned long long val +#define EAX_EDX_VAL(val, low, high) (val) +#define EAX_EDX_ARGS(val, low, high) "A" (val) +#define EAX_EDX_RET(val, low, high) "=A" (val) +#endif + +static inline unsigned long long native_read_msr(unsigned int msr) +{ + DECLARE_ARGS(val, low, high); + + asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr)); + return EAX_EDX_VAL(val, low, high); +} + +static inline unsigned long long native_read_msr_safe(unsigned int msr, + int *err) +{ + DECLARE_ARGS(val, low, high); + + asm volatile("2: rdmsr ; xor %[err],%[err]\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %[fault],%[err] ; jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(2b, 3b) + : [err] "=r" (*err), EAX_EDX_RET(val, low, high) + : "c" (msr), [fault] "i" (-EFAULT)); + return EAX_EDX_VAL(val, low, high); +} + +static inline unsigned long long native_read_msr_amd_safe(unsigned int msr, + int *err) +{ + DECLARE_ARGS(val, low, high); + + asm volatile("2: rdmsr ; xor %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %3,%0 ; jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(2b, 3b) + : "=r" (*err), EAX_EDX_RET(val, low, high) + : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT)); + return EAX_EDX_VAL(val, low, high); +} + +static inline void native_write_msr(unsigned int msr, + unsigned low, unsigned high) +{ + asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +static inline int native_write_msr_safe(unsigned int msr, + unsigned low, unsigned high) +{ + int err; + asm volatile("2: wrmsr ; xor %[err],%[err]\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %[fault],%[err] ; jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(2b, 3b) + : [err] "=a" (err) + : "c" (msr), "0" (low), "d" (high), + [fault] "i" (-EFAULT) + : "memory"); + return err; +} + +extern unsigned long long native_read_tsc(void); + +static __always_inline unsigned long long __native_read_tsc(void) +{ + DECLARE_ARGS(val, low, high); + + rdtsc_barrier(); + asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); + rdtsc_barrier(); + + return EAX_EDX_VAL(val, low, high); +} + +static inline unsigned long long native_read_pmc(int counter) +{ + DECLARE_ARGS(val, low, high); + + asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); + return EAX_EDX_VAL(val, low, high); +} + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#include <linux/errno.h> +/* + * Access to machine-specific registers (available on 586 and better only) + * Note: the rd* operations modify the parameters directly (without using + * pointer indirection), this allows gcc to optimize better + */ + +#define rdmsr(msr, val1, val2) \ +do { \ + u64 __val = native_read_msr((msr)); \ + (val1) = (u32)__val; \ + (val2) = (u32)(__val >> 32); \ +} while (0) + +static inline void wrmsr(unsigned msr, unsigned low, unsigned high) +{ + native_write_msr(msr, low, high); +} + +#define rdmsrl(msr, val) \ + ((val) = native_read_msr((msr))) + +#define wrmsrl(msr, val) \ + native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) + +/* wrmsr with exception handling */ +static inline int 
wrmsr_safe(unsigned msr, unsigned low, unsigned high) +{ + return native_write_msr_safe(msr, low, high); +} + +/* rdmsr with exception handling */ +#define rdmsr_safe(msr, p1, p2) \ +({ \ + int __err; \ + u64 __val = native_read_msr_safe((msr), &__err); \ + (*p1) = (u32)__val; \ + (*p2) = (u32)(__val >> 32); \ + __err; \ +}) + +static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) +{ + int err; + + *p = native_read_msr_safe(msr, &err); + return err; +} +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + int err; + + *p = native_read_msr_amd_safe(msr, &err); + return err; +} + +#define rdtscl(low) \ + ((low) = (u32)native_read_tsc()) + +#define rdtscll(val) \ + ((val) = native_read_tsc()) + +#define rdpmc(counter, low, high) \ +do { \ + u64 _l = native_read_pmc((counter)); \ + (low) = (u32)_l; \ + (high) = (u32)(_l >> 32); \ +} while (0) + +#define rdtscp(low, high, aux) \ +do { \ + unsigned long long _val = native_read_tscp(&(aux)); \ + (low) = (u32)_val; \ + (high) = (u32)(_val >> 32); \ +} while (0) + +#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) + +#endif /* !CONFIG_PARAVIRT */ + + +#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ + (u32)((val) >> 32)) + +#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2)) + +#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) + +#ifdef CONFIG_SMP +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); +int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +#else /* CONFIG_SMP */ +static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + rdmsr(msr_no, *l, *h); + return 0; +} +static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + wrmsr(msr_no, l, h); + return 0; +} +static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, + u32 *l, u32 *h) +{ + return rdmsr_safe(msr_no, l, h); +} +static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + return wrmsr_safe(msr_no, l, h); +} +#endif /* CONFIG_SMP */ +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + + +#endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h new file mode 100644 index 000000000000..7c1e4258b31e --- /dev/null +++ b/arch/x86/include/asm/mtrr.h @@ -0,0 +1,173 @@ +/* Generic MTRR (Memory Type Range Register) ioctls. + + Copyright (C) 1997-1999 Richard Gooch + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. 
+*/ +#ifndef _ASM_X86_MTRR_H +#define _ASM_X86_MTRR_H + +#include <linux/ioctl.h> +#include <linux/errno.h> + +#define MTRR_IOCTL_BASE 'M' + +struct mtrr_sentry { + unsigned long base; /* Base address */ + unsigned int size; /* Size of region */ + unsigned int type; /* Type of region */ +}; + +/* Warning: this structure has a different order from i386 + on x86-64. The 32bit emulation code takes care of that. + But you need to use this for 64bit, otherwise your X server + will break. */ + +#ifdef __i386__ +struct mtrr_gentry { + unsigned int regnum; /* Register number */ + unsigned long base; /* Base address */ + unsigned int size; /* Size of region */ + unsigned int type; /* Type of region */ +}; + +#else /* __i386__ */ + +struct mtrr_gentry { + unsigned long base; /* Base address */ + unsigned int size; /* Size of region */ + unsigned int regnum; /* Register number */ + unsigned int type; /* Type of region */ +}; +#endif /* !__i386__ */ + +/* These are the various ioctls */ +#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) +#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) +#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry) +#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry) +#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry) +#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry) +#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry) +#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry) +#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) +#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) + +/* These are the region types */ +#define MTRR_TYPE_UNCACHABLE 0 +#define MTRR_TYPE_WRCOMB 1 +/*#define MTRR_TYPE_ 2*/ +/*#define MTRR_TYPE_ 3*/ +#define MTRR_TYPE_WRTHROUGH 4 +#define MTRR_TYPE_WRPROT 5 +#define MTRR_TYPE_WRBACK 6 +#define MTRR_NUM_TYPES 7 + +#ifdef __KERNEL__ + +/* The following functions are for use by other drivers */ +# ifdef CONFIG_MTRR +extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern void mtrr_save_fixed_ranges(void *); +extern void mtrr_save_state(void); +extern int mtrr_add(unsigned long base, unsigned long size, + unsigned int type, bool increment); +extern int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment); +extern int mtrr_del(int reg, unsigned long base, unsigned long size); +extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); +extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); +extern void mtrr_ap_init(void); +extern void mtrr_bp_init(void); +extern int mtrr_trim_uncached_memory(unsigned long end_pfn); +extern int amd_special_default_mtrr(void); +# else +static inline u8 mtrr_type_lookup(u64 addr, u64 end) +{ + /* + * Return no-MTRRs: + */ + return 0xff; +} +#define mtrr_save_fixed_ranges(arg) do {} while (0) +#define mtrr_save_state() do {} while (0) +static inline int mtrr_add(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + return -ENODEV; +} +static inline int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + return -ENODEV; +} +static inline int mtrr_del(int reg, unsigned long base, unsigned long size) +{ + return -ENODEV; +} +static inline int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + return -ENODEV; +} +static inline int mtrr_trim_uncached_memory(unsigned 
long end_pfn) +{ + return 0; +} +static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +{ +} + +#define mtrr_ap_init() do {} while (0) +#define mtrr_bp_init() do {} while (0) +# endif + +#ifdef CONFIG_COMPAT +#include <linux/compat.h> + +struct mtrr_sentry32 { + compat_ulong_t base; /* Base address */ + compat_uint_t size; /* Size of region */ + compat_uint_t type; /* Type of region */ +}; + +struct mtrr_gentry32 { + compat_ulong_t regnum; /* Register number */ + compat_uint_t base; /* Base address */ + compat_uint_t size; /* Size of region */ + compat_uint_t type; /* Type of region */ +}; + +#define MTRR_IOCTL_BASE 'M' + +#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32) +#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32) +#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32) +#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32) +#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32) +#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32) +#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32) +#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32) +#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32) +#define MTRRIOC32_KILL_PAGE_ENTRY \ + _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) +#endif /* CONFIG_COMPAT */ + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h new file mode 100644 index 000000000000..a731b9c573a6 --- /dev/null +++ b/arch/x86/include/asm/mutex.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "mutex_32.h" +#else +# include "mutex_64.h" +#endif diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h new file mode 100644 index 000000000000..03f90c8a5a7c --- /dev/null +++ b/arch/x86/include/asm/mutex_32.h @@ -0,0 +1,125 @@ +/* + * Assembly implementation of the mutex fastpath, based on atomic + * decrement/increment. + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + */ +#ifndef _ASM_X86_MUTEX_32_H +#define _ASM_X86_MUTEX_32_H + +#include <asm/alternative.h> + +/** + * __mutex_fastpath_lock - try to take the lock by moving the count + * from 1 to a 0 value + * @count: pointer of type atomic_t + * @fn: function to call if the original value was not 1 + * + * Change the count from 1 to a value lower than 1, and call <fn> if it + * wasn't 1 originally. This function MUST leave the value lower than 1 + * even when the "1" assertion wasn't true. + */ +#define __mutex_fastpath_lock(count, fail_fn) \ +do { \ + unsigned int dummy; \ + \ + typecheck(atomic_t *, count); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ + \ + asm volatile(LOCK_PREFIX " decl (%%eax)\n" \ + " jns 1f \n" \ + " call " #fail_fn "\n" \ + "1:\n" \ + : "=a" (dummy) \ + : "a" (count) \ + : "memory", "ecx", "edx"); \ +} while (0) + + +/** + * __mutex_fastpath_lock_retval - try to take the lock by moving the count + * from 1 to a 0 value + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 1 + * + * Change the count from 1 to a value lower than 1, and call <fail_fn> if it + * wasn't 1 originally. 
This function returns 0 if the fastpath succeeds, + * or anything the slow path function returns. + */ +static inline int __mutex_fastpath_lock_retval(atomic_t *count, + int (*fail_fn)(atomic_t *)) +{ + if (unlikely(atomic_dec_return(count) < 0)) + return fail_fn(count); + else + return 0; +} + +/** + * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1 + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 0 + * + * Try to promote the mutex from 0 to 1. If it wasn't 0, call <fail_fn>. + * In the failure case, this function is allowed to either set the value + * to 1, or to set it to a value lower than 1. + * + * If the implementation sets it to a value of lower than 1, the + * __mutex_slowpath_needs_to_unlock() macro needs to return 1; otherwise + * it needs to return 0. + */ +#define __mutex_fastpath_unlock(count, fail_fn) \ +do { \ + unsigned int dummy; \ + \ + typecheck(atomic_t *, count); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ + \ + asm volatile(LOCK_PREFIX " incl (%%eax)\n" \ + " jg 1f\n" \ + " call " #fail_fn "\n" \ + "1:\n" \ + : "=a" (dummy) \ + : "a" (count) \ + : "memory", "ecx", "edx"); \ +} while (0) + +#define __mutex_slowpath_needs_to_unlock() 1 + +/** + * __mutex_fastpath_trylock - try to acquire the mutex, without waiting + * + * @count: pointer of type atomic_t + * @fail_fn: fallback function + * + * Change the count from 1 to a value lower than 1, and return 0 (failure) + * if it wasn't 1 originally, or return 1 (success) otherwise. This function + * MUST leave the value lower than 1 even when the "1" assertion wasn't true. + * Additionally, if the value was < 0 originally, this function must not leave + * it at 0 on failure. + */ +static inline int __mutex_fastpath_trylock(atomic_t *count, + int (*fail_fn)(atomic_t *)) +{ + /* + * We have two variants here. The cmpxchg based one is the best one + * because it never induces a false contention state. It is included + * here because architectures using the inc/dec algorithms over the + * xchg ones are much more likely to support cmpxchg natively. + * + * If not, we fall back to the spinlock based variant - that is + * just as efficient (and simpler) as a 'destructive' probing of + * the mutex state would be. + */ +#ifdef __HAVE_ARCH_CMPXCHG + if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + return 1; + return 0; +#else + return fail_fn(count); +#endif +} + +#endif /* _ASM_X86_MUTEX_32_H */ diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h new file mode 100644 index 000000000000..68a87b0f8e29 --- /dev/null +++ b/arch/x86/include/asm/mutex_64.h @@ -0,0 +1,100 @@ +/* + * Assembly implementation of the mutex fastpath, based on atomic + * decrement/increment. + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + */ +#ifndef _ASM_X86_MUTEX_64_H +#define _ASM_X86_MUTEX_64_H + +/** + * __mutex_fastpath_lock - decrement and call function if negative + * @v: pointer of type atomic_t + * @fail_fn: function to call if the result is negative + * + * Atomically decrements @v and calls <fail_fn> if the result is negative.
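+ * (Typical use by the generic mutex core is, roughly, __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); the heavy clobber list below exists because <fail_fn> is reached through a real CALL, so all caller-saved registers must be assumed dirty.)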
+ */ +#define __mutex_fastpath_lock(v, fail_fn) \ +do { \ + unsigned long dummy; \ + \ + typecheck(atomic_t *, v); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ + \ + asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \ + " jns 1f \n" \ + " call " #fail_fn "\n" \ + "1:" \ + : "=D" (dummy) \ + : "D" (v) \ + : "rax", "rsi", "rdx", "rcx", \ + "r8", "r9", "r10", "r11", "memory"); \ +} while (0) + +/** + * __mutex_fastpath_lock_retval - try to take the lock by moving the count + * from 1 to a 0 value + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 1 + * + * Change the count from 1 to a value lower than 1, and call <fail_fn> if + * it wasn't 1 originally. This function returns 0 if the fastpath succeeds, + * or anything the slow path function returns + */ +static inline int __mutex_fastpath_lock_retval(atomic_t *count, + int (*fail_fn)(atomic_t *)) +{ + if (unlikely(atomic_dec_return(count) < 0)) + return fail_fn(count); + else + return 0; +} + +/** + * __mutex_fastpath_unlock - increment and call function if nonpositive + * @v: pointer of type atomic_t + * @fail_fn: function to call if the result is nonpositive + * + * Atomically increments @v and calls <fail_fn> if the result is nonpositive. + */ +#define __mutex_fastpath_unlock(v, fail_fn) \ +do { \ + unsigned long dummy; \ + \ + typecheck(atomic_t *, v); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ + \ + asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \ + " jg 1f\n" \ + " call " #fail_fn "\n" \ + "1:" \ + : "=D" (dummy) \ + : "D" (v) \ + : "rax", "rsi", "rdx", "rcx", \ + "r8", "r9", "r10", "r11", "memory"); \ +} while (0) + +#define __mutex_slowpath_needs_to_unlock() 1 + +/** + * __mutex_fastpath_trylock - try to acquire the mutex, without waiting + * + * @count: pointer of type atomic_t + * @fail_fn: fallback function + * + * Change the count from 1 to 0 and return 1 (success), or return 0 (failure) + * if it wasn't 1 originally. [the fallback function is never used on + * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.] + */ +static inline int __mutex_fastpath_trylock(atomic_t *count, + int (*fail_fn)(atomic_t *)) +{ + if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + return 1; + else + return 0; +} + +#endif /* _ASM_X86_MUTEX_64_H */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h new file mode 100644 index 000000000000..c45a0a568dff --- /dev/null +++ b/arch/x86/include/asm/nmi.h @@ -0,0 +1,81 @@ +#ifndef _ASM_X86_NMI_H +#define _ASM_X86_NMI_H + +#include <linux/pm.h> +#include <asm/irq.h> +#include <asm/io.h> + +#ifdef ARCH_HAS_NMI_WATCHDOG + +/** + * do_nmi_callback + * + * Check to see if a callback exists and execute it. Return 1 + * if the handler exists and was handled successfully. 
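+ * (A zero return is expected to send the caller down the default unknown-NMI path.)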
+ */ +int do_nmi_callback(struct pt_regs *regs, int cpu); + +extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); +extern int check_nmi_watchdog(void); +extern int nmi_watchdog_enabled; +extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); +extern int avail_to_resrv_perfctr_nmi(unsigned int); +extern int reserve_perfctr_nmi(unsigned int); +extern void release_perfctr_nmi(unsigned int); +extern int reserve_evntsel_nmi(unsigned int); +extern void release_evntsel_nmi(unsigned int); + +extern void setup_apic_nmi_watchdog(void *); +extern void stop_apic_nmi_watchdog(void *); +extern void disable_timer_nmi_watchdog(void); +extern void enable_timer_nmi_watchdog(void); +extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason); +extern void cpu_nmi_set_wd_enabled(void); + +extern atomic_t nmi_active; +extern unsigned int nmi_watchdog; +#define NMI_NONE 0 +#define NMI_IO_APIC 1 +#define NMI_LOCAL_APIC 2 +#define NMI_INVALID 3 + +struct ctl_table; +struct file; +extern int proc_nmi_enabled(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int unknown_nmi_panic; + +void __trigger_all_cpu_backtrace(void); +#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() + +static inline void localise_nmi_watchdog(void) +{ + if (nmi_watchdog == NMI_IO_APIC) + nmi_watchdog = NMI_LOCAL_APIC; +} + +/* check if nmi_watchdog is active (i.e. was specified at boot) */ +static inline int nmi_watchdog_active(void) +{ + /* + * actually it should be: + * return (nmi_watchdog == NMI_LOCAL_APIC || + * nmi_watchdog == NMI_IO_APIC) + * but since they are powers of two we can use a + * cheaper way --cvg + */ + return nmi_watchdog & 0x3; +} +#endif + +void lapic_watchdog_stop(void); +int lapic_watchdog_init(unsigned nmi_hz); +int lapic_wd_event(unsigned nmi_hz); +unsigned lapic_adjust_nmi_hz(unsigned hz); +int lapic_watchdog_ok(void); +void disable_lapic_nmi_watchdog(void); +void enable_lapic_nmi_watchdog(void); +void stop_nmi(void); +void restart_nmi(void); + +#endif /* _ASM_X86_NMI_H */ diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h new file mode 100644 index 000000000000..ad2668ee1aa7 --- /dev/null +++ b/arch/x86/include/asm/nops.h @@ -0,0 +1,118 @@ +#ifndef _ASM_X86_NOPS_H +#define _ASM_X86_NOPS_H + +/* Define nops for use with alternative() */ + +/* generic versions from gas + 1: nop + the following instructions are NOT nops in 64-bit mode, + for 64-bit mode use K8 or P6 nops instead + 2: movl %esi,%esi + 3: leal 0x00(%esi),%esi + 4: leal 0x00(,%esi,1),%esi + 6: leal 0x00000000(%esi),%esi + 7: leal 0x00000000(,%esi,1),%esi +*/ +#define GENERIC_NOP1 ".byte 0x90\n" +#define GENERIC_NOP2 ".byte 0x89,0xf6\n" +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 + +/* Opteron 64bit nops + 1: nop + 2: osp nop + 3: osp osp nop + 4: osp osp osp nop +*/ +#define K8_NOP1 GENERIC_NOP1 +#define K8_NOP2 ".byte 0x66,0x90\n" +#define K8_NOP3 ".byte 0x66,0x66,0x90\n" +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" +#define K8_NOP5 K8_NOP3 K8_NOP2 +#define K8_NOP6 K8_NOP3 K8_NOP3 +#define K8_NOP7 K8_NOP4 K8_NOP3 +#define K8_NOP8 K8_NOP4 K8_NOP4 + +/* K7 nops + uses eax dependencies (arbitrary choice) + 1: nop + 2: movl %eax,%eax + 3: leal (,%eax,1),%eax + 4: leal
0x00(,%eax,1),%eax + 6: leal 0x00000000(%eax),%eax + 7: leal 0x00000000(,%eax,1),%eax +*/ +#define K7_NOP1 GENERIC_NOP1 +#define K7_NOP2 ".byte 0x8b,0xc0\n" +#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" +#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" +#define K7_NOP5 K7_NOP4 ASM_NOP1 +#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" +#define K7_NOP8 K7_NOP7 ASM_NOP1 + +/* P6 nops + uses eax dependencies (Intel-recommended choice) + 1: nop + 2: osp nop + 3: nopl (%eax) + 4: nopl 0x00(%eax) + 5: nopl 0x00(%eax,%eax,1) + 6: osp nopl 0x00(%eax,%eax,1) + 7: nopl 0x00000000(%eax) + 8: nopl 0x00000000(%eax,%eax,1) +*/ +#define P6_NOP1 GENERIC_NOP1 +#define P6_NOP2 ".byte 0x66,0x90\n" +#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" +#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" +#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" +#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" +#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" +#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" + +#if defined(CONFIG_MK7) +#define ASM_NOP1 K7_NOP1 +#define ASM_NOP2 K7_NOP2 +#define ASM_NOP3 K7_NOP3 +#define ASM_NOP4 K7_NOP4 +#define ASM_NOP5 K7_NOP5 +#define ASM_NOP6 K7_NOP6 +#define ASM_NOP7 K7_NOP7 +#define ASM_NOP8 K7_NOP8 +#elif defined(CONFIG_X86_P6_NOP) +#define ASM_NOP1 P6_NOP1 +#define ASM_NOP2 P6_NOP2 +#define ASM_NOP3 P6_NOP3 +#define ASM_NOP4 P6_NOP4 +#define ASM_NOP5 P6_NOP5 +#define ASM_NOP6 P6_NOP6 +#define ASM_NOP7 P6_NOP7 +#define ASM_NOP8 P6_NOP8 +#elif defined(CONFIG_X86_64) +#define ASM_NOP1 K8_NOP1 +#define ASM_NOP2 K8_NOP2 +#define ASM_NOP3 K8_NOP3 +#define ASM_NOP4 K8_NOP4 +#define ASM_NOP5 K8_NOP5 +#define ASM_NOP6 K8_NOP6 +#define ASM_NOP7 K8_NOP7 +#define ASM_NOP8 K8_NOP8 +#else +#define ASM_NOP1 GENERIC_NOP1 +#define ASM_NOP2 GENERIC_NOP2 +#define ASM_NOP3 GENERIC_NOP3 +#define ASM_NOP4 GENERIC_NOP4 +#define ASM_NOP5 GENERIC_NOP5 +#define ASM_NOP6 GENERIC_NOP6 +#define ASM_NOP7 GENERIC_NOP7 +#define ASM_NOP8 GENERIC_NOP8 +#endif + +#define ASM_NOP_MAX 8 + +#endif /* _ASM_X86_NOPS_H */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h new file mode 100644 index 000000000000..27da400d3138 --- /dev/null +++ b/arch/x86/include/asm/numa.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "numa_32.h" +#else +# include "numa_64.h" +#endif diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h new file mode 100644 index 000000000000..e9f5db796244 --- /dev/null +++ b/arch/x86/include/asm/numa_32.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_NUMA_32_H +#define _ASM_X86_NUMA_32_H + +extern int pxm_to_nid(int pxm); +extern void numa_remove_cpu(int cpu); + +#ifdef CONFIG_NUMA +extern void set_highmem_pages_init(void); +#endif + +#endif /* _ASM_X86_NUMA_32_H */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h new file mode 100644 index 000000000000..064ed6df4cbe --- /dev/null +++ b/arch/x86/include/asm/numa_64.h @@ -0,0 +1,43 @@ +#ifndef _ASM_X86_NUMA_64_H +#define _ASM_X86_NUMA_64_H + +#include <linux/nodemask.h> +#include <asm/apicdef.h> + +struct bootnode { + u64 start; + u64 end; +}; + +extern int compute_hash_shift(struct bootnode *nodes, int numblks, + int *nodeids); + +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) + +extern void numa_init_array(void); +extern int numa_off; + +extern void srat_reserve_add_area(int nodeid); +extern int hotadd_percent; + +extern s16 apicid_to_node[MAX_LOCAL_APIC]; + +extern unsigned long numa_free_all_bootmem(void); +extern void setup_node_bootmem(int nodeid, 
unsigned long start,
+ unsigned long end);
+
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+extern void __cpuinit numa_set_node(int cpu, int node);
+extern void __cpuinit numa_clear_node(int cpu);
+extern void __cpuinit numa_add_cpu(int cpu);
+extern void __cpuinit numa_remove_cpu(int cpu);
+#else
+static inline void init_cpu_to_node(void) { }
+static inline void numa_set_node(int cpu, int node) { }
+static inline void numa_clear_node(int cpu) { }
+static inline void numa_add_cpu(int cpu) { }
+static inline void numa_remove_cpu(int cpu) { }
+#endif
+
+#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
new file mode 100644
index 000000000000..1e8bd30b4c16
--- /dev/null
+++ b/arch/x86/include/asm/numaq.h
@@ -0,0 +1,169 @@
+/*
+ * Written by: Patricia Gaughen, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <gone@us.ibm.com>
+ */
+
+#ifndef _ASM_X86_NUMAQ_H
+#define _ASM_X86_NUMAQ_H
+
+#ifdef CONFIG_X86_NUMAQ
+
+extern int found_numaq;
+extern int get_memcfg_numaq(void);
+
+/*
+ * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
+ * interface through which the NUMA-Q firmware describes each quad's
+ * memory layout to the kernel.
+ */
+#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private
+ quad space */
+
+/*
+ * Communication area for each processor on lynxer-processor tests.
+ *
+ * NOTE: If you change the size of this eachquadmem structure you need
+ * to change the definition for EACH_QUAD_SIZE.
+ */
+struct eachquadmem {
+ unsigned int priv_mem_start; /* Starting address of this */
+ /* quad's private memory. */
+ /* This is always 0. */
+ /* In MB. */
+ unsigned int priv_mem_size; /* Size of this quad's */
+ /* private memory. */
+ /* In MB. */
+ unsigned int low_shrd_mem_strp_start;/* Starting address of this */
+ /* quad's low shared block */
+ /* (untranslated). */
+ /* In MB. */
+ unsigned int low_shrd_mem_start; /* Starting address of this */
+ /* quad's low shared memory */
+ /* (untranslated). */
+ /* In MB. */
+ unsigned int low_shrd_mem_size; /* Size of this quad's low */
+ /* shared memory. */
+ /* In MB. */
+ unsigned int lmmio_copb_start; /* Starting address of this */
+ /* quad's local memory */
+ /* mapped I/O in the */
+ /* compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_copb_size; /* Size of this quad's local */
+ /* memory mapped I/O in the */
+ /* compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_nopb_start; /* Starting address of this */
+ /* quad's local memory */
+ /* mapped I/O in the */
+ /* non-compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_nopb_size; /* Size of this quad's local */
+ /* memory mapped I/O in the */
+ /* non-compatibility OPB. */
+ /* In MB.
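The eachquadmem fields above describe each quad's memory regions in MB, so a consumer has to shift by 20 before using them as byte addresses. A minimal sketch of that conversion (the helper names are hypothetical and not part of this patch; u64 is assumed from <linux/types.h>):

/* Sketch: a quad's private-memory window as a byte range. */
static inline u64 quad_priv_mem_start(const struct eachquadmem *eq)
{
	return (u64)eq->priv_mem_start << 20;	/* MB to bytes */
}

static inline u64 quad_priv_mem_end(const struct eachquadmem *eq)
{
	return ((u64)eq->priv_mem_start + eq->priv_mem_size) << 20;
}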
 */
+ unsigned int io_apic_0_start; /* Starting address of I/O */
+ /* APIC 0. */
+ unsigned int io_apic_0_sz; /* Size I/O APIC 0. */
+ unsigned int io_apic_1_start; /* Starting address of I/O */
+ /* APIC 1. */
+ unsigned int io_apic_1_sz; /* Size I/O APIC 1. */
+ unsigned int hi_shrd_mem_start; /* Starting address of this */
+ /* quad's high shared memory.*/
+ /* In MB. */
+ unsigned int hi_shrd_mem_size; /* Size of this quad's high */
+ /* shared memory. */
+ /* In MB. */
+ unsigned int mps_table_addr; /* Address of this quad's */
+ /* MPS tables from BIOS, */
+ /* in system space.*/
+ unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */
+ /* local access of MDC. */
+ unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */
+ /* remote access of MDC. */
+ unsigned int mm_port_io_start; /* Starting address of this */
+ /* quad's memory mapped Port */
+ /* I/O space. */
+ unsigned int mm_port_io_size; /* Size of this quad's memory*/
+ /* mapped Port I/O space. */
+ unsigned int mm_rmt_io_apic_start; /* Starting address of this */
+ /* quad's memory mapped */
+ /* remote I/O APIC space. */
+ unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/
+ /* mapped remote I/O APIC */
+ /* space. */
+ unsigned int mm_isa_start; /* Starting address of this */
+ /* quad's memory mapped ISA */
+ /* space (contains MDC */
+ /* memory space). */
+ unsigned int mm_isa_size; /* Size of this quad's memory*/
+ /* mapped ISA space (contains*/
+ /* MDC memory space). */
+ unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/
+ unsigned int lcl_qmi_addr; /* Local addr to access QMI. */
+};
+
+/*
+ * Note: This structure must NOT be changed unless the multiproc and
+ * OS are changed to reflect the new structure.
+ */
+struct sys_cfg_data {
+ unsigned int quad_id;
+ unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */
+ unsigned int scd_version; /* Version number of this table. */
+ unsigned int first_quad_id;
+ unsigned int quads_present31_0; /* 1 bit for each quad */
+ unsigned int quads_present63_32; /* 1 bit for each quad */
+ unsigned int config_flags;
+ unsigned int boot_flags;
+ unsigned int csr_start_addr; /* Absolute value (not in MB) */
+ unsigned int csr_size; /* Absolute value (not in MB) */
+ unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */
+ unsigned int lcl_apic_size; /* Absolute value (not in MB) */
+ unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */
+ unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */
+ /* may not be totally populated */
+ unsigned int split_mem_enbl; /* 0 for no low shared memory */
+ unsigned int mmio_sz; /* Size of total system memory mapped I/O */
+ /* (in MB). */
+ unsigned int quad_spin_lock; /* Spare location used for quad */
+ /* bringup. */
+ unsigned int nonzero55; /* For checksumming. */
+ unsigned int nonzeroaa; /* For checksumming.
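quads_present31_0 and quads_present63_32 above together form a 64-bit presence bitmap, one bit per quad. A minimal sketch of testing a quad against it (the helper name is hypothetical):

/* Sketch: is quad 'q' marked present in the SCD bitmap? */
static inline int quad_present(const struct sys_cfg_data *scd, int q)
{
	if (q < 32)
		return (scd->quads_present31_0 >> q) & 1;
	return (scd->quads_present63_32 >> (q - 32)) & 1;
}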
*/ + unsigned int scd_magic_number; + unsigned int system_type; + unsigned int checksum; + /* + * memory configuration area for each quad + */ + struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */ +}; + +void numaq_tsc_disable(void); + +#else +static inline int get_memcfg_numaq(void) +{ + return 0; +} +#endif /* CONFIG_X86_NUMAQ */ +#endif /* _ASM_X86_NUMAQ_H */ + diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h new file mode 100644 index 000000000000..0bf2a06b7a4e --- /dev/null +++ b/arch/x86/include/asm/numaq/apic.h @@ -0,0 +1,136 @@ +#ifndef __ASM_NUMAQ_APIC_H +#define __ASM_NUMAQ_APIC_H + +#include <asm/io.h> +#include <linux/mmzone.h> +#include <linux/nodemask.h> + +#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) + +static inline cpumask_t target_cpus(void) +{ + return CPU_MASK_ALL; +} + +#define NO_BALANCE_IRQ (1) +#define esr_disable (1) + +#define INT_DELIVERY_MODE dest_LowestPrio +#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */ + +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return physid_isset(apicid, bitmap); +} +static inline unsigned long check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} +#define apicid_cluster(apicid) (apicid & 0xF0) + +static inline int apic_id_registered(void) +{ + return 1; +} + +static inline void init_apic_ldr(void) +{ + /* Already done in NUMA-Q firmware */ +} + +static inline void setup_apic_routing(void) +{ + printk("Enabling APIC mode: %s. Using %d I/O APICs\n", + "NUMA-Q", nr_ioapics); +} + +/* + * Skip adding the timer int on secondary nodes, which causes + * a small but painful rift in the time-space continuum. + */ +static inline int multi_timer_check(int apic, int irq) +{ + return apic != 0 && irq == 0; +} + +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +{ + /* We don't have a good way to do this yet - hack */ + return physids_promote(0xFUL); +} + +/* Mapping from cpu number to logical apicid */ +extern u8 cpu_2_logical_apicid[]; +static inline int cpu_to_logical_apicid(int cpu) +{ + if (cpu >= NR_CPUS) + return BAD_APICID; + return (int)cpu_2_logical_apicid[cpu]; +} + +/* + * Supporting over 60 cpus on NUMA-Q requires a locality-dependent + * cpu to APIC ID relation to properly interact with the intelligent + * mode of the cluster controller. 
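The encoding implemented by cpu_present_to_apicid() just below places the quad number in the high nibble and a one-hot local-CPU bit in the low nibble. A worked example, written as a standalone userspace check rather than kernel code:

/* Sketch: MPS CPU 5 is quad 1, local CPU 1, so its APIC ID is 0x12. */
#include <assert.h>

int main(void)
{
	int mps_cpu = 5;
	int apicid = ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));

	assert(apicid == 0x12);
	return 0;
}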
+ */ +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < 60) + return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3)); + else + return BAD_APICID; +} + +static inline int apicid_to_node(int logical_apicid) +{ + return logical_apicid >> 4; +} + +static inline physid_mask_t apicid_to_cpu_present(int logical_apicid) +{ + int node = apicid_to_node(logical_apicid); + int cpu = __ffs(logical_apicid & 0xf); + + return physid_mask_of_physid(cpu + 4*node); +} + +extern void *xquad_portio; + +static inline void setup_portio_remap(void) +{ + int num_quads = num_online_nodes(); + + if (num_quads <= 1) + return; + + printk("Remapping cross-quad port I/O for %d quads\n", num_quads); + xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD); + printk("xquad_portio vaddr 0x%08lx, len %08lx\n", + (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); +} + +static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +{ + return (1); +} + +static inline void enable_apic_mode(void) +{ +} + +/* + * We use physical apicids here, not logical, so just return the default + * physical broadcast to stop people from breaking us + */ +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + return (int) 0xF; +} + +/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ +static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +#endif /* __ASM_NUMAQ_APIC_H */ diff --git a/arch/x86/include/asm/numaq/apicdef.h b/arch/x86/include/asm/numaq/apicdef.h new file mode 100644 index 000000000000..e012a46cc22a --- /dev/null +++ b/arch/x86/include/asm/numaq/apicdef.h @@ -0,0 +1,14 @@ +#ifndef __ASM_NUMAQ_APICDEF_H +#define __ASM_NUMAQ_APICDEF_H + + +#define APIC_ID_MASK (0xF<<24) + +static inline unsigned get_apic_id(unsigned long x) +{ + return (((x)>>24)&0x0F); +} + +#define GET_APIC_ID(x) get_apic_id(x) + +#endif diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h new file mode 100644 index 000000000000..935588d286cf --- /dev/null +++ b/arch/x86/include/asm/numaq/ipi.h @@ -0,0 +1,25 @@ +#ifndef __ASM_NUMAQ_IPI_H +#define __ASM_NUMAQ_IPI_H + +void send_IPI_mask_sequence(cpumask_t, int vector); + +static inline void send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static inline void send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + send_IPI_mask(mask, vector); +} + +static inline void send_IPI_all(int vector) +{ + send_IPI_mask(cpu_online_map, vector); +} + +#endif /* __ASM_NUMAQ_IPI_H */ diff --git a/arch/x86/include/asm/numaq/mpparse.h b/arch/x86/include/asm/numaq/mpparse.h new file mode 100644 index 000000000000..252292e077b6 --- /dev/null +++ b/arch/x86/include/asm/numaq/mpparse.h @@ -0,0 +1,7 @@ +#ifndef __ASM_NUMAQ_MPPARSE_H +#define __ASM_NUMAQ_MPPARSE_H + +extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid); + +#endif /* __ASM_NUMAQ_MPPARSE_H */ diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h new file mode 100644 index 000000000000..c577bda5b1c5 --- /dev/null +++ b/arch/x86/include/asm/numaq/wakecpu.h @@ -0,0 +1,43 @@ +#ifndef __ASM_NUMAQ_WAKECPU_H +#define __ASM_NUMAQ_WAKECPU_H + +/* This file copes with machines that wakeup secondary CPUs by NMIs */ + +#define WAKE_SECONDARY_VIA_NMI + +#define TRAMPOLINE_LOW phys_to_virt(0x8) +#define TRAMPOLINE_HIGH 
phys_to_virt(0xa)
+
+#define boot_cpu_apicid boot_cpu_logical_apicid
+
+/* We don't do anything here because we use NMIs to boot instead */
+static inline void wait_for_init_deassert(atomic_t *deassert)
+{
+}
+
+/*
+ * Because we use NMIs rather than the INIT-STARTUP sequence to
+ * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
+ */
+static inline void smp_callin_clear_local_apic(void)
+{
+ clear_local_APIC();
+}
+
+static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+ printk("Storing NMI vector\n");
+ *high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
+ *low = *((volatile unsigned short *) TRAMPOLINE_LOW);
+}
+
+static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
+{
+ printk("Restoring NMI vector\n");
+ *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high;
+ *((volatile unsigned short *) TRAMPOLINE_LOW) = *low;
+}
+
+#define inquire_remote_apic(apicid) {}
+
+#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
new file mode 100644
index 000000000000..834a30295fab
--- /dev/null
+++ b/arch/x86/include/asm/olpc.h
@@ -0,0 +1,132 @@
+/* OLPC machine specific definitions */
+
+#ifndef _ASM_X86_OLPC_H
+#define _ASM_X86_OLPC_H
+
+#include <asm/geode.h>
+
+struct olpc_platform_t {
+ int flags;
+ uint32_t boardrev;
+ int ecver;
+};
+
+#define OLPC_F_PRESENT 0x01
+#define OLPC_F_DCON 0x02
+#define OLPC_F_VSA 0x04
+
+#ifdef CONFIG_OLPC
+
+extern struct olpc_platform_t olpc_platform_info;
+
+/*
+ * OLPC board IDs contain the major build number within the mask 0x0ff0,
+ * and the minor build number within 0x000f. Pre-builds have a minor
+ * number less than 8, and normal builds start at 8. For example, 0x0B10
+ * is a PreB1, and 0x0C18 is a C1.
+ */
+
+static inline uint32_t olpc_board(uint8_t id)
+{
+ return (id << 4) | 0x8;
+}
+
+static inline uint32_t olpc_board_pre(uint8_t id)
+{
+ return id << 4;
+}
+
+static inline int machine_is_olpc(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_PRESENT) ? 1 : 0;
+}
+
+/*
+ * The DCON is OLPC's Display Controller. It has a number of unique
+ * features that we might want to take advantage of.
+ */
+static inline int olpc_has_dcon(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0;
+}
+
+/*
+ * The VSA is software from AMD that typical Geode BIOSes will include.
+ * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
+ * not include the VSA; instead, PCI is emulated by the kernel.
+ *
+ * The VSA is described further in arch/x86/pci/olpc.c.
+ */
+static inline int olpc_has_vsa(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
+}
+
+/*
+ * The "Mass Production" version of OLPC's XO is identified as being model
+ * C2. During the prototype phase, the following models (in chronological
+ * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
+ * were based on Geode GX CPUs, and models after that were based upon
+ * Geode LX CPUs. There were also some hand-assembled models floating
+ * around, referred to as PreB1, PreB2, etc.
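Concretely, olpc_board_pre(0xb1) yields 0x0b10 (a PreB1) and olpc_board(0xc1) yields 0x0c18 (a C1), matching the examples in the comment above. Together with olpc_board_at_least(), defined just below, that lets a driver gate behaviour on a minimum build; a minimal sketch, where the quirk helper is hypothetical:

/* Sketch: apply a quirk only on C1 or newer boards. */
static void maybe_enable_quirk(void)
{
	if (olpc_board_at_least(olpc_board(0xc1)))
		enable_c1_quirk();	/* hypothetical helper */
}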
+ */ +static inline int olpc_board_at_least(uint32_t rev) +{ + return olpc_platform_info.boardrev >= rev; +} + +#else + +static inline int machine_is_olpc(void) +{ + return 0; +} + +static inline int olpc_has_dcon(void) +{ + return 0; +} + +static inline int olpc_has_vsa(void) +{ + return 0; +} + +#endif + +/* EC related functions */ + +extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, + unsigned char *outbuf, size_t outlen); + +extern int olpc_ec_mask_set(uint8_t bits); +extern int olpc_ec_mask_unset(uint8_t bits); + +/* EC commands */ + +#define EC_FIRMWARE_REV 0x08 + +/* SCI source values */ + +#define EC_SCI_SRC_EMPTY 0x00 +#define EC_SCI_SRC_GAME 0x01 +#define EC_SCI_SRC_BATTERY 0x02 +#define EC_SCI_SRC_BATSOC 0x04 +#define EC_SCI_SRC_BATERR 0x08 +#define EC_SCI_SRC_EBOOK 0x10 +#define EC_SCI_SRC_WLAN 0x20 +#define EC_SCI_SRC_ACPWR 0x40 +#define EC_SCI_SRC_ALL 0x7F + +/* GPIO assignments */ + +#define OLPC_GPIO_MIC_AC geode_gpio(1) +#define OLPC_GPIO_DCON_IRQ geode_gpio(7) +#define OLPC_GPIO_THRM_ALRM geode_gpio(10) +#define OLPC_GPIO_SMB_CLK geode_gpio(14) +#define OLPC_GPIO_SMB_DATA geode_gpio(15) +#define OLPC_GPIO_WORKAUX geode_gpio(24) +#define OLPC_GPIO_LID geode_gpio(26) +#define OLPC_GPIO_ECSCI geode_gpio(27) + +#endif /* _ASM_X86_OLPC_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h new file mode 100644 index 000000000000..e9873a2e8695 --- /dev/null +++ b/arch/x86/include/asm/page.h @@ -0,0 +1,209 @@ +#ifndef _ASM_X86_PAGE_H +#define _ASM_X86_PAGE_H + +#include <linux/const.h> + +/* PAGE_SHIFT determines the page size */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#ifdef __KERNEL__ + +#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) + +/* Cast PAGE_MASK to a signed type so that it is sign-extended if + virtual addresses are 32-bits but physical addresses are larger + (ie, 32-bit PAE). */ +#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) + +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) + +#define HPAGE_SHIFT PMD_SHIFT +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) + +#define HUGE_MAX_HSTATE 2 + +#ifndef __ASSEMBLY__ +#include <linux/types.h> +#endif + +#ifdef CONFIG_X86_64 +#include <asm/page_64.h> +#else +#include <asm/page_32.h> +#endif /* CONFIG_X86_64 */ + +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) + +#define VM_DATA_DEFAULT_FLAGS \ + (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0 ) | \
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+
+#ifndef __ASSEMBLY__
+
+typedef struct { pgdval_t pgd; } pgd_t;
+typedef struct { pgprotval_t pgprot; } pgprot_t;
+
+extern int page_is_ram(unsigned long pagenr);
+extern int pagerange_is_ram(unsigned long start, unsigned long end);
+extern int devmem_is_allowed(unsigned long pagenr);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot);
+
+extern unsigned long max_low_pfn_mapped;
+extern unsigned long max_pfn_mapped;
+
+struct page;
+
+static inline void clear_user_page(void *page, unsigned long vaddr,
+ struct page *pg)
+{
+ clear_page(page);
+}
+
+static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
+ struct page *topage)
+{
+ copy_page(to, from);
+}
+
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+static inline pgd_t native_make_pgd(pgdval_t val)
+{
+ return (pgd_t) { val };
+}
+
+static inline pgdval_t native_pgd_val(pgd_t pgd)
+{
+ return pgd.pgd;
+}
+
+#if PAGETABLE_LEVELS >= 3
+#if PAGETABLE_LEVELS == 4
+typedef struct { pudval_t pud; } pud_t;
+
+static inline pud_t native_make_pud(pudval_t val)
+{
+ return (pud_t) { val };
+}
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return pud.pud;
+}
+#else /* PAGETABLE_LEVELS == 3 */
+#include <asm-generic/pgtable-nopud.h>
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return native_pgd_val(pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
+typedef struct { pmdval_t pmd; } pmd_t;
+
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+ return (pmd_t) { val };
+}
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return pmd.pmd;
+}
+#else /* PAGETABLE_LEVELS == 2 */
+#include <asm-generic/pgtable-nopmd.h>
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return native_pgd_val(pmd.pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static inline pte_t native_make_pte(pteval_t val)
+{
+ return (pte_t) { .pte = val };
+}
+
+static inline pteval_t native_pte_val(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline pteval_t native_pte_flags(pte_t pte)
+{
+ return native_pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define pgd_val(x) native_pgd_val(x)
+#define __pgd(x) native_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) native_pud_val(x)
+#define __pud(x) native_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) native_pmd_val(x)
+#define __pmd(x) native_make_pmd(x)
+#endif
+
+#define pte_val(x) native_pte_val(x)
+#define pte_flags(x) native_pte_flags(x)
+#define __pte(x) native_make_pte(x)
+
+#endif /* CONFIG_PARAVIRT */
+
+#define __pa(x) __phys_addr((unsigned long)(x))
+#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
+/* __pa_symbol should be used for C visible symbols.
+ This seems to be the official gcc blessed way to do such arithmetic.
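The single-field structs above (pgd_t, pgprot_t, and friends) exist purely so the compiler keeps raw values and page-table entries apart; all conversions have to go through the make/val helpers. A minimal sketch of the round trip, with an illustrative flag value:

/* Sketch: wrap a raw value, then recover it and its flag bits. */
static inline void pte_roundtrip_demo(void)
{
	pte_t pte = __pte(0x1000 | 0x1);	/* PFN bits | present bit */
	pteval_t raw = pte_val(pte);		/* 0x1001 again */
	pteval_t flags = pte_flags(pte);	/* low flag bits only */
	(void)raw; (void)flags;
}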
*/ +#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) + +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) + +#define __boot_va(x) __va(x) +#define __boot_pa(x) __pa(x) + +/* + * virt_to_page(kaddr) returns a valid pointer if and only if + * virt_addr_valid(kaddr) returns true. + */ +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +extern bool __virt_addr_valid(unsigned long kaddr); +#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) + +#endif /* __ASSEMBLY__ */ + +#include <asm-generic/memory_model.h> +#include <asm-generic/page.h> + +#define __HAVE_ARCH_GATE_AREA 1 + +#endif /* __KERNEL__ */ +#endif /* _ASM_X86_PAGE_H */ diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h new file mode 100644 index 000000000000..bcde0d7b4325 --- /dev/null +++ b/arch/x86/include/asm/page_32.h @@ -0,0 +1,136 @@ +#ifndef _ASM_X86_PAGE_32_H +#define _ASM_X86_PAGE_32_H + +/* + * This handles the memory map. + * + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has + * a virtual address space of one gigabyte, which limits the + * amount of physical memory you can use to about 950MB. + * + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G + * and CONFIG_HIGHMEM64G options in the kernel configuration. + */ +#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) + +#ifdef CONFIG_4KSTACKS +#define THREAD_ORDER 0 +#else +#define THREAD_ORDER 1 +#endif +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + +#define STACKFAULT_STACK 0 +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 0 +#define DEBUG_STACK 0 +#define MCE_STACK 0 +#define N_EXCEPTION_STACKS 1 + +#ifdef CONFIG_X86_PAE +/* 44=32+12, the limit we can fit into an unsigned long pfn */ +#define __PHYSICAL_MASK_SHIFT 44 +#define __VIRTUAL_MASK_SHIFT 32 +#define PAGETABLE_LEVELS 3 + +#ifndef __ASSEMBLY__ +typedef u64 pteval_t; +typedef u64 pmdval_t; +typedef u64 pudval_t; +typedef u64 pgdval_t; +typedef u64 pgprotval_t; + +typedef union { + struct { + unsigned long pte_low, pte_high; + }; + pteval_t pte; +} pte_t; +#endif /* __ASSEMBLY__ + */ +#else /* !CONFIG_X86_PAE */ +#define __PHYSICAL_MASK_SHIFT 32 +#define __VIRTUAL_MASK_SHIFT 32 +#define PAGETABLE_LEVELS 2 + +#ifndef __ASSEMBLY__ +typedef unsigned long pteval_t; +typedef unsigned long pmdval_t; +typedef unsigned long pudval_t; +typedef unsigned long pgdval_t; +typedef unsigned long pgprotval_t; + +typedef union { + pteval_t pte; + pteval_t pte_low; +} pte_t; + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_X86_PAE */ + +#ifndef __ASSEMBLY__ +typedef struct page *pgtable_t; +#endif + +#ifdef CONFIG_HUGETLB_PAGE +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#endif + +#ifndef __ASSEMBLY__ +#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) +#ifdef CONFIG_DEBUG_VIRTUAL +extern unsigned long __phys_addr(unsigned long); +#else +#define __phys_addr(x) __phys_addr_nodebug(x) +#endif +#define __phys_reloc_hide(x) RELOC_HIDE((x), 0) + +#ifdef CONFIG_FLATMEM +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif /* CONFIG_FLATMEM */ + +extern int nx_enabled; + +/* + * This much address space is reserved for vmalloc() and iomap() + * as well as fixmap mappings. 
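Under PAE the pte_t union above exposes each 64-bit entry as two 32-bit halves, pte_low and pte_high, because a 32-bit CPU cannot store the whole entry with one write. The split itself is plain arithmetic, as this sketch shows with an illustrative value:

/* Sketch: how a 64-bit PAE pte value lands in the union's halves. */
static inline void pae_split_demo(void)
{
	pte_t pte;
	pte.pte = 0x8000000001234067ULL;	/* NX bit | pfn | flags */
	/* now pte.pte_low == 0x01234067 and pte.pte_high == 0x80000000 */
}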
+ */ +extern unsigned int __VMALLOC_RESERVE; +extern int sysctl_legacy_va_layout; + +extern void find_low_pfn_range(void); +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); +extern void initmem_init(unsigned long, unsigned long); +extern void free_initmem(void); +extern void setup_bootmem_allocator(void); + + +#ifdef CONFIG_X86_USE_3DNOW +#include <asm/mmx.h> + +static inline void clear_page(void *page) +{ + mmx_clear_page(page); +} + +static inline void copy_page(void *to, void *from) +{ + mmx_copy_page(to, from); +} +#else /* !CONFIG_X86_USE_3DNOW */ +#include <linux/string.h> + +static inline void clear_page(void *page) +{ + memset(page, 0, PAGE_SIZE); +} + +static inline void copy_page(void *to, void *from) +{ + memcpy(to, from, PAGE_SIZE); +} +#endif /* CONFIG_X86_3DNOW */ +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PAGE_32_H */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h new file mode 100644 index 000000000000..5ebca29f44f0 --- /dev/null +++ b/arch/x86/include/asm/page_64.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_PAGE_64_H +#define _ASM_X86_PAGE_64_H + +#define PAGETABLE_LEVELS 4 + +#define THREAD_ORDER 1 +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define CURRENT_MASK (~(THREAD_SIZE - 1)) + +#define EXCEPTION_STACK_ORDER 0 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) + +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) + +#define IRQSTACK_ORDER 2 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) + +#define STACKFAULT_STACK 1 +#define DOUBLEFAULT_STACK 2 +#define NMI_STACK 3 +#define DEBUG_STACK 4 +#define MCE_STACK 5 +#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ + +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + +/* + * Set __PAGE_OFFSET to the most negative possible address + + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's + * what Xen requires. + */ +#define __PAGE_OFFSET _AC(0xffff880000000000, UL) + +#define __PHYSICAL_START CONFIG_PHYSICAL_START +#define __KERNEL_ALIGN 0x200000 + +/* + * Make sure kernel is aligned to 2MB address. Catching it at compile + * time is better. Change your config file and compile the kernel + * for a 2MB aligned address (CONFIG_PHYSICAL_START) + */ +#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 +#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" +#endif + +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) +#define __START_KERNEL_map _AC(0xffffffff80000000, UL) + +/* See Documentation/x86_64/mm.txt for a description of the memory map. */ +#define __PHYSICAL_MASK_SHIFT 46 +#define __VIRTUAL_MASK_SHIFT 48 + +/* + * Kernel image size is limited to 512 MB (see level2_kernel_pgt in + * arch/x86/kernel/head_64.S), and it is mapped here: + */ +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) + +#ifndef __ASSEMBLY__ +void clear_page(void *page); +void copy_page(void *to, void *from); + +/* duplicated to the one in bootmem.h */ +extern unsigned long max_pfn; +extern unsigned long phys_base; + +extern unsigned long __phys_addr(unsigned long); +#define __phys_reloc_hide(x) (x) + +/* + * These are used to make use of C type-checking.. 
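On 64-bit, __phys_addr() stays out of line because a kernel virtual address can come from two mappings: the direct map at __PAGE_OFFSET or the kernel-text map at __START_KERNEL_map, the latter additionally offset by phys_base. A simplified sketch of that translation (the real implementation adds debug checking):

/* Sketch: translate either kind of 64-bit kernel virtual address. */
static inline unsigned long phys_addr_sketch(unsigned long x)
{
	if (x >= __START_KERNEL_map)	/* kernel text/data mapping */
		return x - __START_KERNEL_map + phys_base;
	return x - PAGE_OFFSET;		/* direct (linear) mapping */
}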
+ */ +typedef unsigned long pteval_t; +typedef unsigned long pmdval_t; +typedef unsigned long pudval_t; +typedef unsigned long pgdval_t; +typedef unsigned long pgprotval_t; + +typedef struct page *pgtable_t; + +typedef struct { pteval_t pte; } pte_t; + +#define vmemmap ((struct page *)VMEMMAP_START) + +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); + +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +extern void free_initmem(void); + +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); + +#endif /* !__ASSEMBLY__ */ + +#ifdef CONFIG_FLATMEM +#define pfn_valid(pfn) ((pfn) < max_pfn) +#endif + + +#endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h new file mode 100644 index 000000000000..6f0d0422f4ca --- /dev/null +++ b/arch/x86/include/asm/param.h @@ -0,0 +1,22 @@ +#ifndef _ASM_X86_PARAM_H +#define _ASM_X86_PARAM_H + +#ifdef __KERNEL__ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ +# define USER_HZ 100 /* some user interfaces are */ +# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ +#endif + +#ifndef HZ +#define HZ 100 +#endif + +#define EXEC_PAGESIZE 4096 + +#ifndef NOGROUP +#define NOGROUP (-1) +#endif + +#define MAXHOSTNAMELEN 64 /* max length of hostname */ + +#endif /* _ASM_X86_PARAM_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h new file mode 100644 index 000000000000..ba3e2ff6aedc --- /dev/null +++ b/arch/x86/include/asm/paravirt.h @@ -0,0 +1,1650 @@ +#ifndef _ASM_X86_PARAVIRT_H +#define _ASM_X86_PARAVIRT_H +/* Various instructions on x86 need to be replaced for + * para-virtualization: those hooks are defined here. */ + +#ifdef CONFIG_PARAVIRT +#include <asm/page.h> +#include <asm/asm.h> + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0 +#define CLBR_EAX (1 << 0) +#define CLBR_ECX (1 << 1) +#define CLBR_EDX (1 << 2) + +#ifdef CONFIG_X86_64 +#define CLBR_RSI (1 << 3) +#define CLBR_RDI (1 << 4) +#define CLBR_R8 (1 << 5) +#define CLBR_R9 (1 << 6) +#define CLBR_R10 (1 << 7) +#define CLBR_R11 (1 << 8) +#define CLBR_ANY ((1 << 9) - 1) +#include <asm/desc_defs.h> +#else +/* CLBR_ANY should match all regs platform has. For i386, that's just it */ +#define CLBR_ANY ((1 << 3) - 1) +#endif /* X86_64 */ + +#ifndef __ASSEMBLY__ +#include <linux/types.h> +#include <linux/cpumask.h> +#include <asm/kmap_types.h> +#include <asm/desc_defs.h> + +struct page; +struct thread_struct; +struct desc_ptr; +struct tss_struct; +struct mm_struct; +struct desc_struct; + +/* general info */ +struct pv_info { + unsigned int kernel_rpl; + int shared_kernel_pmd; + int paravirt_enabled; + const char *name; +}; + +struct pv_init_ops { + /* + * Patch may replace one of the defined code sequences with + * arbitrary code, subject to the same register constraints. + * This generally means the code is not free to clobber any + * registers other than EAX. The patch function should return + * the number of bytes of code generated, as we nop pad the + * rest in generic code. 
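A backend with nothing special to emit for a given site can defer to paravirt_patch_default() (declared later in this header), which patches in a direct call and leaves any slack to be nop-padded by generic code. A minimal sketch of such a hook (the function name is hypothetical):

/* Sketch: a pv backend patch hook that defers everything. */
static unsigned my_patch(u8 type, u16 clobbers, void *insnbuf,
			 unsigned long addr, unsigned len)
{
	return paravirt_patch_default(type, clobbers, insnbuf, addr, len);
}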
+ */
+ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
+ unsigned long addr, unsigned len);
+
+ /* Basic arch-specific setup */
+ void (*arch_setup)(void);
+ char *(*memory_setup)(void);
+ void (*post_allocator_init)(void);
+
+ /* Print a banner to identify the environment */
+ void (*banner)(void);
+};
+
+
+struct pv_lazy_ops {
+ /* Set deferred update mode, used for batching operations. */
+ void (*enter)(void);
+ void (*leave)(void);
+};
+
+struct pv_time_ops {
+ void (*time_init)(void);
+
+ /* Get and set time of day */
+ unsigned long (*get_wallclock)(void);
+ int (*set_wallclock)(unsigned long);
+
+ unsigned long long (*sched_clock)(void);
+ unsigned long (*get_tsc_khz)(void);
+};
+
+struct pv_cpu_ops {
+ /* hooks for various privileged instructions */
+ unsigned long (*get_debugreg)(int regno);
+ void (*set_debugreg)(int regno, unsigned long value);
+
+ void (*clts)(void);
+
+ unsigned long (*read_cr0)(void);
+ void (*write_cr0)(unsigned long);
+
+ unsigned long (*read_cr4_safe)(void);
+ unsigned long (*read_cr4)(void);
+ void (*write_cr4)(unsigned long);
+
+#ifdef CONFIG_X86_64
+ unsigned long (*read_cr8)(void);
+ void (*write_cr8)(unsigned long);
+#endif
+
+ /* Segment descriptor handling */
+ void (*load_tr_desc)(void);
+ void (*load_gdt)(const struct desc_ptr *);
+ void (*load_idt)(const struct desc_ptr *);
+ void (*store_gdt)(struct desc_ptr *);
+ void (*store_idt)(struct desc_ptr *);
+ void (*set_ldt)(const void *desc, unsigned entries);
+ unsigned long (*store_tr)(void);
+ void (*load_tls)(struct thread_struct *t, unsigned int cpu);
+#ifdef CONFIG_X86_64
+ void (*load_gs_index)(unsigned int idx);
+#endif
+ void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
+ const void *desc);
+ void (*write_gdt_entry)(struct desc_struct *,
+ int entrynum, const void *desc, int size);
+ void (*write_idt_entry)(gate_desc *,
+ int entrynum, const gate_desc *gate);
+ void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
+ void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
+
+ void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+
+ void (*set_iopl_mask)(unsigned mask);
+
+ void (*wbinvd)(void);
+ void (*io_delay)(void);
+
+ /* cpuid emulation, mostly so that caps bits can be disabled */
+ void (*cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ /* MSR, PMC and TSC operations.
+ err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (*read_msr_amd)(unsigned int msr, int *err);
+ u64 (*read_msr)(unsigned int msr, int *err);
+ int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
+
+ u64 (*read_tsc)(void);
+ u64 (*read_pmc)(int counter);
+ unsigned long long (*read_tscp)(unsigned int *aux);
+
+ /*
+ * Atomically enable interrupts and return to userspace. This
+ * is only ever used to return to 32-bit processes; in a
+ * 64-bit kernel, it's used for 32-on-64 compat processes, but
+ * never native 64-bit processes. (Jump, not call.)
+ */
+ void (*irq_enable_sysexit)(void);
+
+ /*
+ * Switch to usermode gs and return to 64-bit usermode using
+ * sysret. Only used in 64-bit kernels to return to 64-bit
+ * processes. Usermode register state, including %rsp, must
+ * already be restored.
+ */
+ void (*usergs_sysret64)(void);
+
+ /*
+ * Switch to usermode gs and return to 32-bit usermode using
+ * sysret. Used to return to 32-on-64 compat processes.
+ * Other usermode register state, including %esp, must already
+ * be restored.
+ */
+ void (*usergs_sysret32)(void);
+
+ /* Normal iret.
Jump to this with the standard iret stack + frame set up. */ + void (*iret)(void); + + void (*swapgs)(void); + + struct pv_lazy_ops lazy_mode; +}; + +struct pv_irq_ops { + void (*init_IRQ)(void); + + /* + * Get/set interrupt state. save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. + */ + unsigned long (*save_fl)(void); + void (*restore_fl)(unsigned long); + void (*irq_disable)(void); + void (*irq_enable)(void); + void (*safe_halt)(void); + void (*halt)(void); + +#ifdef CONFIG_X86_64 + void (*adjust_exception_frame)(void); +#endif +}; + +struct pv_apic_ops { +#ifdef CONFIG_X86_LOCAL_APIC + void (*setup_boot_clock)(void); + void (*setup_secondary_clock)(void); + + void (*startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); +#endif +}; + +struct pv_mmu_ops { + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); + + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); + + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. + */ + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + + + /* TLB operations */ + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, + unsigned long va); + + /* Hooks for allocating and freeing a pagetable top-level */ + int (*pgd_alloc)(struct mm_struct *mm); + void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); + + /* + * Hooks for allocating/releasing pagetable pages when they're + * attached to a pagetable + */ + void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); + void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*release_pte)(unsigned long pfn); + void (*release_pmd)(unsigned long pfn); + void (*release_pud)(unsigned long pfn); + + /* Pagetable manipulation functions */ + void (*set_pte)(pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*pte_update)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + + pteval_t (*pte_val)(pte_t); + pteval_t (*pte_flags)(pte_t); + pte_t (*make_pte)(pteval_t pte); + + pgdval_t (*pgd_val)(pgd_t); + pgd_t (*make_pgd)(pgdval_t pgd); + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + void (*pte_clear)(struct mm_struct 
*mm, unsigned long addr, + pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); + +#endif /* CONFIG_X86_PAE */ + + void (*set_pud)(pud_t *pudp, pud_t pudval); + + pmdval_t (*pmd_val)(pmd_t); + pmd_t (*make_pmd)(pmdval_t pmd); + +#if PAGETABLE_LEVELS == 4 + pudval_t (*pud_val)(pud_t); + pud_t (*make_pud)(pudval_t pud); + + void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); +#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* PAGETABLE_LEVELS >= 3 */ + +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif + + struct pv_lazy_ops lazy_mode; + + /* dom0 ops */ + + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + unsigned long phys, pgprot_t flags); +}; + +struct raw_spinlock; +struct pv_lock_ops { + int (*spin_is_locked)(struct raw_spinlock *lock); + int (*spin_is_contended)(struct raw_spinlock *lock); + void (*spin_lock)(struct raw_spinlock *lock); + void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct raw_spinlock *lock); + void (*spin_unlock)(struct raw_spinlock *lock); +}; + +/* This contains all the paravirt structures: we get a convenient + * number for each function using the offset which we use to indicate + * what to patch. */ +struct paravirt_patch_template { + struct pv_init_ops pv_init_ops; + struct pv_time_ops pv_time_ops; + struct pv_cpu_ops pv_cpu_ops; + struct pv_irq_ops pv_irq_ops; + struct pv_apic_ops pv_apic_ops; + struct pv_mmu_ops pv_mmu_ops; + struct pv_lock_ops pv_lock_ops; +}; + +extern struct pv_info pv_info; +extern struct pv_init_ops pv_init_ops; +extern struct pv_time_ops pv_time_ops; +extern struct pv_cpu_ops pv_cpu_ops; +extern struct pv_irq_ops pv_irq_ops; +extern struct pv_apic_ops pv_apic_ops; +extern struct pv_mmu_ops pv_mmu_ops; +extern struct pv_lock_ops pv_lock_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "m" (op) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. */ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +/* Simple instruction patching code. 
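Because every hook in paravirt_patch_template is pointer-sized, PARAVIRT_PATCH() above turns a member name into a dense slot index (pv_init_ops.patch is slot 0), and an index maps back to a hook by plain pointer arithmetic. A minimal sketch of the inverse (helper hypothetical):

/* Sketch: recover a hook's address from its PARAVIRT_PATCH slot. */
static void *hook_addr(struct paravirt_patch_template *t, unsigned type)
{
	return *((void **)t + type);
}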
 */
+#define DEF_NATIVE(ops, name, code) \
+ extern const char start_##ops##_##name[], end_##ops##_##name[]; \
+ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+
+unsigned paravirt_patch_nop(void);
+unsigned paravirt_patch_ignore(unsigned len);
+unsigned paravirt_patch_call(void *insnbuf,
+ const void *target, u16 tgt_clobbers,
+ unsigned long addr, u16 site_clobbers,
+ unsigned len);
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+ unsigned long addr, unsigned len);
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len);
+
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+ const char *start, const char *end);
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len);
+
+int paravirt_disable_iospace(void);
+
+/*
+ * This generates an indirect call based on the operation type number.
+ * The type number, computed in PARAVIRT_PATCH, is derived from the
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
+ */
+#define PARAVIRT_CALL "call *%[paravirt_opptr];"
+
+/*
+ * These macros are intended to wrap calls through one of the paravirt
+ * ops structs, so that they can be later identified and patched at
+ * runtime.
+ *
+ * Normally, a call to a pv_op function is a simple indirect call:
+ * (pv_op_struct.operations)(args...).
+ *
+ * Unfortunately, this is a relatively slow operation for modern CPUs,
+ * because the CPU cannot necessarily determine the destination
+ * address in advance. In this case, the address is a runtime constant, so at
+ * the very least we can patch the call to be a simple direct call, or,
+ * ideally, patch an inline implementation into the callsite. (Direct
+ * calls are essentially free, because the call and return addresses
+ * are completely predictable.)
+ *
+ * For i386, these macros rely on the standard gcc "regparm(3)" calling
+ * convention, in which the first three arguments are placed in %eax,
+ * %edx, %ecx (in that order), and the remaining arguments are placed
+ * on the stack. All caller-save registers (eax,edx,ecx) are expected
+ * to be modified (either clobbered or used for return values).
+ * X86_64, on the other hand, already specifies a register-based calling
+ * convention, returning in %rax, with parameters passed in %rdi, %rsi,
+ * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
+ * special handling for dealing with 4 arguments, unlike i386.
+ * However, x86_64 also has to clobber all caller-saved registers, which
+ * unfortunately are quite a few (r8 - r11).
+ *
+ * The call instruction itself is marked by placing its start address
+ * and size into the .parainstructions section, so that
+ * apply_paravirt() in arch/x86/kernel/alternative.c can do the
+ * appropriate patching under the control of the backend pv_init_ops
+ * implementation.
+ *
+ * Unfortunately there's no way to get gcc to generate the args setup
+ * for the call, and then allow the call itself to be generated by an
+ * inline asm. Because of this, we must do the complete arg setup and
+ * return value handling from within these macros. This is fairly
+ * cumbersome.
+ *
+ * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
+ * It could be extended to more arguments, but there would be little
+ * to be gained from that. For each number of arguments, there are
+ * the two VCALL and CALL variants for void and non-void functions.
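For the native backend, DEF_NATIVE (defined above) is what supplies the raw instruction bytes that can be patched inline at such call sites; representative uses for two one-byte irq hooks:

/* Sketch: native inline sequences for two tiny irq hooks. */
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");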
+ * + * When there is a return value, the invoker of the macro must specify + * the return type. The macro then uses sizeof() on that type to + * determine whether its a 32 or 64 bit value, and places the return + * in the right register(s) (just %eax for 32-bit, and %edx:%eax for + * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * the return value size. + * + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments + * in low,high order + * + * Small structures are passed and returned in registers. The macro + * calling convention can't directly deal with this, so the wrapper + * functions must do this. + * + * These PVOP_* macros are only defined within this header. This + * means that all uses must be wrapped in inline functions. This also + * makes sure the incoming and outgoing types are always correct. + */ +#ifdef CONFIG_X86_32 +#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS +#define EXTRA_CLOBBERS +#define VEXTRA_CLOBBERS +#else +#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) + +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" +#endif + +#ifdef CONFIG_PARAVIRT_DEBUG +#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#else +#define PVOP_TEST_NULL(op) ((void)op) +#endif + +#define __PVOP_CALL(rettype, op, pre, post, ...) \ + ({ \ + rettype __ret; \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + /* This is 32-bit specific, but is okay in 64-bit */ \ + /* since this condition will never hold */ \ + if (sizeof(rettype) > sizeof(unsigned long)) { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : PVOP_CALL_CLOBBERS \ + : paravirt_type(op), \ + paravirt_clobber(CLBR_ANY), \ + ##__VA_ARGS__ \ + : "memory", "cc" EXTRA_CLOBBERS); \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : PVOP_CALL_CLOBBERS \ + : paravirt_type(op), \ + paravirt_clobber(CLBR_ANY), \ + ##__VA_ARGS__ \ + : "memory", "cc" EXTRA_CLOBBERS); \ + __ret = (rettype)__eax; \ + } \ + __ret; \ + }) +#define __PVOP_VCALL(op, pre, post, ...) 
\ + ({ \ + PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : PVOP_VCALL_CLOBBERS \ + : paravirt_type(op), \ + paravirt_clobber(CLBR_ANY), \ + ##__VA_ARGS__ \ + : "memory", "cc" VEXTRA_CLOBBERS); \ + }) + +#define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") +#define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + +#define PVOP_CALL1(rettype, op, arg1) \ + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1))) +#define PVOP_VCALL1(op, arg1) \ + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1))) + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ + "1" ((unsigned long)(arg2))) +#define PVOP_VCALL2(op, arg1, arg2) \ + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ + "1" ((unsigned long)(arg2))) + +#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) +#define PVOP_VCALL3(op, arg1, arg2, arg3) \ + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) + +/* This is the only difference in x86_64. We can make it much simpler */ +#ifdef CONFIG_X86_32 +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#else +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ + "3"((unsigned long)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ + "3"((unsigned long)(arg4))) +#endif + +static inline int paravirt_enabled(void) +{ + return pv_info.paravirt_enabled; +} + +static inline void load_sp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); +} + +#define ARCH_SETUP pv_init_ops.arch_setup(); +static inline unsigned long get_wallclock(void) +{ + return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock); +} + +static inline int set_wallclock(unsigned long nowtime) +{ + return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); +} + +static inline void (*choose_time_init(void))(void) +{ + return pv_time_ops.time_init; +} + +/* The paravirtualized CPUID instruction. 
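Every use of the PVOP_* macros is wrapped in a small typed inline, as the remainder of this header does throughout: the wrapper keeps argument and return types checked while the call site itself stays patchable. The shape, shown against a real hook with a hypothetical wrapper name:

/* Sketch: the standard typed wrapper around a PVOP call. */
static inline unsigned long my_get_debugreg(int reg)
{
	return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
}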
 */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+static inline unsigned long paravirt_get_debugreg(int reg)
+{
+ return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
+}
+#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
+static inline void set_debugreg(unsigned long val, int reg)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
+}
+
+static inline void clts(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.clts);
+}
+
+static inline unsigned long read_cr0(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
+}
+static inline unsigned long read_cr4_safe(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
+}
+
+static inline void write_cr4(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_cr8(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
+}
+#endif
+
+static inline void raw_safe_halt(void)
+{
+ PVOP_VCALL0(pv_irq_ops.safe_halt);
+}
+
+static inline void halt(void)
+{
+ /* unlike safe_halt(), halt() must not re-enable interrupts */
+ PVOP_VCALL0(pv_irq_ops.halt);
+}
+
+static inline void wbinvd(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.wbinvd);
+}
+
+#define get_kernel_rpl() (pv_info.kernel_rpl)
+
+static inline u64 paravirt_read_msr(unsigned msr, int *err)
+{
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
+}
+static inline u64 paravirt_read_msr_amd(unsigned msr, int *err)
+{
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err);
+}
+static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
+{
+ return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
+}
+
+/* These should all do BUG_ON(_err), but our headers are too tangled.
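paravirt_read_msr() reports a faulting MSR access through its err out-parameter rather than by oopsing, which is what the *_safe variants below build on. A minimal sketch of a checked read (0xc0000080 is MSR_EFER; the helper is hypothetical):

/* Sketch: fault-checked MSR read through the pv hook. */
static inline u64 read_efer_checked(void)
{
	int err;
	u64 efer = paravirt_read_msr(0xc0000080, &err);

	if (err)
		printk("EFER read faulted\n");	/* efer is not valid */
	return efer;
}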
*/ +#define rdmsr(msr, val1, val2) \ +do { \ + int _err; \ + u64 _l = paravirt_read_msr(msr, &_err); \ + val1 = (u32)_l; \ + val2 = _l >> 32; \ +} while (0) + +#define wrmsr(msr, val1, val2) \ +do { \ + paravirt_write_msr(msr, val1, val2); \ +} while (0) + +#define rdmsrl(msr, val) \ +do { \ + int _err; \ + val = paravirt_read_msr(msr, &_err); \ +} while (0) + +#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) +#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) + +/* rdmsr with exception handling */ +#define rdmsr_safe(msr, a, b) \ +({ \ + int _err; \ + u64 _l = paravirt_read_msr(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; \ +}) + +static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) +{ + int err; + + *p = paravirt_read_msr(msr, &err); + return err; +} +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + int err; + + *p = paravirt_read_msr_amd(msr, &err); + return err; +} + +static inline u64 paravirt_read_tsc(void) +{ + return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); +} + +#define rdtscl(low) \ +do { \ + u64 _l = paravirt_read_tsc(); \ + low = (int)_l; \ +} while (0) + +#define rdtscll(val) (val = paravirt_read_tsc()) + +static inline unsigned long long paravirt_sched_clock(void) +{ + return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); +} +#define calibrate_tsc() (pv_time_ops.get_tsc_khz()) + +static inline unsigned long long paravirt_read_pmc(int counter) +{ + return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); +} + +#define rdpmc(counter, low, high) \ +do { \ + u64 _l = paravirt_read_pmc(counter); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while (0) + +static inline unsigned long long paravirt_rdtscp(unsigned int *aux) +{ + return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); +} + +#define rdtscp(low, high, aux) \ +do { \ + int __aux; \ + unsigned long __val = paravirt_rdtscp(&__aux); \ + (low) = (u32)__val; \ + (high) = (u32)(__val >> 32); \ + (aux) = __aux; \ +} while (0) + +#define rdtscpll(val, aux) \ +do { \ + unsigned long __aux; \ + val = paravirt_rdtscp(&__aux); \ + (aux) = __aux; \ +} while (0) + +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries); +} + +static inline void load_TR_desc(void) +{ + PVOP_VCALL0(pv_cpu_ops.load_tr_desc); +} +static inline void load_gdt(const struct desc_ptr *dtr) +{ + PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr); +} +static inline void load_idt(const struct desc_ptr *dtr) +{ + PVOP_VCALL1(pv_cpu_ops.load_idt, dtr); +} +static inline void set_ldt(const void *addr, unsigned entries) +{ + PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); +} +static inline void store_gdt(struct desc_ptr *dtr) +{ + PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); +} +static inline void store_idt(struct desc_ptr *dtr) +{ + PVOP_VCALL1(pv_cpu_ops.store_idt, dtr); +} +static inline unsigned long paravirt_store_tr(void) +{ + return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr); +} +#define store_tr(tr) ((tr) = paravirt_store_tr()) +static inline void load_TLS(struct thread_struct *t, unsigned cpu) +{ + PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu); +} + +#ifdef CONFIG_X86_64 +static inline void load_gs_index(unsigned int gs) +{ + PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs); +} +#endif + +static inline void write_ldt_entry(struct desc_struct *dt, int entry, + const void *desc) +{ + 
PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc); +} + +static inline void write_gdt_entry(struct desc_struct *dt, int entry, + void *desc, int type) +{ + PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type); +} + +static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) +{ + PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g); +} +static inline void set_iopl_mask(unsigned mask) +{ + PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask); +} + +/* The paravirtualized I/O functions */ +static inline void slow_down_io(void) +{ + pv_cpu_ops.io_delay(); +#ifdef REALLY_SLOW_IO + pv_cpu_ops.io_delay(); + pv_cpu_ops.io_delay(); + pv_cpu_ops.io_delay(); +#endif +} + +#ifdef CONFIG_X86_LOCAL_APIC +static inline void setup_boot_clock(void) +{ + PVOP_VCALL0(pv_apic_ops.setup_boot_clock); +} + +static inline void setup_secondary_clock(void) +{ + PVOP_VCALL0(pv_apic_ops.setup_secondary_clock); +} +#endif + +static inline void paravirt_post_allocator_init(void) +{ + if (pv_init_ops.post_allocator_init) + (*pv_init_ops.post_allocator_init)(); +} + +static inline void paravirt_pagetable_setup_start(pgd_t *base) +{ + (*pv_mmu_ops.pagetable_setup_start)(base); +} + +static inline void paravirt_pagetable_setup_done(pgd_t *base) +{ + (*pv_mmu_ops.pagetable_setup_done)(base); +} + +#ifdef CONFIG_SMP +static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, + unsigned long start_esp) +{ + PVOP_VCALL3(pv_apic_ops.startup_ipi_hook, + phys_apicid, start_eip, start_esp); +} +#endif + +static inline void paravirt_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); +} + +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ + PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); +} + +static inline void __flush_tlb(void) +{ + PVOP_VCALL0(pv_mmu_ops.flush_tlb_user); +} +static inline void __flush_tlb_global(void) +{ + PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel); +} +static inline void __flush_tlb_single(unsigned long addr) +{ + PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr); +} + +static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, + unsigned long va) +{ + PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); +} + +static inline int paravirt_pgd_alloc(struct mm_struct *mm) +{ + return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm); +} + +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd); +} + +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) +{ + PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn); +} +static inline void paravirt_release_pte(unsigned long pfn) +{ + PVOP_VCALL1(pv_mmu_ops.release_pte, pfn); +} + +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +{ + PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); +} + +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, + unsigned long start, unsigned long count) +{ + PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count); +} +static inline void paravirt_release_pmd(unsigned long pfn) +{ + PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); +} + +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn); +} +static inline void paravirt_release_pud(unsigned long pfn) +{ + 
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); +} + +#ifdef CONFIG_HIGHPTE +static inline void *kmap_atomic_pte(struct page *page, enum km_type type) +{ + unsigned long ret; + ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); + return (void *)ret; +} +#endif + +static inline void pte_update(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); +} + +static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); +} + +static inline pte_t __pte(pteval_t val) +{ + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) + ret = PVOP_CALL2(pteval_t, + pv_mmu_ops.make_pte, + val, (u64)val >> 32); + else + ret = PVOP_CALL1(pteval_t, + pv_mmu_ops.make_pte, + val); + + return (pte_t) { .pte = ret }; +} + +static inline pteval_t pte_val(pte_t pte) +{ + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) + ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, + pte.pte, (u64)pte.pte >> 32); + else + ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val, + pte.pte); + + return ret; +} + +static inline pteval_t pte_flags(pte_t pte) +{ + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) + ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags, + pte.pte, (u64)pte.pte >> 32); + else + ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, + pte.pte); + +#ifdef CONFIG_PARAVIRT_DEBUG + BUG_ON(ret & PTE_PFN_MASK); +#endif + return ret; +} + +static inline pgd_t __pgd(pgdval_t val) +{ + pgdval_t ret; + + if (sizeof(pgdval_t) > sizeof(long)) + ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, + val, (u64)val >> 32); + else + ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, + val); + + return (pgd_t) { ret }; +} + +static inline pgdval_t pgd_val(pgd_t pgd) +{ + pgdval_t ret; + + if (sizeof(pgdval_t) > sizeof(long)) + ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd, (u64)pgd.pgd >> 32); + else + ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd); + + return ret; +} + +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pteval_t ret; + + ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start, + mm, addr, ptep); + + return (pte_t) { .pte = ret }; +} + +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (sizeof(pteval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte); + else + PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit, + mm, addr, ptep, pte.pte); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + if (sizeof(pteval_t) > sizeof(long)) + PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, + pte.pte, (u64)pte.pte >> 32); + else + PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, + pte.pte); +} + +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (sizeof(pteval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pte_at(mm, addr, ptep, pte); + else + PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + pmdval_t val = native_pmd_val(pmd); + + if (sizeof(pmdval_t) > sizeof(long)) + PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32); + else + PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); +} + +#if PAGETABLE_LEVELS >= 3 +static inline pmd_t __pmd(pmdval_t val) +{ + pmdval_t ret; + + if (sizeof(pmdval_t) > sizeof(long)) + 
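+		/* PAE: pass the 64-bit value as two 32-bit halves */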
ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, + val, (u64)val >> 32); + else + ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, + val); + + return (pmd_t) { ret }; +} + +static inline pmdval_t pmd_val(pmd_t pmd) +{ + pmdval_t ret; + + if (sizeof(pmdval_t) > sizeof(long)) + ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd, (u64)pmd.pmd >> 32); + else + ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd); + + return ret; +} + +static inline void set_pud(pud_t *pudp, pud_t pud) +{ + pudval_t val = native_pud_val(pud); + + if (sizeof(pudval_t) > sizeof(long)) + PVOP_VCALL3(pv_mmu_ops.set_pud, pudp, + val, (u64)val >> 32); + else + PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, + val); +} +#if PAGETABLE_LEVELS == 4 +static inline pud_t __pud(pudval_t val) +{ + pudval_t ret; + + if (sizeof(pudval_t) > sizeof(long)) + ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, + val, (u64)val >> 32); + else + ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, + val); + + return (pud_t) { ret }; +} + +static inline pudval_t pud_val(pud_t pud) +{ + pudval_t ret; + + if (sizeof(pudval_t) > sizeof(long)) + ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val, + pud.pud, (u64)pud.pud >> 32); + else + ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val, + pud.pud); + + return ret; +} + +static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + pgdval_t val = native_pgd_val(pgd); + + if (sizeof(pgdval_t) > sizeof(long)) + PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp, + val, (u64)val >> 32); + else + PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, + val); +} + +static inline void pgd_clear(pgd_t *pgdp) +{ + set_pgd(pgdp, __pgd(0)); +} + +static inline void pud_clear(pud_t *pudp) +{ + set_pud(pudp, __pud(0)); +} + +#endif /* PAGETABLE_LEVELS == 4 */ + +#endif /* PAGETABLE_LEVELS >= 3 */ + +#ifdef CONFIG_X86_PAE +/* Special-case pte-setting operations for PAE, which can't update a + 64-bit pte atomically */ +static inline void set_pte_atomic(pte_t *ptep, pte_t pte) +{ + PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep, + pte.pte, pte.pte >> 32); +} + +static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* 5 arg words */ + pv_mmu_ops.set_pte_present(mm, addr, ptep, pte); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep); +} + +static inline void pmd_clear(pmd_t *pmdp) +{ + PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp); +} +#else /* !CONFIG_X86_PAE */ +static inline void set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_pte(ptep, pte); +} + +static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_pte(ptep, pte); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + set_pte_at(mm, addr, ptep, __pte(0)); +} + +static inline void pmd_clear(pmd_t *pmdp) +{ + set_pmd(pmdp, __pmd(0)); +} +#endif /* CONFIG_X86_PAE */ + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void); +void paravirt_enter_lazy_cpu(void); +void paravirt_leave_lazy_cpu(void); +void paravirt_enter_lazy_mmu(void); +void paravirt_leave_lazy_mmu(void); +void paravirt_leave_lazy(enum paravirt_lazy_mode mode); + +#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE +static inline void arch_enter_lazy_cpu_mode(void) +{ + PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); +} + +static inline void arch_leave_lazy_cpu_mode(void) +{ + 
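+	/* End CPU-state batching; the backend flushes anything queued */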
PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); +} + +static inline void arch_flush_lazy_cpu_mode(void) +{ + if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) { + arch_leave_lazy_cpu_mode(); + arch_enter_lazy_cpu_mode(); + } +} + + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +static inline void arch_enter_lazy_mmu_mode(void) +{ + PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); +} + +static inline void arch_flush_lazy_mmu_mode(void) +{ + if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } +} + +static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, + unsigned long phys, pgprot_t flags) +{ + pv_mmu_ops.set_fixmap(idx, phys, flags); +} + +void _paravirt_nop(void); +#define paravirt_nop ((void *)_paravirt_nop) + +void paravirt_use_bytelocks(void); + +#ifdef CONFIG_SMP + +static inline int __raw_spin_is_locked(struct raw_spinlock *lock) +{ + return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); +} + +static inline int __raw_spin_is_contended(struct raw_spinlock *lock) +{ + return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); +} + +static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) +{ + PVOP_VCALL1(pv_lock_ops.spin_lock, lock); +} + +static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock, + unsigned long flags) +{ + PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); +} + +static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) +{ + return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); +} + +static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) +{ + PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); +} + +#endif + +/* These all sit in the .parainstructions section to tell us what to patch. */ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +extern struct paravirt_patch_site __parainstructions[], + __parainstructions_end[]; + +#ifdef CONFIG_X86_32 +#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;" +#define PV_RESTORE_REGS "popl %%edx; popl %%ecx" +#define PV_FLAGS_ARG "0" +#define PV_EXTRA_CLOBBERS +#define PV_VEXTRA_CLOBBERS +#else +/* We save some registers, but all of them, that's too much. 
We clobber all + * caller saved registers but the argument parameter */ +#define PV_SAVE_REGS "pushq %%rdi;" +#define PV_RESTORE_REGS "popq %%rdi;" +#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi" +#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi" +#define PV_FLAGS_ARG "D" +#endif + +static inline unsigned long __raw_local_save_flags(void) +{ + unsigned long f; + + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) + : "=a"(f) + : paravirt_type(pv_irq_ops.save_fl), + paravirt_clobber(CLBR_EAX) + : "memory", "cc" PV_VEXTRA_CLOBBERS); + return f; +} + +static inline void raw_local_irq_restore(unsigned long f) +{ + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) + : "=a"(f) + : PV_FLAGS_ARG(f), + paravirt_type(pv_irq_ops.restore_fl), + paravirt_clobber(CLBR_EAX) + : "memory", "cc" PV_EXTRA_CLOBBERS); +} + +static inline void raw_local_irq_disable(void) +{ + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) + : + : paravirt_type(pv_irq_ops.irq_disable), + paravirt_clobber(CLBR_EAX) + : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); +} + +static inline void raw_local_irq_enable(void) +{ + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) + : + : paravirt_type(pv_irq_ops.irq_enable), + paravirt_clobber(CLBR_EAX) + : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long f; + + f = __raw_local_save_flags(); + raw_local_irq_disable(); + return f; +} + + +/* Make sure as little as possible of this mess escapes. */ +#undef PARAVIRT_CALL +#undef __PVOP_CALL +#undef __PVOP_VCALL +#undef PVOP_VCALL0 +#undef PVOP_CALL0 +#undef PVOP_VCALL1 +#undef PVOP_CALL1 +#undef PVOP_VCALL2 +#undef PVOP_CALL2 +#undef PVOP_VCALL3 +#undef PVOP_CALL3 +#undef PVOP_VCALL4 +#undef PVOP_CALL4 + +#else /* __ASSEMBLY__ */ + +#define _PVSITE(ptype, clobbers, ops, word, algn) \ +771:; \ + ops; \ +772:; \ + .pushsection .parainstructions,"a"; \ + .align algn; \ + word 771b; \ + .byte ptype; \ + .byte 772b-771b; \ + .short clobbers; \ + .popsection + + +#ifdef CONFIG_X86_64 +#define PV_SAVE_REGS \ + push %rax; \ + push %rcx; \ + push %rdx; \ + push %rsi; \ + push %rdi; \ + push %r8; \ + push %r9; \ + push %r10; \ + push %r11 +#define PV_RESTORE_REGS \ + pop %r11; \ + pop %r10; \ + pop %r9; \ + pop %r8; \ + pop %rdi; \ + pop %rsi; \ + pop %rdx; \ + pop %rcx; \ + pop %rax +#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) +#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) +#define PARA_INDIRECT(addr) *addr(%rip) +#else +#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx +#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax +#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) +#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) +#define PARA_INDIRECT(addr) *%cs:addr +#endif + +#define INTERRUPT_RETURN \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) + +#define DISABLE_INTERRUPTS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ + PV_SAVE_REGS; \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ + PV_RESTORE_REGS;) \ + +#define ENABLE_INTERRUPTS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ + PV_SAVE_REGS; \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ + PV_RESTORE_REGS;) + 
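As an aside, the C-side irq wrappers defined earlier in this header compose into the usual flags-save pattern. A minimal sketch (the critical-section body is illustrative):

	unsigned long flags;

	flags = __raw_local_irq_save();	/* save_fl pvop, then irq_disable */
	/* ... code that must run with interrupts off ... */
	raw_local_irq_restore(flags);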
+#define USERGS_SYSRET32 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32)) + +#ifdef CONFIG_X86_32 +#define GET_CR0_INTO_EAX \ + push %ecx; push %edx; \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ + pop %edx; pop %ecx + +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) + + +#else /* !CONFIG_X86_32 */ + +/* + * If swapgs is used while the userspace stack is still current, + * there's no way to call a pvop. The PV replacement *must* be + * inlined, or the swapgs instruction must be trapped and emulated. + */ +#define SWAPGS_UNSAFE_STACK \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ + swapgs) + +#define SWAPGS \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ + PV_SAVE_REGS; \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ + PV_RESTORE_REGS \ + ) + +#define GET_CR2_INTO_RCX \ + call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \ + movq %rax, %rcx; \ + xorq %rax, %rax; + +#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ + CLBR_NONE, \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) + +#define USERGS_SYSRET64 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + +#define ENABLE_INTERRUPTS_SYSEXIT32 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) +#endif /* CONFIG_X86_32 */ + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ +#endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h new file mode 100644 index 000000000000..3c4ffeb467e9 --- /dev/null +++ b/arch/x86/include/asm/parport.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_PARPORT_H +#define _ASM_X86_PARPORT_H + +static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma); +static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma) +{ + return parport_pc_find_isa_ports(autoirq, autodma); +} + +#endif /* _ASM_X86_PARPORT_H */ diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h new file mode 100644 index 000000000000..b8493b3b9890 --- /dev/null +++ b/arch/x86/include/asm/pat.h @@ -0,0 +1,22 @@ +#ifndef _ASM_X86_PAT_H +#define _ASM_X86_PAT_H + +#include <linux/types.h> + +#ifdef CONFIG_X86_PAT +extern int pat_enabled; +extern void validate_pat_support(struct cpuinfo_x86 *c); +#else +static const int pat_enabled; +static inline void validate_pat_support(struct cpuinfo_x86 *c) { } +#endif + +extern void pat_init(void); + +extern int reserve_memtype(u64 start, u64 end, + unsigned long req_type, unsigned long *ret_type); +extern int free_memtype(u64 start, u64 end); + +extern void pat_disable(char *reason); + +#endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h new file mode 100644 index 000000000000..b1e7a45d868a --- /dev/null +++ b/arch/x86/include/asm/pci-direct.h @@ -0,0 +1,21 @@ +#ifndef _ASM_X86_PCI_DIRECT_H +#define _ASM_X86_PCI_DIRECT_H + +#include <linux/types.h> + +/* Direct PCI access. This is used for PCI accesses in early boot before + the PCI subsystem works. 
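+   Illustrative use: config dword 0 of a device is its vendor/device ID,
+   so "u32 id = read_pci_config(0, 0, 0, 0);" reads the ID of 00:00.0.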
*/ + +extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset); +extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset); +extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset); +extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val); +extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val); +extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val); + +extern int early_pci_allowed(void); + +extern unsigned int pci_early_dump_regs; +extern void early_dump_pci_device(u8 bus, u8 slot, u8 func); +extern void early_dump_pci_devices(void); +#endif /* _ASM_X86_PCI_DIRECT_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h new file mode 100644 index 000000000000..647781298e7e --- /dev/null +++ b/arch/x86/include/asm/pci.h @@ -0,0 +1,116 @@ +#ifndef _ASM_X86_PCI_H +#define _ASM_X86_PCI_H + +#include <linux/mm.h> /* for struct page */ +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <asm/scatterlist.h> +#include <asm/io.h> + +#ifdef __KERNEL__ + +struct pci_sysdata { + int domain; /* PCI domain */ + int node; /* NUMA node */ +#ifdef CONFIG_X86_64 + void *iommu; /* IOMMU private data */ +#endif +}; + +extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; + +/* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, + int node); +extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); + +static inline int pci_domain_nr(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + return sd->domain; +} + +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} + + +/* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ + +#ifdef CONFIG_PCI +extern unsigned int pcibios_assign_all_busses(void); +#else +#define pcibios_assign_all_busses() 0 +#endif +#define pcibios_scan_all_fns(a, b) 0 + +extern unsigned long pci_mem_start; +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM (pci_mem_start) + +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 + +void pcibios_config_init(void); +struct pci_bus *pcibios_scan_root(int bus); + +void pcibios_set_master(struct pci_dev *dev); +void pcibios_penalize_isa_irq(int irq, int active); +struct irq_routing_table *pcibios_get_irq_routing_table(void); +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); + + +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine); + + +#ifdef CONFIG_PCI +extern void early_quirks(void); +static inline void pci_dma_burst_advice(struct pci_dev *pdev, + enum pci_dma_burst_strategy *strat, + unsigned long *strategy_parameter) +{ + *strat = PCI_DMA_BURST_INFINITY; + *strategy_parameter = ~0UL; +} +#else +static inline void early_quirks(void) { } +#endif + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_32 +# include "pci_32.h" +#else +# include "pci_64.h" +#endif + +/* implement the pci_ DMA API in terms of the generic device dma_ one */ +#include <asm-generic/pci-dma-compat.h> + +/* generic pci stuff */ +#include <asm-generic/pci.h> + +#ifdef CONFIG_NUMA +/* Returns the node based on pci bus */ +static inline int __pcibus_to_node(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + + return 
sd->node;
+}
+
+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
+{
+	return node_to_cpumask(__pcibus_to_node(bus));
+}
+#endif
+
+#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_32.h b/arch/x86/include/asm/pci_32.h
new file mode 100644
index 000000000000..6f1213a6ef4f
--- /dev/null
+++ b/arch/x86/include/asm/pci_32.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_PCI_32_H
+#define _ASM_X86_PCI_32_H
+
+
+#ifdef __KERNEL__
+
+
+/* Dynamic DMA mapping stuff.
+ * i386 has everything mapped statically.
+ */
+
+struct pci_dev;
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS	(1)
+
+/* pci_unmap_{page,single} is a nop, so... */
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	dma_addr_t ADDR_NAME[0];
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)	unsigned LEN_NAME[0];
+#define pci_unmap_addr(PTR, ADDR_NAME)	sizeof((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)	\
+	do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
+#define pci_unmap_len(PTR, LEN_NAME)	sizeof((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+	do { break; } while (pci_unmap_len(PTR, LEN_NAME))
+
+
+#endif /* __KERNEL__ */
+
+
+#endif /* _ASM_X86_PCI_32_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
new file mode 100644
index 000000000000..5b28995d664e
--- /dev/null
+++ b/arch/x86/include/asm/pci_64.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_X86_PCI_64_H
+#define _ASM_X86_PCI_64_H
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_CALGARY_IOMMU
+static inline void *pci_iommu(struct pci_bus *bus)
+{
+	struct pci_sysdata *sd = bus->sysdata;
+	return sd->iommu;
+}
+
+static inline void set_pci_iommu(struct pci_bus *bus, void *val)
+{
+	struct pci_sysdata *sd = bus->sysdata;
+	sd->iommu = val;
+}
+#endif /* CONFIG_CALGARY_IOMMU */
+
+extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
+			      int reg, int len, u32 *value);
+extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
+			       int reg, int len, u32 value);
+
+extern void dma32_reserve_bootmem(void);
+extern void pci_iommu_alloc(void);
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions.
+ *
+ * On AMD64 this mostly holds, but we set it to zero if a hardware
+ * IOMMU (gart) or software IOMMU (swiotlb) is available.
+ */
+#define PCI_DMA_BUS_IS_PHYS	(dma_ops->is_phys)
+
+#if defined(CONFIG_GART_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	\
+	dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)		\
+	__u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME)		\
+	((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)	\
+	(((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME)		\
+	((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL)	\
+	(((PTR)->LEN_NAME) = (VAL))
+
+#else
+/* No IOMMU */
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
+#define pci_unmap_addr(PTR, ADDR_NAME)	(0)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)	do { } while (0)
+#define pci_unmap_len(PTR, LEN_NAME)	(0)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
+
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
new file mode 100644
index 000000000000..2fbfff88df37
--- /dev/null
+++ b/arch/x86/include/asm/pda.h
@@ -0,0 +1,137 @@
+#ifndef _ASM_X86_PDA_H
+#define _ASM_X86_PDA_H
+
+#ifndef __ASSEMBLY__
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <asm/page.h>
+
+/* Per-processor data structure. %gs points to it while the kernel runs */
+struct x8664_pda {
+	struct task_struct *pcurrent;	/* 0  Current process */
+	unsigned long data_offset;	/* 8  Per-cpu data offset from linker
+					   address */
+	unsigned long kernelstack;	/* 16 top of kernel stack for current */
+	unsigned long oldrsp;		/* 24 user rsp for system call */
+	int irqcount;			/* 32 IRQ nesting counter. Starts at -1 */
+	unsigned int cpunumber;		/* 36 Logical CPU number */
+#ifdef CONFIG_CC_STACKPROTECTOR
+	unsigned long stack_canary;	/* 40 stack canary value */
+					/* gcc-ABI: this canary MUST be at
+					   offset 40!!! */
+#endif
+	char *irqstackptr;
+	short nodenumber;		/* number of current node (32k max) */
+	short in_bootmem;		/* pda lives in bootmem */
+	unsigned int __softirq_pending;
+	unsigned int __nmi_count;	/* number of NMIs on this CPU */
+	short mmu_state;
+	short isidle;
+	struct mm_struct *active_mm;
+	unsigned apic_timer_irqs;
+	unsigned irq0_irqs;
+	unsigned irq_resched_count;
+	unsigned irq_call_count;
+	unsigned irq_tlb_count;
+	unsigned irq_thermal_count;
+	unsigned irq_threshold_count;
+	unsigned irq_spurious_count;
+} ____cacheline_aligned_in_smp;
+
+extern struct x8664_pda **_cpu_pda;
+extern void pda_init(int);
+
+#define cpu_pda(i) (_cpu_pda[i])
+
+/*
+ * There is no fast way to get the base address of the PDA; all accesses
+ * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
+ */
+extern void __bad_pda_field(void) __attribute__((noreturn));
+
+/*
+ * proxy_pda doesn't actually exist, but we tell gcc it is accessed for
+ * all PDA accesses so that it gets the read/write dependencies right.
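+ * The "+m"/"m" (_proxy_pda.field) asm operands below are what carry
+ * those dependencies; each accessor still compiles to a single
+ * %gs-relative instruction.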
+ */ +extern struct x8664_pda _proxy_pda; + +#define pda_offset(field) offsetof(struct x8664_pda, field) + +#define pda_to_op(op, field, val) \ +do { \ + typedef typeof(_proxy_pda.field) T__; \ + if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \ + switch (sizeof(_proxy_pda.field)) { \ + case 2: \ + asm(op "w %1,%%gs:%c2" : \ + "+m" (_proxy_pda.field) : \ + "ri" ((T__)val), \ + "i"(pda_offset(field))); \ + break; \ + case 4: \ + asm(op "l %1,%%gs:%c2" : \ + "+m" (_proxy_pda.field) : \ + "ri" ((T__)val), \ + "i" (pda_offset(field))); \ + break; \ + case 8: \ + asm(op "q %1,%%gs:%c2": \ + "+m" (_proxy_pda.field) : \ + "ri" ((T__)val), \ + "i"(pda_offset(field))); \ + break; \ + default: \ + __bad_pda_field(); \ + } \ +} while (0) + +#define pda_from_op(op, field) \ +({ \ + typeof(_proxy_pda.field) ret__; \ + switch (sizeof(_proxy_pda.field)) { \ + case 2: \ + asm(op "w %%gs:%c1,%0" : \ + "=r" (ret__) : \ + "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + case 4: \ + asm(op "l %%gs:%c1,%0": \ + "=r" (ret__): \ + "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + case 8: \ + asm(op "q %%gs:%c1,%0": \ + "=r" (ret__) : \ + "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + default: \ + __bad_pda_field(); \ + } \ + ret__; \ +}) + +#define read_pda(field) pda_from_op("mov", field) +#define write_pda(field, val) pda_to_op("mov", field, val) +#define add_pda(field, val) pda_to_op("add", field, val) +#define sub_pda(field, val) pda_to_op("sub", field, val) +#define or_pda(field, val) pda_to_op("or", field, val) + +/* This is not atomic against other CPUs -- CPU preemption needs to be off */ +#define test_and_clear_bit_pda(bit, field) \ +({ \ + int old__; \ + asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \ + : "=r" (old__), "+m" (_proxy_pda.field) \ + : "dIr" (bit), "i" (pda_offset(field)) : "memory");\ + old__; \ +}) + +#endif + +#define PDA_STACKOFFSET (5*8) + +#endif /* _ASM_X86_PDA_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h new file mode 100644 index 000000000000..ece72053ba63 --- /dev/null +++ b/arch/x86/include/asm/percpu.h @@ -0,0 +1,218 @@ +#ifndef _ASM_X86_PERCPU_H +#define _ASM_X86_PERCPU_H + +#ifdef CONFIG_X86_64 +#include <linux/compiler.h> + +/* Same as asm-generic/percpu.h, except that we store the per cpu offset + in the PDA. Longer term the PDA and every per cpu variable + should be just put into a single section and referenced directly + from %gs */ + +#ifdef CONFIG_SMP +#include <asm/pda.h> + +#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset) +#define __my_cpu_offset read_pda(data_offset) + +#define per_cpu_offset(x) (__per_cpu_offset(x)) + +#endif +#include <asm-generic/percpu.h> + +DECLARE_PER_CPU(struct x8664_pda, pda); + +/* + * These are supposed to be implemented as a single instruction which + * operates on the per-cpu data base segment. x86-64 doesn't have + * that yet, so this is a fairly inefficient workaround for the + * meantime. The single instruction is atomic with respect to + * preemption and interrupts, so we need to explicitly disable + * interrupts here to achieve the same effect. However, because it + * can be used from within interrupt-disable/enable, we can't actually + * disable interrupts; disabling preemption is enough. 
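+ * (Disabling preemption pins us to one CPU, so the __get_cpu_var
+ * address stays valid; unlike the eventual single-instruction form,
+ * this workaround is not atomic against interrupts.)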
+ */ +#define x86_read_percpu(var) \ + ({ \ + typeof(per_cpu_var(var)) __tmp; \ + preempt_disable(); \ + __tmp = __get_cpu_var(var); \ + preempt_enable(); \ + __tmp; \ + }) + +#define x86_write_percpu(var, val) \ + do { \ + preempt_disable(); \ + __get_cpu_var(var) = (val); \ + preempt_enable(); \ + } while(0) + +#else /* CONFIG_X86_64 */ + +#ifdef __ASSEMBLY__ + +/* + * PER_CPU finds an address of a per-cpu variable. + * + * Args: + * var - variable name + * reg - 32bit register + * + * The resulting address is stored in the "reg" argument. + * + * Example: + * PER_CPU(cpu_gdt_descr, %ebx) + */ +#ifdef CONFIG_SMP +#define PER_CPU(var, reg) \ + movl %fs:per_cpu__##this_cpu_off, reg; \ + lea per_cpu__##var(reg), reg +#define PER_CPU_VAR(var) %fs:per_cpu__##var +#else /* ! SMP */ +#define PER_CPU(var, reg) \ + movl $per_cpu__##var, reg +#define PER_CPU_VAR(var) per_cpu__##var +#endif /* SMP */ + +#else /* ...!ASSEMBLY */ + +/* + * PER_CPU finds an address of a per-cpu variable. + * + * Args: + * var - variable name + * cpu - 32bit register containing the current CPU number + * + * The resulting address is stored in the "cpu" argument. + * + * Example: + * PER_CPU(cpu_gdt_descr, %ebx) + */ +#ifdef CONFIG_SMP + +#define __my_cpu_offset x86_read_percpu(this_cpu_off) + +/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */ +#define __percpu_seg "%%fs:" + +#else /* !SMP */ + +#define __percpu_seg "" + +#endif /* SMP */ + +#include <asm-generic/percpu.h> + +/* We can use this directly for local CPU (faster). */ +DECLARE_PER_CPU(unsigned long, this_cpu_off); + +/* For arch-specific code, we can use direct single-insn ops (they + * don't give an lvalue though). */ +extern void __bad_percpu_size(void); + +#define percpu_to_op(op, var, val) \ +do { \ + typedef typeof(var) T__; \ + if (0) { \ + T__ tmp__; \ + tmp__ = (val); \ + } \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %1,"__percpu_seg"%0" \ + : "+m" (var) \ + : "ri" ((T__)val)); \ + break; \ + case 2: \ + asm(op "w %1,"__percpu_seg"%0" \ + : "+m" (var) \ + : "ri" ((T__)val)); \ + break; \ + case 4: \ + asm(op "l %1,"__percpu_seg"%0" \ + : "+m" (var) \ + : "ri" ((T__)val)); \ + break; \ + default: __bad_percpu_size(); \ + } \ +} while (0) + +#define percpu_from_op(op, var) \ +({ \ + typeof(var) ret__; \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 2: \ + asm(op "w "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 4: \ + asm(op "l "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + ret__; \ +}) + +#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) +#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val) +#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val) +#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val) +#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) +#endif /* !__ASSEMBLY__ */ +#endif /* !CONFIG_X86_64 */ + +#ifdef CONFIG_SMP + +/* + * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu + * variables that are initialized and accessed before there are per_cpu + * areas allocated. + */ + +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue; \ + __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \ + { [0 ... 
NR_CPUS-1] = _initvalue }; \ + __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ + extern __typeof__(_type) _name##_early_map[] + +#define early_per_cpu_ptr(_name) (_name##_early_ptr) +#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) +#define early_per_cpu(_name, _cpu) \ + (early_per_cpu_ptr(_name) ? \ + early_per_cpu_ptr(_name)[_cpu] : \ + per_cpu(_name, _cpu)) + +#else /* !CONFIG_SMP */ +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name) + +#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) +#define early_per_cpu_ptr(_name) NULL +/* no early_per_cpu_map() */ + +#endif /* !CONFIG_SMP */ + +#endif /* _ASM_X86_PERCPU_H */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h new file mode 100644 index 000000000000..cb7c151a8bff --- /dev/null +++ b/arch/x86/include/asm/pgalloc.h @@ -0,0 +1,114 @@ +#ifndef _ASM_X86_PGALLOC_H +#define _ASM_X86_PGALLOC_H + +#include <linux/threads.h> +#include <linux/mm.h> /* for struct page */ +#include <linux/pagemap.h> + +static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm) +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, + unsigned long start, unsigned long count) {} +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_release_pte(unsigned long pfn) {} +static inline void paravirt_release_pmd(unsigned long pfn) {} +static inline void paravirt_release_pud(unsigned long pfn) {} +#endif + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); + +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. 
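+   Until then, the helpers below hand page-table pages straight back to
+   the page allocator.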
*/ + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, struct page *pte) +{ + __free_page(pte); +} + +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + + paravirt_alloc_pte(mm, pfn); + set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +#if PAGETABLE_LEVELS > 2 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + free_page((unsigned long)pmd); +} + +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +#ifdef CONFIG_X86_PAE +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); +#else /* !CONFIG_X86_PAE */ +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); +} +#endif /* CONFIG_X86_PAE */ + +#if PAGETABLE_LEVELS > 3 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + free_page((unsigned long)pud); +} + +extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +#endif /* _ASM_X86_PGALLOC_H */ diff --git a/arch/x86/include/asm/pgtable-2level-defs.h b/arch/x86/include/asm/pgtable-2level-defs.h new file mode 100644 index 000000000000..d77db8990eaa --- /dev/null +++ b/arch/x86/include/asm/pgtable-2level-defs.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H + +#define SHARED_KERNEL_PMD 0 + +/* + * traditional i386 two-level paging structure: + */ + +#define PGDIR_SHIFT 22 +#define PTRS_PER_PGD 1024 + +/* + * the i386 is two-level, so we don't really have any + * PMD directory physically. + */ + +#define PTRS_PER_PTE 1024 + +#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h new file mode 100644 index 000000000000..b17edfd23628 --- /dev/null +++ b/arch/x86/include/asm/pgtable-2level.h @@ -0,0 +1,79 @@ +#ifndef _ASM_X86_PGTABLE_2LEVEL_H +#define _ASM_X86_PGTABLE_2LEVEL_H + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + +/* + * Certain architectures need to do special things when PTEs + * within a page table are directly modified. Thus, the following + * hook is made available. 
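+ * (Without CONFIG_PARAVIRT, set_pte() and friends in pgtable.h map
+ * directly onto these native versions.)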
+ */ +static inline void native_set_pte(pte_t *ptep , pte_t pte) +{ + *ptep = pte; +} + +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} + +static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + native_set_pte(ptep, pte); +} + +static inline void native_set_pte_present(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pte_t pte) +{ + native_set_pte(ptep, pte); +} + +static inline void native_pmd_clear(pmd_t *pmdp) +{ + native_set_pmd(pmdp, __pmd(0)); +} + +static inline void native_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *xp) +{ + *xp = native_make_pte(0); +} + +#ifdef CONFIG_SMP +static inline pte_t native_ptep_get_and_clear(pte_t *xp) +{ + return __pte(xchg(&xp->pte_low, 0)); +} +#else +#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) +#endif + +#define pte_none(x) (!(x).pte_low) + +/* + * Bits 0, 6 and 7 are taken, split up the 29 bits of offset + * into this range: + */ +#define PTE_FILE_MAX_BITS 29 + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \ + (((off) >> 5) << 8) + _PAGE_FILE }) + +/* Encode and de-code a swap entry */ +#define __swp_type(x) (((x).val >> 1) & 0x1f) +#define __swp_offset(x) ((x).val >> 8) +#define __swp_entry(type, offset) \ + ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) +#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + +#endif /* _ASM_X86_PGTABLE_2LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level-defs.h b/arch/x86/include/asm/pgtable-3level-defs.h new file mode 100644 index 000000000000..62561367653c --- /dev/null +++ b/arch/x86/include/asm/pgtable-3level-defs.h @@ -0,0 +1,28 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H + +#ifdef CONFIG_PARAVIRT +#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) +#else +#define SHARED_KERNEL_PMD 1 +#endif + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 30 +#define PTRS_PER_PGD 4 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h new file mode 100644 index 000000000000..fb16cec702e4 --- /dev/null +++ b/arch/x86/include/asm/pgtable-3level.h @@ -0,0 +1,175 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_H +#define _ASM_X86_PGTABLE_3LEVEL_H + +/* + * Intel Physical Address Extension (PAE) Mode - three-level page + * tables on PPro+ CPUs. 
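+ * Each entry widens to 64 bits, so physical addresses beyond 4GB
+ * (36 bits on the original implementation) become addressable.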
+ * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%08lx%08lx).\n", \ + __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016Lx).\n", \ + __FILE__, __LINE__, &(e), pmd_val(e)) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016Lx).\n", \ + __FILE__, __LINE__, &(e), pgd_val(e)) + +static inline int pud_none(pud_t pud) +{ + return pud_val(pud) == 0; +} + +static inline int pud_bad(pud_t pud) +{ + return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0; +} + +static inline int pud_present(pud_t pud) +{ + return pud_val(pud) & _PAGE_PRESENT; +} + +/* Rules for using set_pte: the pte being assigned *must* be + * either not present or in a state where the hardware will + * not attempt to update the pte. In places where this is + * not possible, use pte_get_and_clear to obtain the old pte + * value and then use set_pte to update it. -ben + */ +static inline void native_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +/* + * Since this is only called on user PTEs, and the page fault handler + * must handle the already racy situation of simultaneous page faults, + * we are justified in merely clearing the PTE present bit, followed + * by a set. The ordering here is important. + */ +static inline void native_set_pte_present(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pte_t pte) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); +} + +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd)); +} + +static inline void native_set_pud(pud_t *pudp, pud_t pud) +{ + set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); +} + +/* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. + */ +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +static inline void native_pmd_clear(pmd_t *pmd) +{ + u32 *tmp = (u32 *)pmd; + *tmp = 0; + smp_wmb(); + *(tmp + 1) = 0; +} + +static inline void pud_clear(pud_t *pudp) +{ + unsigned long pgd; + + set_pud(pudp, __pud(0)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + * + * Make sure the pud entry we're updating is within the + * current pgd to avoid unnecessary TLB flushes. + */ + pgd = read_cr3(); + if (__pa(pudp) >= pgd && __pa(pudp) < + (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) + write_cr3(pgd); +} + +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK)) + +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK)) + + +/* Find an entry in the second-level page table.. 
*/ +#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \ + pmd_index(address)) + +#ifdef CONFIG_SMP +static inline pte_t native_ptep_get_and_clear(pte_t *ptep) +{ + pte_t res; + + /* xchg acts as a barrier before the setting of the high bits */ + res.pte_low = xchg(&ptep->pte_low, 0); + res.pte_high = ptep->pte_high; + ptep->pte_high = 0; + + return res; +} +#else +#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) +#endif + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t a, pte_t b) +{ + return a.pte_low == b.pte_low && a.pte_high == b.pte_high; +} + +static inline int pte_none(pte_t pte) +{ + return !pte.pte_low && !pte.pte_high; +} + +/* + * Bits 0, 6 and 7 are taken in the low part of the pte, + * put the 32 bits of offset into the high part. + */ +#define pte_to_pgoff(pte) ((pte).pte_high) +#define pgoff_to_pte(off) \ + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) +#define PTE_FILE_MAX_BITS 32 + +/* Encode and de-code a swap entry */ +#define __swp_type(x) (((x).val) & 0x1f) +#define __swp_offset(x) ((x).val >> 5) +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h new file mode 100644 index 000000000000..c012f3b11671 --- /dev/null +++ b/arch/x86/include/asm/pgtable.h @@ -0,0 +1,562 @@ +#ifndef _ASM_X86_PGTABLE_H +#define _ASM_X86_PGTABLE_H + +#define FIRST_USER_ADDRESS 0 + +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +#define _PAGE_BIT_FILE 6 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define 
__HAVE_ARCH_PTE_SPECIAL + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#else +#define _PAGE_NX (_AT(pteval_t, 0)) +#endif + +/* If _PAGE_PRESENT is clear, we use these: */ +#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, + * saved PTE; unset:swap */ +#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; + pte_present gives true */ + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) + +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WC (_PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) + +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) + +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) + +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VSYSCALL_NOCACHE 
__pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+#define PAGE_KERNEL_IO			__pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE		__pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO_UC_MINUS		__pgprot(__PAGE_KERNEL_IO_UC_MINUS)
+#define PAGE_KERNEL_IO_WC		__pgprot(__PAGE_KERNEL_IO_WC)
+
+/* xwr */
+#define __P000	PAGE_NONE
+#define __P001	PAGE_READONLY
+#define __P010	PAGE_COPY
+#define __P011	PAGE_COPY
+#define __P100	PAGE_READONLY_EXEC
+#define __P101	PAGE_READONLY_EXEC
+#define __P110	PAGE_COPY_EXEC
+#define __P111	PAGE_COPY_EXEC
+
+#define __S000	PAGE_NONE
+#define __S001	PAGE_READONLY
+#define __S010	PAGE_SHARED
+#define __S011	PAGE_SHARED
+#define __S100	PAGE_READONLY_EXEC
+#define __S101	PAGE_READONLY_EXEC
+#define __S110	PAGE_SHARED_EXEC
+#define __S111	PAGE_SHARED_EXEC
+
+/*
+ * early identity mapping pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC	__PAGE_KERNEL_LARGE_EXEC
+#else
+/*
+ * PDE_IDENT_ATTR includes the USER bit. As the PDE and PTE protection
+ * bits are combined, this will allow the user to access the high-address
+ * mapped VDSO in the presence of CONFIG_COMPAT_VDSO.
+ */
+#define PTE_IDENT_ATTR	 0x003		/* PRESENT+RW */
+#define PDE_IDENT_ATTR	 0x067		/* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR	 0x001		/* PRESENT (no other attributes) */
+#endif
+
+#ifndef __ASSEMBLY__
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc.
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not.
+ */
+static inline int pte_dirty(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_DIRTY;
+}
+
+static inline int pte_young(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_RW;
+}
+
+static inline int pte_file(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_FILE;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_GLOBAL;
+}
+
+static inline int pte_exec(pte_t pte)
+{
+	return !(pte_flags(pte) & _PAGE_NX);
+}
+
+static inline int pte_special(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_SPECIAL;
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+	return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+#define pte_page(pte)	pfn_to_page(pte_pfn(pte))
+
+static inline int pmd_large(pmd_t pte)
+{
+	return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+		(_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_PSE);
+}
+
+static inline pte_t
pte_clrhuge(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_PSE); +} + +static inline pte_t pte_mkglobal(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_GLOBAL); +} + +static inline pte_t pte_clrglobal(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_GLOBAL); +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPECIAL); +} + +extern pteval_t __supported_pte_mask; + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pteval_t val = pte_val(pte); + + /* + * Chop off the NX bit (if present), and add the NX portion of + * the newprot (if present): + */ + val &= _PAGE_CHG_MASK; + val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask; + + return __pte(val); +} + +/* mprotect needs to preserve PAT bits when updating vm_page_prot */ +#define pgprot_modify pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; + pgprotval_t addbits = pgprot_val(newprot); + return __pgprot(preservebits | addbits); +} + +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) + +#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) + +#ifndef __ASSEMBLY__ +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); +#endif + +/* Install a pte for a particular vaddr in kernel space. 
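+   Fixmap setup is a typical caller.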
*/ +void set_pte_vaddr(unsigned long vaddr, pte_t pte); + +#ifdef CONFIG_X86_32 +extern void native_pagetable_setup_start(pgd_t *base); +extern void native_pagetable_setup_done(pgd_t *base); +#else +static inline void native_pagetable_setup_start(pgd_t *base) {} +static inline void native_pagetable_setup_done(pgd_t *base) {} +#endif + +struct seq_file; +extern void arch_report_meminfo(struct seq_file *m); + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else /* !CONFIG_PARAVIRT */ +#define set_pte(ptep, pte) native_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) + +#define set_pte_present(mm, addr, ptep, pte) \ + native_set_pte_present(mm, addr, ptep, pte) +#define set_pte_atomic(ptep, pte) \ + native_set_pte_atomic(ptep, pte) + +#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) + +#ifndef __PAGETABLE_PUD_FOLDED +#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) +#define pgd_clear(pgd) native_pgd_clear(pgd) +#endif + +#ifndef set_pud +# define set_pud(pudp, pud) native_set_pud(pudp, pud) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pud_clear(pud) native_pud_clear(pud) +#endif + +#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) native_pmd_clear(pmd) + +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) + +static inline void __init paravirt_pagetable_setup_start(pgd_t *base) +{ + native_pagetable_setup_start(base); +} + +static inline void __init paravirt_pagetable_setup_done(pgd_t *base) +{ + native_pagetable_setup_done(base); +} +#endif /* CONFIG_PARAVIRT */ + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_X86_32 +# include "pgtable_32.h" +#else +# include "pgtable_64.h" +#endif + +/* + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) + +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's; + */ +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ +#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) + + +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) + +#ifndef __ASSEMBLY__ + +enum { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_NUM +}; + +#ifdef CONFIG_PROC_FS +extern void update_page_count(int level, unsigned long pages); +#else +static inline void update_page_count(int level, unsigned long pages) { } +#endif + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. 
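+ *
+ * A minimal usage sketch (hypothetical; 'address' is any kernel vaddr):
+ *
+ *	unsigned int level;
+ *	pte_t *pte = lookup_address(address, &level);
+ *
+ *	if (pte && level == PG_LEVEL_2M)
+ *		...			(the pte returned is really a pmd entry)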
+ */ +extern pte_t *lookup_address(unsigned long address, unsigned int *level); + +/* local pte updates need not use xchg for locking */ +static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) +{ + pte_t res = *ptep; + + /* Pure native function needs no input for mm, addr */ + native_pte_clear(NULL, 0, ptep); + return res; +} + +static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + native_set_pte(ptep, pte); +} + +#ifndef CONFIG_PARAVIRT +/* + * Rules for using pte_update - it must be called after any PTE update which + * has not been done using the set_pte / clear_pte interfaces. It is used by + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE + * updates should either be sets, clears, or set_pte_atomic for P->P + * transitions, which means this hook should only be called for user PTEs. + * This hook implies a P->P protection or access change has taken place, which + * requires a subsequent TLB flush. The notification can optionally be delayed + * until the TLB flush event by using the pte_update_defer form of the + * interface, but care must be taken to ensure that the flush happens while + * still holding the same page table lock so that the shadow and primary pages + * do not become out of sync on SMP. + */ +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#endif + +/* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPUs that might be updating the dirty + * bit at the same time. + */ +struct vm_area_struct; + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +extern int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +extern int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t pte = native_ptep_get_and_clear(ptep); + pte_update(mm, addr, ptep); + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int full) +{ + pte_t pte; + if (full) { + /* + * Full address destruction in progress; paravirt does not + * care about updates and native needs no locking + */ + pte = native_local_ptep_get_and_clear(ptep); + } else { + pte = ptep_get_and_clear(mm, addr, ptep); + } + return pte; +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); + pte_update(mm, addr, ptep); +} + +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anywhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. 
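+ *
+ * A typical use is copying the kernel entries into a freshly allocated
+ * pgd, along the lines of (an illustrative sketch only):
+ *
+ *	clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ *			swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ *			KERNEL_PGD_PTRS);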
+ */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + + +#include <asm-generic/pgtable.h> +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h new file mode 100644 index 000000000000..f9d5889b336b --- /dev/null +++ b/arch/x86/include/asm/pgtable_32.h @@ -0,0 +1,191 @@ +#ifndef _ASM_X86_PGTABLE_32_H +#define _ASM_X86_PGTABLE_32_H + + +/* + * The Linux memory management assumes a three-level page table setup. On + * the i386, we use that, but "fold" the mid level into the top-level page + * table, so that we physically have the same two-level page table as the + * i386 mmu expects. + * + * This file contains the functions and defines necessary to modify and use + * the i386 page table tree. + */ +#ifndef __ASSEMBLY__ +#include <asm/processor.h> +#include <asm/fixmap.h> +#include <linux/threads.h> +#include <asm/paravirt.h> + +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +struct mm_struct; +struct vm_area_struct; + +extern pgd_t swapper_pg_dir[1024]; + +static inline void pgtable_cache_init(void) { } +static inline void check_pgt_cache(void) { } +void paging_init(void); + +extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); + +/* + * The Linux x86 paging architecture is 'compile-time dual-mode': it + * implements both the traditional 2-level x86 page tables and the + * newer 3-level PAE-mode page tables. + */ +#ifdef CONFIG_X86_PAE +# include <asm/pgtable-3level-defs.h> +# define PMD_SIZE (1UL << PMD_SHIFT) +# define PMD_MASK (~(PMD_SIZE - 1)) +#else +# include <asm/pgtable-2level-defs.h> +#endif + +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) + +/* Just any arbitrary offset to the start of the vmalloc VM area: the + * current 8MB value just means that there will be an 8MB "hole" after the + * physical memory until the kernel virtual memory starts. That means that + * any out-of-bounds memory accesses will hopefully be caught. + * The vmalloc() routines leave a hole of 4kB between each vmalloced + * area for the same reason. 
;) + */ +#define VMALLOC_OFFSET (8 * 1024 * 1024) +#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif + +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ + & PMD_MASK) + +#ifdef CONFIG_HIGHMEM +# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) +#else +# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) +#endif + +#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) + +/* + * Define this if things work differently on an i386 and an i486: + * it will (on an i486) warn about kernel memory accesses that are + * done without a 'access_ok(VERIFY_WRITE,..)' + */ +#undef TEST_ACCESS_OK + +/* The boot page tables (all created as a single array) */ +extern unsigned long pg0[]; + +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) + +/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ +#define pmd_none(x) (!(unsigned long)pmd_val((x))) +#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT) +#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) + +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) + +#ifdef CONFIG_X86_PAE +# include <asm/pgtable-3level.h> +#else +# include <asm/pgtable-2level.h> +#endif + +/* + * Macro to mark a page protection value as "uncacheable". + * On processors which do not support it, this is a no-op. + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ + : (prot)) + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + + +static inline int pud_large(pud_t pud) { return 0; } + +/* + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] + * + * this macro returns the index of the entry in the pmd page which would + * control the given virtual address + */ +#define pmd_index(address) \ + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) + +/* + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] + * + * this macro returns the index of the entry in the pte page which would + * control the given virtual address + */ +#define pte_index(address) \ + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +#define pte_offset_kernel(dir, address) \ + ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address))) + +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) + +#define pmd_page_vaddr(pmd) \ + ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK)) + +#if defined(CONFIG_HIGHPTE) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ + pte_index((address))) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + pte_index((address))) +#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) +#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#endif + +/* Clear a kernel PTE and flush it from the TLB */ +#define kpte_clear_flush(ptep, vaddr) \ +do { \ + pte_clear(&init_mm, (vaddr), (ptep)); \ + __flush_tlb_one((vaddr)); \ +} 
while (0) + +/* + * The i386 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + */ +#define update_mmu_cache(vma, address, pte) do { } while (0) + +#endif /* !__ASSEMBLY__ */ + +/* + * kern_addr_valid() is (1) for FLATMEM and (0) for + * SPARSEMEM and DISCONTIGMEM + */ +#ifdef CONFIG_FLATMEM +#define kern_addr_valid(addr) (1) +#else +#define kern_addr_valid(kaddr) (0) +#endif + +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + +#endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h new file mode 100644 index 000000000000..545a0e042bb2 --- /dev/null +++ b/arch/x86/include/asm/pgtable_64.h @@ -0,0 +1,285 @@ +#ifndef _ASM_X86_PGTABLE_64_H +#define _ASM_X86_PGTABLE_64_H + +#include <linux/const.h> +#ifndef __ASSEMBLY__ + +/* + * This file contains the functions and defines necessary to modify and use + * the x86-64 page table tree. + */ +#include <asm/processor.h> +#include <linux/bitops.h> +#include <linux/threads.h> +#include <asm/pda.h> + +extern pud_t level3_kernel_pgt[512]; +extern pud_t level3_ident_pgt[512]; +extern pmd_t level2_kernel_pgt[512]; +extern pmd_t level2_fixmap_pgt[512]; +extern pmd_t level2_ident_pgt[512]; +extern pgd_t init_level4_pgt[]; + +#define swapper_pg_dir init_level4_pgt + +extern void paging_init(void); + +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 + +/* + * 3rd level page + */ +#define PUD_SHIFT 30 +#define PTRS_PER_PUD 512 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#ifndef __ASSEMBLY__ + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx).\n", \ + __FILE__, __LINE__, &(e), pte_val(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx).\n", \ + __FILE__, __LINE__, &(e), pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx).\n", \ + __FILE__, __LINE__, &(e), pud_val(e)) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx).\n", \ + __FILE__, __LINE__, &(e), pgd_val(e)) + +#define pgd_none(x) (!pgd_val(x)) +#define pud_none(x) (!pud_val(x)) + +struct mm_struct; + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); + + +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + *ptep = native_make_pte(0); +} + +static inline void native_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + native_set_pte(ptep, pte); +} + +static inline pte_t native_ptep_get_and_clear(pte_t *xp) +{ +#ifdef CONFIG_SMP + return native_make_pte(xchg(&xp->pte, 0)); +#else + /* native_local_ptep_get_and_clear, + but duplicated because of cyclic dependency */ + pte_t ret = *xp; + native_pte_clear(NULL, 0, xp); + return ret; +#endif +} + +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} + +static inline void native_pmd_clear(pmd_t *pmd) +{ + native_set_pmd(pmd, native_make_pmd(0)); +} + +static inline void native_set_pud(pud_t *pudp, pud_t pud) +{ + *pudp = pud; +} + +static inline void native_pud_clear(pud_t *pud) +{ + native_set_pud(pud, native_make_pud(0)); +} + +static inline void 
native_set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + *pgdp = pgd; +} + +static inline void native_pgd_clear(pgd_t *pgd) +{ + native_set_pgd(pgd, native_make_pgd(0)); +} + +#define pte_same(a, b) ((a).pte == (b).pte) + +#endif /* !__ASSEMBLY__ */ + +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) + + +#define MAXMEM _AC(0x00003fffffffffff, UL) +#define VMALLOC_START _AC(0xffffc20000000000, UL) +#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_END _AC(0xffffffffff000000, UL) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#ifndef __ASSEMBLY__ + +static inline int pgd_bad(pgd_t pgd) +{ + return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; +} + +static inline int pud_bad(pud_t pud) +{ + return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; +} + +static inline int pmd_bad(pmd_t pmd) +{ + return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; +} + +#define pte_none(x) (!pte_val((x))) +#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE)) + +#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ + +/* + * Macro to mark a page protection value as "uncacheable". + */ +#define pgprot_noncached(prot) \ + (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +/* + * Level 4 access. + */ +#define pgd_page_vaddr(pgd) \ + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK)) +#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT)) +#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT) +static inline int pgd_large(pgd_t pgd) { return 0; } +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) + +/* PUD - Level3 access */ +/* to find an entry in a page-table-directory. */ +#define pud_page_vaddr(pud) \ + ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK)) +#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) +#define pud_offset(pgd, address) \ + ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address))) +#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT) + +static inline int pud_large(pud_t pte) +{ + return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +/* PMD - Level 2 access */ +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK)) +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) + +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \ + pmd_index(address)) +#define pmd_none(x) (!pmd_val((x))) +#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT) +#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot)))) +#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT) + +#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ + _PAGE_FILE }) +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT + +/* PTE - Level 1 access. 
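+ *
+ * Putting the levels together, a full software walk down to the pte for
+ * 'addr' in 'mm' looks roughly like this (a hypothetical sketch using the
+ * accessors above and pte_offset_kernel() below; presence checks and
+ * large-page handling omitted):
+ *
+ *	pgd_t *pgd = pgd_offset(mm, addr);
+ *	pud_t *pud = pud_offset(pgd, addr);
+ *	pmd_t *pmd = pmd_offset(pud, addr);
+ *	pte_t *pte = pte_offset_kernel(pmd, addr);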
*/ + +/* page, protection -> pte */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot)) + +#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ + pte_index((address))) + +/* x86-64 always has all page tables mapped. */ +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) +#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) +#define pte_unmap(pte) /* NOP */ +#define pte_unmap_nested(pte) /* NOP */ + +#define update_mmu_cache(vma, address, pte) do { } while (0) + +extern int direct_gbpages; + +/* Encode and de-code a swap entry */ +#define __swp_type(x) (((x).val >> 1) & 0x3f) +#define __swp_offset(x) ((x).val >> 8) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \ + ((offset) << 8) }) +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + +extern int kern_addr_valid(unsigned long addr); +extern void cleanup_highmap(void); + +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + +#define pgtable_cache_init() do { } while (0) +#define check_pgt_cache() do { } while (0) + +#define PAGE_AGP PAGE_KERNEL_NOCACHE +#define HAVE_PAGE_AGP 1 + +/* fs/proc/kcore.c */ +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) +#define kc_offset_to_vaddr(o) \ + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \ + ? ((o) | ~__VIRTUAL_MASK) \ + : (o)) + +#define __HAVE_ARCH_PTE_SAME +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/poll.h b/arch/x86/include/asm/poll.h new file mode 100644 index 000000000000..c98509d3149e --- /dev/null +++ b/arch/x86/include/asm/poll.h @@ -0,0 +1 @@ +#include <asm-generic/poll.h> diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h new file mode 100644 index 000000000000..bb7133dc155d --- /dev/null +++ b/arch/x86/include/asm/posix_types.h @@ -0,0 +1,13 @@ +#ifdef __KERNEL__ +# ifdef CONFIG_X86_32 +# include "posix_types_32.h" +# else +# include "posix_types_64.h" +# endif +#else +# ifdef __i386__ +# include "posix_types_32.h" +# else +# include "posix_types_64.h" +# endif +#endif diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h new file mode 100644 index 000000000000..f7d9adf82e53 --- /dev/null +++ b/arch/x86/include/asm/posix_types_32.h @@ -0,0 +1,85 @@ +#ifndef _ASM_X86_POSIX_TYPES_32_H +#define _ASM_X86_POSIX_TYPES_32_H + +/* + * This file is generally used by user-level software, so you need to + * be a little careful about namespace pollution etc. Also, we cannot + * assume GCC is being used. 
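+ *
+ * The __FD_* helpers below are open-coded with x86 bit instructions
+ * (btsl/btrl/btl); each is semantically equivalent to the plain C form,
+ * e.g. __FD_SET(fd, fdsetp) behaves like (illustrative only, with
+ * 32-bit longs):
+ *
+ *	fdsetp->fds_bits[fd / 32] |= 1UL << (fd % 32);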
+ */ + +typedef unsigned long __kernel_ino_t; +typedef unsigned short __kernel_mode_t; +typedef unsigned short __kernel_nlink_t; +typedef long __kernel_off_t; +typedef int __kernel_pid_t; +typedef unsigned short __kernel_ipc_pid_t; +typedef unsigned short __kernel_uid_t; +typedef unsigned short __kernel_gid_t; +typedef unsigned int __kernel_size_t; +typedef int __kernel_ssize_t; +typedef int __kernel_ptrdiff_t; +typedef long __kernel_time_t; +typedef long __kernel_suseconds_t; +typedef long __kernel_clock_t; +typedef int __kernel_timer_t; +typedef int __kernel_clockid_t; +typedef int __kernel_daddr_t; +typedef char * __kernel_caddr_t; +typedef unsigned short __kernel_uid16_t; +typedef unsigned short __kernel_gid16_t; +typedef unsigned int __kernel_uid32_t; +typedef unsigned int __kernel_gid32_t; + +typedef unsigned short __kernel_old_uid_t; +typedef unsigned short __kernel_old_gid_t; +typedef unsigned short __kernel_old_dev_t; + +#ifdef __GNUC__ +typedef long long __kernel_loff_t; +#endif + +typedef struct { + int val[2]; +} __kernel_fsid_t; + +#if defined(__KERNEL__) + +#undef __FD_SET +#define __FD_SET(fd,fdsetp) \ + asm volatile("btsl %1,%0": \ + "+m" (*(__kernel_fd_set *)(fdsetp)) \ + : "r" ((int)(fd))) + +#undef __FD_CLR +#define __FD_CLR(fd,fdsetp) \ + asm volatile("btrl %1,%0": \ + "+m" (*(__kernel_fd_set *)(fdsetp)) \ + : "r" ((int) (fd))) + +#undef __FD_ISSET +#define __FD_ISSET(fd,fdsetp) \ + (__extension__ \ + ({ \ + unsigned char __result; \ + asm volatile("btl %1,%2 ; setb %0" \ + : "=q" (__result) \ + : "r" ((int)(fd)), \ + "m" (*(__kernel_fd_set *)(fdsetp))); \ + __result; \ +})) + +#undef __FD_ZERO +#define __FD_ZERO(fdsetp) \ +do { \ + int __d0, __d1; \ + asm volatile("cld ; rep ; stosl" \ + : "=m" (*(__kernel_fd_set *)(fdsetp)), \ + "=&c" (__d0), "=&D" (__d1) \ + : "a" (0), "1" (__FDSET_LONGS), \ + "2" ((__kernel_fd_set *)(fdsetp)) \ + : "memory"); \ +} while (0) + +#endif /* defined(__KERNEL__) */ + +#endif /* _ASM_X86_POSIX_TYPES_32_H */ diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h new file mode 100644 index 000000000000..eb8d2d92b63e --- /dev/null +++ b/arch/x86/include/asm/posix_types_64.h @@ -0,0 +1,119 @@ +#ifndef _ASM_X86_POSIX_TYPES_64_H +#define _ASM_X86_POSIX_TYPES_64_H + +/* + * This file is generally used by user-level software, so you need to + * be a little careful about namespace pollution etc. Also, we cannot + * assume GCC is being used. 
+ */ + +typedef unsigned long __kernel_ino_t; +typedef unsigned int __kernel_mode_t; +typedef unsigned long __kernel_nlink_t; +typedef long __kernel_off_t; +typedef int __kernel_pid_t; +typedef int __kernel_ipc_pid_t; +typedef unsigned int __kernel_uid_t; +typedef unsigned int __kernel_gid_t; +typedef unsigned long __kernel_size_t; +typedef long __kernel_ssize_t; +typedef long __kernel_ptrdiff_t; +typedef long __kernel_time_t; +typedef long __kernel_suseconds_t; +typedef long __kernel_clock_t; +typedef int __kernel_timer_t; +typedef int __kernel_clockid_t; +typedef int __kernel_daddr_t; +typedef char * __kernel_caddr_t; +typedef unsigned short __kernel_uid16_t; +typedef unsigned short __kernel_gid16_t; + +#ifdef __GNUC__ +typedef long long __kernel_loff_t; +#endif + +typedef struct { + int val[2]; +} __kernel_fsid_t; + +typedef unsigned short __kernel_old_uid_t; +typedef unsigned short __kernel_old_gid_t; +typedef __kernel_uid_t __kernel_uid32_t; +typedef __kernel_gid_t __kernel_gid32_t; + +typedef unsigned long __kernel_old_dev_t; + +#ifdef __KERNEL__ + +#undef __FD_SET +static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp) +{ + unsigned long _tmp = fd / __NFDBITS; + unsigned long _rem = fd % __NFDBITS; + fdsetp->fds_bits[_tmp] |= (1UL<<_rem); +} + +#undef __FD_CLR +static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp) +{ + unsigned long _tmp = fd / __NFDBITS; + unsigned long _rem = fd % __NFDBITS; + fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem); +} + +#undef __FD_ISSET +static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p) +{ + unsigned long _tmp = fd / __NFDBITS; + unsigned long _rem = fd % __NFDBITS; + return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0; +} + +/* + * This will unroll the loop for the normal constant cases (8 or 32 longs, + * for 256 and 1024-bit fd_sets respectively) + */ +#undef __FD_ZERO +static inline void __FD_ZERO(__kernel_fd_set *p) +{ + unsigned long *tmp = p->fds_bits; + int i; + + if (__builtin_constant_p(__FDSET_LONGS)) { + switch (__FDSET_LONGS) { + case 32: + tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; + tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0; + tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0; + tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0; + tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0; + tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0; + tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0; + tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0; + return; + case 16: + tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; + tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0; + tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0; + tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0; + return; + case 8: + tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; + tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0; + return; + case 4: + tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; + return; + } + } + i = __FDSET_LONGS; + while (i) { + i--; + *tmp = 0; + tmp++; + } +} + +#endif /* defined(__KERNEL__) */ + +#endif /* _ASM_X86_POSIX_TYPES_64_H */ diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h new file mode 100644 index 000000000000..fe681147a4f7 --- /dev/null +++ b/arch/x86/include/asm/prctl.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_PRCTL_H +#define _ASM_X86_PRCTL_H + +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + + +#endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/asm/processor-cyrix.h 
b/arch/x86/include/asm/processor-cyrix.h new file mode 100644 index 000000000000..1198f2a0e42c --- /dev/null +++ b/arch/x86/include/asm/processor-cyrix.h @@ -0,0 +1,38 @@ +/* + * NSC/Cyrix CPU indexed register access. Must be inlined instead of + * macros to ensure correct access ordering + * Access order is always 0x22 (=offset), 0x23 (=value) + * + * When using the old macros a line like + * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + * gets expanded to: + * do { + * outb((CX86_CCR2), 0x22); + * outb((({ + * outb((CX86_CCR2), 0x22); + * inb(0x23); + * }) | 0x88), 0x23); + * } while (0); + * + * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23). + */ + +static inline u8 getCx86(u8 reg) +{ + outb(reg, 0x22); + return inb(0x23); +} + +static inline void setCx86(u8 reg, u8 data) +{ + outb(reg, 0x22); + outb(data, 0x23); +} + +#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); }) + +#define setCx86_old(reg, data) do { \ + outb((reg), 0x22); \ + outb((data), 0x23); \ +} while (0) + diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h new file mode 100644 index 000000000000..7a3e836eb2a9 --- /dev/null +++ b/arch/x86/include/asm/processor-flags.h @@ -0,0 +1,100 @@ +#ifndef _ASM_X86_PROCESSOR_FLAGS_H +#define _ASM_X86_PROCESSOR_FLAGS_H +/* Various flags defined: can be included from assembler. */ + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ + +/* + * Basic CPU control in CR0 + */ +#define X86_CR0_PE 0x00000001 /* Protection Enable */ +#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */ +#define X86_CR0_EM 0x00000004 /* Emulation */ +#define X86_CR0_TS 0x00000008 /* Task Switched */ +#define X86_CR0_ET 0x00000010 /* Extension Type */ +#define X86_CR0_NE 0x00000020 /* Numeric Error */ +#define X86_CR0_WP 0x00010000 /* Write Protect */ +#define X86_CR0_AM 0x00040000 /* Alignment Mask */ +#define X86_CR0_NW 0x20000000 /* Not Write-through */ +#define X86_CR0_CD 0x40000000 /* Cache Disable */ +#define X86_CR0_PG 0x80000000 /* Paging */ + +/* + * Paging options in CR3 + */ +#define X86_CR3_PWT 0x00000008 /* Page Write Through */ +#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x00000008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x00000010 /* enable page size extensions */ +#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */ +#define 
X86_CR4_MCE 0x00000040 /* Machine check enable */ +#define X86_CR4_PGE 0x00000080 /* enable global pages */ +#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ +#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ +#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ +#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ + +/* + * x86-64 Task Priority Register, CR8 + */ +#define X86_CR8_TPR 0x0000000F /* task priority register */ + +/* + * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h> + */ + +/* + * NSC/Cyrix CPU configuration register indexes + */ +#define CX86_PCR0 0x20 +#define CX86_GCR 0xb8 +#define CX86_CCR0 0xc0 +#define CX86_CCR1 0xc1 +#define CX86_CCR2 0xc2 +#define CX86_CCR3 0xc3 +#define CX86_CCR4 0xe8 +#define CX86_CCR5 0xe9 +#define CX86_CCR6 0xea +#define CX86_CCR7 0xeb +#define CX86_PCR1 0xf0 +#define CX86_DIR0 0xfe +#define CX86_DIR1 0xff +#define CX86_ARR_BASE 0xc4 +#define CX86_RCR_BASE 0xdc + +#ifdef __KERNEL__ +#ifdef CONFIG_VM86 +#define X86_VM_MASK X86_EFLAGS_VM +#else +#define X86_VM_MASK 0 /* No VM86 support */ +#endif +#endif + +#endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h new file mode 100644 index 000000000000..5ca01e383269 --- /dev/null +++ b/arch/x86/include/asm/processor.h @@ -0,0 +1,936 @@ +#ifndef _ASM_X86_PROCESSOR_H +#define _ASM_X86_PROCESSOR_H + +#include <asm/processor-flags.h> + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; + +#include <asm/vm86.h> +#include <asm/math_emu.h> +#include <asm/segment.h> +#include <asm/types.h> +#include <asm/sigcontext.h> +#include <asm/current.h> +#include <asm/cpufeature.h> +#include <asm/system.h> +#include <asm/page.h> +#include <asm/percpu.h> +#include <asm/msr.h> +#include <asm/desc_defs.h> +#include <asm/nops.h> +#include <asm/ds.h> + +#include <linux/personality.h> +#include <linux/cpumask.h> +#include <linux/cache.h> +#include <linux/threads.h> +#include <linux/init.h> + +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +static inline void *current_text_addr(void) +{ + void *pc; + + asm volatile("mov $1f, %0; 1:":"=r" (pc)); + + return pc; +} + +#ifdef CONFIG_X86_VSMP +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +#else +# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. + * Members of this structure are referenced in head.S, so think twice + * before touching them. 
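+ *
+ * Consumers normally reach this structure through the cpu_data() /
+ * boot_cpu_data accessors defined further down, e.g. (hypothetical):
+ *
+ *	struct cpuinfo_x86 *c = &cpu_data(cpu);
+ *
+ *	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 0x10)
+ *		...			(an AMD family 0x10 CPU)
+ *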
[mj] + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; +#ifdef CONFIG_X86_32 + char wp_works_ok; /* It doesn't on 386's */ + + /* Problems on some 486Dx4's and old 386's: */ + char hlt_works_ok; + char hard_math; + char rfu; + char fdiv_bug; + char f00f_bug; + char coma_bug; + char pad0; +#else + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ + int x86_tlbsize; + __u8 x86_virt_bits; + __u8 x86_phys_bits; +#endif + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; + /* Max extended CPUID function supported: */ + __u32 extended_cpuid_level; + /* Maximum supported CPUID level, -1=no CPUID: */ + int cpuid_level; + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + /* in KB - valid for CPUS which support this call: */ + int x86_cache_size; + int x86_cache_alignment; /* In bytes */ + int x86_power; + unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + /* cpus sharing the last level cache: */ + cpumask_t llc_shared_map; +#endif + /* cpuid returned max cores value: */ + u16 x86_max_cores; + u16 apicid; + u16 initial_apicid; + u16 x86_clflush_size; +#ifdef CONFIG_SMP + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ + u16 phys_proc_id; + /* Core id: */ + u16 cpu_core_id; + /* Index into per_cpu list: */ + u16 cpu_index; +#endif +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 + +#define X86_VENDOR_UNKNOWN 0xff + +/* + * capabilities of CPUs + */ +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; + +extern struct tss_struct doublefault_tss; +extern __u32 cleared_cpu_caps[NCAPINTS]; + +#ifdef CONFIG_SMP +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#define current_cpu_data __get_cpu_var(cpu_info) +#else +#define cpu_data(cpu) boot_cpu_data +#define current_cpu_data boot_cpu_data +#endif + +extern const struct seq_operations cpuinfo_op; + +static inline int hlt_works(int cpu) +{ +#ifdef CONFIG_X86_32 + return cpu_data(cpu).hlt_works_ok; +#else + return 1; +#endif +} + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern void cpu_detect(struct cpuinfo_x86 *c); + +extern struct pt_regs *idle_regs(struct pt_regs *); + +extern void early_cpu_init(void); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +extern void detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); + +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline void load_cr3(pgd_t *pgdir) +{ + write_cr3(__pa(pgdir)); +} + +#ifdef CONFIG_X86_32 +/* This is the TSS defined by the hardware. 
*/ +struct x86_hw_tss { + unsigned short back_link, __blh; + unsigned long sp0; + unsigned short ss0, __ss0h; + unsigned long sp1; + /* ss1 caches MSR_IA32_SYSENTER_CS: */ + unsigned short ss1, __ss1h; + unsigned long sp2; + unsigned short ss2, __ss2h; + unsigned long __cr3; + unsigned long ip; + unsigned long flags; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long bx; + unsigned long sp; + unsigned long bp; + unsigned long si; + unsigned long di; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace; + unsigned short io_bitmap_base; + +} __attribute__((packed)); +#else +struct x86_hw_tss { + u32 reserved1; + u64 sp0; + u64 sp1; + u64 sp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + +} __attribute__((packed)) ____cacheline_aligned; +#endif + +/* + * IO-bitmap sizes: + */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 + +struct tss_struct { + /* + * The hardware state: + */ + struct x86_hw_tss x86_tss; + + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + /* + * Cache the current maximum and the last task that used the bitmap: + */ + unsigned long io_bitmap_max; + struct thread_struct *io_bitmap_owner; + + /* + * .. 
and then another 0x100 bytes for the emergency kernel stack: + */ + unsigned long stack[64]; + +} ____cacheline_aligned; + +DECLARE_PER_CPU(struct tss_struct, init_tss); + +/* + * Save the original ist values for checking stack pointers during debugging + */ +struct orig_ist { + unsigned long ist[7]; +}; + +#define MXCSR_DEFAULT 0x1f80 + +struct i387_fsave_struct { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE ]: */ + u32 status; +}; + +struct i387_fxsave_struct { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +struct i387_soft_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct info *info; + u32 entry_eip; +}; + +struct xsave_hdr_struct { + u64 xstate_bv; + u64 reserved1[2]; + u64 reserved2[5]; +} __attribute__((packed)); + +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + /* new processor state extensions will go here */ +} __attribute__ ((packed, aligned (64))); + +union thread_xstate { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; + struct xsave_struct xsave; +}; + +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(struct orig_ist, orig_ist); +#endif + +extern void print_cpu_info(struct cpuinfo_x86 *); +extern unsigned int xstate_size; +extern void free_thread_xstate(struct task_struct *); +extern struct kmem_cache *task_xstate_cachep; +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +struct thread_struct { + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long sp0; + unsigned long sp; +#ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +#else + unsigned long usersp; /* Copy from PDA */ + unsigned short es; + unsigned short ds; + unsigned short fsindex; + unsigned short gsindex; +#endif + unsigned long ip; + unsigned long fs; + unsigned long gs; + /* Hardware debugging registers: */ + unsigned long debugreg0; + unsigned long debugreg1; + unsigned long debugreg2; + unsigned long debugreg3; + unsigned long debugreg6; + unsigned long debugreg7; + /* Fault info: */ + unsigned long cr2; + unsigned long trap_no; + unsigned long error_code; + /* floating point and extended processor state */ + union thread_xstate *xstate; +#ifdef CONFIG_X86_32 + /* Virtual 86 mode info 
*/ + struct vm86_struct __user *vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags; + unsigned long v86mask; + unsigned long saved_sp0; + unsigned int saved_fs; + unsigned int saved_gs; +#endif + /* IO permissions: */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; + /* Max allowed port in the bitmap, in bytes: */ + unsigned io_bitmap_max; +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ + unsigned long debugctlmsr; +#ifdef CONFIG_X86_DS +/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ + struct ds_context *ds_ctx; +#endif /* CONFIG_X86_DS */ +#ifdef CONFIG_X86_PTRACE_BTS +/* the signal to send on a bts buffer overflow */ + unsigned int bts_ovfl_signal; +#endif /* CONFIG_X86_PTRACE_BTS */ +}; + +static inline unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("mov %%db0, %0" :"=r" (val)); + break; + case 1: + asm("mov %%db1, %0" :"=r" (val)); + break; + case 2: + asm("mov %%db2, %0" :"=r" (val)); + break; + case 3: + asm("mov %%db3, %0" :"=r" (val)); + break; + case 6: + asm("mov %%db6, %0" :"=r" (val)); + break; + case 7: + asm("mov %%db7, %0" :"=r" (val)); + break; + default: + BUG(); + } + return val; +} + +static inline void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("mov %0, %%db0" ::"r" (value)); + break; + case 1: + asm("mov %0, %%db1" ::"r" (value)); + break; + case 2: + asm("mov %0, %%db2" ::"r" (value)); + break; + case 3: + asm("mov %0, %%db3" ::"r" (value)); + break; + case 6: + asm("mov %0, %%db6" ::"r" (value)); + break; + case 7: + asm("mov %0, %%db7" ::"r" (value)); + break; + default: + BUG(); + } +} + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void native_set_iopl_mask(unsigned mask) +{ +#ifdef CONFIG_X86_32 + unsigned int reg; + + asm volatile ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +#endif +} + +static inline void +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->x86_tss.sp0 = thread->sp0; +#ifdef CONFIG_X86_32 + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +#endif +} + +static inline void native_swapgs(void) +{ +#ifdef CONFIG_X86_64 + asm volatile("swapgs" ::: "memory"); +#endif +} + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define __cpuid native_cpuid +#define paravirt_enabled() 0 + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) + +static inline void load_sp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + native_load_sp0(tss, thread); +} + +#define set_iopl_mask native_set_iopl_mask +#endif /* CONFIG_PARAVIRT */ + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. 
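+ *
+ * For example (a hypothetical caller), enabling global pages so that
+ * secondary CPUs pick the bit up from mmu_cr4_features as they boot:
+ *
+ *	set_in_cr4(X86_CR4_PGE);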
+ */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4(unsigned long mask) +{ + unsigned cr4; + + mmu_cr4_features |= mask; + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +static inline void clear_in_cr4(unsigned long mask) +{ + unsigned cr4; + + mmu_cr4_features &= ~mask; + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +typedef struct { + unsigned long seg; +} mm_segment_t; + + +/* + * create a kernel thread without removing it from tasklists + */ +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); + +/* Free all resources held by a thread. */ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy state */ +extern void prepare_to_copy(struct task_struct *tsk); + +unsigned long get_wchan(struct task_struct *p); + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +static inline void cpu_relax(void) +{ + rep_nop(); +} + +/* Stop speculative execution: */ +static inline void sync_core(void) +{ + int tmp; + + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); +} + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); + +extern void select_idle_routine(const struct cpuinfo_x86 *c); + +extern unsigned long boot_option_idle_override; +extern unsigned long idle_halt; +extern unsigned long idle_nomwait; + +/* + * on systems with caches, caches must be flashed as the absolute + * last instruction before going into a suspended halt. 
Otherwise, + * dirty data can linger in the cache and become stale on resume, + * leading to strange errors. + * + * perform a variety of operations to guarantee that the compiler + * will not reorder instructions. wbinvd itself is serializing + * so the processor will not reorder. + * + * Systems without cache can just go into halt. + */ +static inline void wbinvd_halt(void) +{ + mb(); + /* check for clflush to determine if wbinvd is legal */ + if (cpu_has_clflush) + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); + else + while (1) + halt(); +} + +extern void enable_sep_cpu(void); +extern int sysenter_setup(void); + +/* Defined in head.S */ +extern struct desc_ptr early_gdt_descr; + +extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(void); +extern void cpu_init(void); +extern void init_gdt(int cpu); + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + +/* + * from system description table in BIOS. Mostly for MCA use, but + * others may find it useful: + */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; + +/* Boot loader type from the setup header: */ +extern int bootloader_type; + +extern char ignore_fpu_irq; + +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 +#define ARCH_HAS_PREFETCHW +#define ARCH_HAS_SPINLOCK_PREFETCH + +#ifdef CONFIG_X86_32 +# define BASE_PREFETCH ASM_NOP4 +# define ARCH_HAS_PREFETCH +#else +# define BASE_PREFETCH "prefetcht0 (%1)" +#endif + +/* + * Prefetch instructions for Pentium III (+) and AMD Athlon (+) + * + * It's not worth to care about 3dnow prefetches for the K6 + * because they are microcoded there and very slow. + */ +static inline void prefetch(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchnta (%1)", + X86_FEATURE_XMM, + "r" (x)); +} + +/* + * 3dnow prefetch to get an exclusive cache line. + * Useful for spinlocks to avoid one state transition in the + * cache coherency protocol: + */ +static inline void prefetchw(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} + +static inline void spin_lock_prefetch(const void *x) +{ + prefetchw(x); +} + +#ifdef CONFIG_X86_32 +/* + * User space process size: 3GB (default). + */ +#define TASK_SIZE PAGE_OFFSET +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ + .fs = __KERNEL_PERCPU, \ +} + +/* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ +#define INIT_TSS { \ + .x86_tss = { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ +} + +extern unsigned long thread_saved_pc(struct task_struct *tsk); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define KSTK_TOP(info) \ +({ \ + unsigned long *__ptr = (unsigned long *)(info); \ + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +}) + +/* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. 
+ * This is necessary to guarantee that the entire "struct pt_regs" + * is accessible even if the CPU hasn't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the ss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ +#define task_pt_regs(task) \ +({ \ + struct pt_regs *__regs__; \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ - 1; \ +}) + +#define KSTK_ESP(task) (task_pt_regs(task)->sp) + +#else +/* + * User space process size. 47 bits minus one guard page. + */ +#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE64) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ + IA32_PAGE_OFFSET : TASK_SIZE64) + +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX TASK_SIZE64 + +#define INIT_THREAD { \ + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#define INIT_TSS { \ + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +/* + * Return saved PC of a blocked thread. + * What is this good for? It will always be the scheduler or ret_from_fork. + */ +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) + +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ +#endif /* CONFIG_X86_64 */ + +extern void start_thread(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp); + +/* + * This decides where the kernel will search for a free chunk of vm + * space during mmap's. 
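+ *
+ * E.g. with the default 32-bit split, TASK_SIZE == PAGE_OFFSET == 3GB,
+ * so TASK_UNMAPPED_BASE below works out to PAGE_ALIGN(3GB / 3) == 1GB:
+ * mmap() searches start one third of the way into the address space.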
+ */ +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + +#define KSTK_EIP(task) (task_pt_regs(task)->ip) + +/* Get/set a process' ability to use the timestamp counter instruction */ +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + +extern int get_tsc_mode(unsigned long adr); +extern int set_tsc_mode(unsigned int val); + +#endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h new file mode 100644 index 000000000000..d6a22f92ba77 --- /dev/null +++ b/arch/x86/include/asm/proto.h @@ -0,0 +1,32 @@ +#ifndef _ASM_X86_PROTO_H +#define _ASM_X86_PROTO_H + +#include <asm/ldt.h> + +/* misc architecture specific prototypes */ + +extern void early_idt_handler(void); + +extern void system_call(void); +extern void syscall_init(void); + +extern void ia32_syscall(void); +extern void ia32_cstar_target(void); +extern void ia32_sysenter_target(void); + +extern void syscall32_cpu_init(void); + +extern void check_efer(void); + +#ifdef CONFIG_X86_BIOS_REBOOT +extern int reboot_force; +#else +static const int reboot_force = 0; +#endif + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); + +#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1)) +#define round_down(x, y) ((x) & ~((y) - 1)) + +#endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h new file mode 100644 index 000000000000..25f1bb8fc626 --- /dev/null +++ b/arch/x86/include/asm/ptrace-abi.h @@ -0,0 +1,145 @@ +#ifndef _ASM_X86_PTRACE_ABI_H +#define _ASM_X86_PTRACE_ABI_H + +#ifdef __i386__ + +#define EBX 0 +#define ECX 1 +#define EDX 2 +#define ESI 3 +#define EDI 4 +#define EBP 5 +#define EAX 6 +#define DS 7 +#define ES 8 +#define FS 9 +#define GS 10 +#define ORIG_EAX 11 +#define EIP 12 +#define CS 13 +#define EFL 14 +#define UESP 15 +#define SS 16 +#define FRAME_SIZE 17 + +#else /* __i386__ */ + +#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +#define R15 0 +#define R14 8 +#define R13 16 +#define R12 24 +#define RBP 32 +#define RBX 40 +/* arguments: interrupts/non tracing syscalls only save upto here*/ +#define R11 48 +#define R10 56 +#define R9 64 +#define R8 72 +#define RAX 80 +#define RCX 88 +#define RDX 96 +#define RSI 104 +#define RDI 112 +#define ORIG_RAX 120 /* = ERROR */ +/* end of arguments */ +/* cpu exception frame or undefined in case of fast syscall. */ +#define RIP 128 +#define CS 136 +#define EFLAGS 144 +#define RSP 152 +#define SS 160 +#define ARGOFFSET R11 +#endif /* __ASSEMBLY__ */ + +/* top of stack page */ +#define FRAME_SIZE 168 + +#endif /* !__i386__ */ + +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */ +#define PTRACE_GETREGS 12 +#define PTRACE_SETREGS 13 +#define PTRACE_GETFPREGS 14 +#define PTRACE_SETFPREGS 15 +#define PTRACE_GETFPXREGS 18 +#define PTRACE_SETFPXREGS 19 + +#define PTRACE_OLDSETOPTIONS 21 + +/* only useful for access 32bit programs / kernels */ +#define PTRACE_GET_THREAD_AREA 25 +#define PTRACE_SET_THREAD_AREA 26 + +#ifdef __x86_64__ +# define PTRACE_ARCH_PRCTL 30 +#endif + +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 + +#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ + +#ifdef CONFIG_X86_PTRACE_BTS + +#ifndef __ASSEMBLY__ +#include <asm/types.h> + +/* configuration/status structure used in PTRACE_BTS_CONFIG and + PTRACE_BTS_STATUS commands. 
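+
+   A debugger might fill it in like so before issuing the request (a
+   hypothetical user-space sketch; the flags and the ADDR/DATA calling
+   convention are documented with PTRACE_BTS_CONFIG below):
+
+	struct ptrace_bts_config cfg = {
+		.size  = 4096,
+		.flags = PTRACE_BTS_O_TRACE | PTRACE_BTS_O_ALLOC,
+	};
+
+	ptrace(PTRACE_BTS_CONFIG, pid, &cfg, sizeof(cfg));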
+*/
+struct ptrace_bts_config {
+	/* requested or actual size of BTS buffer in bytes */
+	__u32 size;
+	/* bitmask of below flags */
+	__u32 flags;
+	/* buffer overflow signal */
+	__u32 signal;
+	/* actual size of bts_struct in bytes */
+	__u32 bts_size;
+};
+#endif /* __ASSEMBLY__ */
+
+#define PTRACE_BTS_O_TRACE	0x1 /* branch trace */
+#define PTRACE_BTS_O_SCHED	0x2 /* scheduling events w/ jiffies */
+#define PTRACE_BTS_O_SIGNAL	0x4 /* send SIG<signal> on buffer overflow
+				       instead of wrapping around */
+#define PTRACE_BTS_O_ALLOC	0x8 /* (re)allocate buffer */
+
+#define PTRACE_BTS_CONFIG	40
+/* Configure branch trace recording.
+   ADDR points to a struct ptrace_bts_config.
+   DATA gives the size of that buffer.
+   A new buffer is allocated if requested in the flags.
+   An overflow signal may only be requested for new buffers.
+   Returns the number of bytes read.
+*/
+#define PTRACE_BTS_STATUS	41
+/* Return the current configuration in a struct ptrace_bts_config
+   pointed to by ADDR; DATA gives the size of that buffer.
+   Returns the number of bytes written.
+*/
+#define PTRACE_BTS_SIZE		42
+/* Return the number of available BTS records for draining.
+   DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_GET		43
+/* Get a single BTS record.
+   DATA defines the index into the BTS array, where 0 is the newest
+   entry, and higher indices refer to older entries.
+   ADDR points to a struct bts_struct (see asm/ds.h).
+*/
+#define PTRACE_BTS_CLEAR	44
+/* Clear the BTS buffer.
+   DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_DRAIN	45
+/* Read all available BTS records and clear the buffer.
+   ADDR points to an array of struct bts_struct.
+   DATA gives the size of that buffer.
+   BTS records are read from oldest to newest.
+   Returns the number of BTS records drained.
+*/
+#endif /* CONFIG_X86_PTRACE_BTS */
+
+#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
new file mode 100644
index 000000000000..d1531c8480b7
--- /dev/null
+++ b/arch/x86/include/asm/ptrace.h
@@ -0,0 +1,280 @@
+#ifndef _ASM_X86_PTRACE_H
+#define _ASM_X86_PTRACE_H
+
+#include <linux/compiler.h>	/* For __user */
+#include <asm/ptrace-abi.h>
+#include <asm/processor-flags.h>
+
+#ifdef __KERNEL__
+#include <asm/ds.h>		/* the DS BTS struct is used for ptrace too */
+#include <asm/segment.h>
+#endif
+
+#ifndef __ASSEMBLY__
+
+#ifdef __i386__
+/* this struct defines the way the registers are stored on the
+   stack during a system call.
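+
+   (Illustration only, not part of this patch: with the register
+   indices from asm/ptrace-abi.h, a 32-bit tracer can fetch a single
+   slot, e.g.
+
+	long ip = ptrace(PTRACE_PEEKUSER, pid,
+			 (void *)(EIP * sizeof(long)), NULL);
+
+   since the user area seen by ptrace mirrors this layout.)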
+ */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+	long ebx;
+	long ecx;
+	long edx;
+	long esi;
+	long edi;
+	long ebp;
+	long eax;
+	int  xds;
+	int  xes;
+	int  xfs;
+	/* int  gs; */
+	long orig_eax;
+	long eip;
+	int  xcs;
+	long eflags;
+	long esp;
+	int  xss;
+};
+
+#else /* __KERNEL__ */
+
+struct pt_regs {
+	unsigned long bx;
+	unsigned long cx;
+	unsigned long dx;
+	unsigned long si;
+	unsigned long di;
+	unsigned long bp;
+	unsigned long ax;
+	unsigned long ds;
+	unsigned long es;
+	unsigned long fs;
+	/* int  gs; */
+	unsigned long orig_ax;
+	unsigned long ip;
+	unsigned long cs;
+	unsigned long flags;
+	unsigned long sp;
+	unsigned long ss;
+};
+
+#endif /* __KERNEL__ */
+
+#else /* __i386__ */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+	unsigned long r15;
+	unsigned long r14;
+	unsigned long r13;
+	unsigned long r12;
+	unsigned long rbp;
+	unsigned long rbx;
+/* arguments: non-interrupt/non-tracing syscalls only save up to here */
+	unsigned long r11;
+	unsigned long r10;
+	unsigned long r9;
+	unsigned long r8;
+	unsigned long rax;
+	unsigned long rcx;
+	unsigned long rdx;
+	unsigned long rsi;
+	unsigned long rdi;
+	unsigned long orig_rax;
+/* end of arguments */
+/* cpu exception frame or undefined */
+	unsigned long rip;
+	unsigned long cs;
+	unsigned long eflags;
+	unsigned long rsp;
+	unsigned long ss;
+/* top of stack page */
+};
+
+#else /* __KERNEL__ */
+
+struct pt_regs {
+	unsigned long r15;
+	unsigned long r14;
+	unsigned long r13;
+	unsigned long r12;
+	unsigned long bp;
+	unsigned long bx;
+/* arguments: non-interrupt/non-tracing syscalls only save up to here */
+	unsigned long r11;
+	unsigned long r10;
+	unsigned long r9;
+	unsigned long r8;
+	unsigned long ax;
+	unsigned long cx;
+	unsigned long dx;
+	unsigned long si;
+	unsigned long di;
+	unsigned long orig_ax;
+/* end of arguments */
+/* cpu exception frame or undefined */
+	unsigned long ip;
+	unsigned long cs;
+	unsigned long flags;
+	unsigned long sp;
+	unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __KERNEL__ */
+#endif /* !__i386__ */
+
+
+#ifdef CONFIG_X86_PTRACE_BTS
+/* a branch trace record entry
+ *
+ * In order to unify the interface between various processor versions,
+ * we use the below data structure for all processors.
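+ *
+ * A consumer might decode drained records like this (sketch only;
+ * handle_branch() and handle_sched_event() are placeholders, not part
+ * of this patch):
+ *
+ *	switch (bts.qualifier) {
+ *	case BTS_BRANCH:
+ *		handle_branch(bts.variant.lbr.from_ip,
+ *			      bts.variant.lbr.to_ip);
+ *		break;
+ *	case BTS_TASK_ARRIVES:
+ *	case BTS_TASK_DEPARTS:
+ *		handle_sched_event(bts.variant.jiffies);
+ *		break;
+ *	}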
+ */ +enum bts_qualifier { + BTS_INVALID = 0, + BTS_BRANCH, + BTS_TASK_ARRIVES, + BTS_TASK_DEPARTS +}; + +struct bts_struct { + __u64 qualifier; + union { + /* BTS_BRANCH */ + struct { + __u64 from_ip; + __u64 to_ip; + } lbr; + /* BTS_TASK_ARRIVES or + BTS_TASK_DEPARTS */ + __u64 jiffies; + } variant; +}; +#endif /* CONFIG_X86_PTRACE_BTS */ + +#ifdef __KERNEL__ + +#include <linux/init.h> + +struct cpuinfo_x86; +struct task_struct; + +#ifdef CONFIG_X86_PTRACE_BTS +extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *); +extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier); +#else +#define ptrace_bts_init_intel(config) do {} while (0) +#endif /* CONFIG_X86_PTRACE_BTS */ + +extern unsigned long profile_pc(struct pt_regs *regs); + +extern unsigned long +convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); +extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code); +void signal_fault(struct pt_regs *regs, void __user *frame, char *where); + +extern long syscall_trace_enter(struct pt_regs *); +extern void syscall_trace_leave(struct pt_regs *); + +static inline unsigned long regs_return_value(struct pt_regs *regs) +{ + return regs->ax; +} + +/* + * user_mode_vm(regs) determines whether a register set came from user mode. + * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. This tricky test checks that with + * one comparison. Many places in the kernel can bypass this full check + * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + */ +static inline int user_mode(struct pt_regs *regs) +{ +#ifdef CONFIG_X86_32 + return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; +#else + return !!(regs->cs & 3); +#endif +} + +static inline int user_mode_vm(struct pt_regs *regs) +{ +#ifdef CONFIG_X86_32 + return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= + USER_RPL; +#else + return user_mode(regs); +#endif +} + +static inline int v8086_mode(struct pt_regs *regs) +{ +#ifdef CONFIG_X86_32 + return (regs->flags & X86_VM_MASK); +#else + return 0; /* No V86 mode support in long mode */ +#endif +} + +/* + * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode + * when it traps. So regs will be the current sp. + * + * This is valid only for kernel mode traps. + */ +static inline unsigned long kernel_trap_sp(struct pt_regs *regs) +{ +#ifdef CONFIG_X86_32 + return (unsigned long)regs; +#else + return regs->sp; +#endif +} + +static inline unsigned long instruction_pointer(struct pt_regs *regs) +{ + return regs->ip; +} + +static inline unsigned long frame_pointer(struct pt_regs *regs) +{ + return regs->bp; +} + +static inline unsigned long user_stack_pointer(struct pt_regs *regs) +{ + return regs->sp; +} + +/* + * These are defined as per linux/ptrace.h, which see. 
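+ *
+ * (For instance, the generic PTRACE_SINGLESTEP request ends up in
+ * user_enable_single_step(child), and PTRACE_SINGLEBLOCK in
+ * user_enable_block_step(child).)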
+ */
+#define arch_has_single_step()	(1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
+extern void user_enable_block_step(struct task_struct *);
+#ifdef CONFIG_X86_DEBUGCTLMSR
+#define arch_has_block_step()	(1)
+#else
+#define arch_has_block_step()	(boot_cpu_data.x86 >= 6)
+#endif
+
+struct user_desc;
+extern int do_get_thread_area(struct task_struct *p, int idx,
+			      struct user_desc __user *info);
+extern int do_set_thread_area(struct task_struct *p, int idx,
+			      struct user_desc __user *info, int can_allocate);
+
+#define __ARCH_WANT_COMPAT_SYS_PTRACE
+
+#endif /* __KERNEL__ */
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index 000000000000..6d93508f2626
--- /dev/null
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_X86_PVCLOCK_ABI_H
+#define _ASM_X86_PVCLOCK_ABI_H
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM use this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update, so the guest can use the tsc delta to get a
+ * more precise system time.  There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: the hypervisor increments it
+ * (making it odd) before it starts updating the fields, and increments
+ * it again (making it even) when it is done.  Thus the guest can make
+ * sure the time values it got are consistent by checking that the
+ * version is even and unchanged before and after reading them.
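+ *
+ * A guest-side reader therefore loops until it sees a stable, even
+ * version (sketch only; pvclock_clocksource_read() implements the
+ * real thing):
+ *
+ *	u32 version;
+ *	do {
+ *		version = src->version;
+ *		rmb();
+ *		... copy tsc_timestamp, system_time, tsc_to_system_mul,
+ *		    tsc_shift ...
+ *		rmb();
+ *	} while ((version & 1) || (version != src->version));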
+ */
+
+struct pvclock_vcpu_time_info {
+	u32   version;
+	u32   pad0;
+	u64   tsc_timestamp;
+	u64   system_time;
+	u32   tsc_to_system_mul;
+	s8    tsc_shift;
+	u8    pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+	u32   version;
+	u32   sec;
+	u32   nsec;
+} __attribute__((__packed__));
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
new file mode 100644
index 000000000000..53235fd5f8ce
--- /dev/null
+++ b/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_PVCLOCK_H
+#define _ASM_X86_PVCLOCK_H
+
+#include <linux/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+/* some helper functions for xen and kvm pv clock sources */
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
+			    struct pvclock_vcpu_time_info *vcpu,
+			    struct timespec *ts);
+
+#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
new file mode 100644
index 000000000000..df7710354f85
--- /dev/null
+++ b/arch/x86/include/asm/reboot.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_REBOOT_H
+#define _ASM_X86_REBOOT_H
+
+struct pt_regs;
+
+struct machine_ops {
+	void (*restart)(char *cmd);
+	void (*halt)(void);
+	void (*power_off)(void);
+	void (*shutdown)(void);
+	void (*crash_shutdown)(struct pt_regs *);
+	void (*emergency_restart)(void);
+};
+
+extern struct machine_ops machine_ops;
+
+void native_machine_crash_shutdown(struct pt_regs *regs);
+void native_machine_shutdown(void);
+void machine_real_restart(const unsigned char *code, int length);
+
+#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h
new file mode 100644
index 000000000000..765debe4c54c
--- /dev/null
+++ b/arch/x86/include/asm/reboot_fixups.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_REBOOT_FIXUPS_H
+#define _ASM_X86_REBOOT_FIXUPS_H
+
+extern void mach_reboot_fixups(void);
+
+#endif /* _ASM_X86_REBOOT_FIXUPS_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
new file mode 100644
index 000000000000..d5cd6c586881
--- /dev/null
+++ b/arch/x86/include/asm/required-features.h
@@ -0,0 +1,82 @@
+#ifndef _ASM_X86_REQUIRED_FEATURES_H
+#define _ASM_X86_REQUIRED_FEATURES_H
+
+/* Define the minimum CPUID feature set for the kernel.  These bits are
+   checked really early to actually display a visible error message before
+   the kernel dies.  Make sure to assign features to the proper mask!
+
+   Some requirements that are not in CPUID yet are also in the
+   CONFIG_X86_MINIMUM_CPU_FAMILY, which is checked too.
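+
+   (For example, a kernel built with CONFIG_X86_PAE sets NEED_PAE below,
+   so booting it on a CPU without the PAE feature bit produces an early,
+   readable error instead of a crash later on.)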
+ + The real information is in arch/x86/Kconfig.cpu, this just converts + the CONFIGs into a bitmask */ + +#ifndef CONFIG_MATH_EMULATION +# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) +#else +# define NEED_FPU 0 +#endif + +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) +#else +# define NEED_PAE 0 +#endif + +#ifdef CONFIG_X86_CMPXCHG64 +# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) +#else +# define NEED_CX8 0 +#endif + +#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) +# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) +#else +# define NEED_CMOV 0 +#endif + +#ifdef CONFIG_X86_USE_3DNOW +# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31)) +#else +# define NEED_3DNOW 0 +#endif + +#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) +# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) +#else +# define NEED_NOPL 0 +#endif + +#ifdef CONFIG_X86_64 +#define NEED_PSE 0 +#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) +#define NEED_PGE (1<<(X86_FEATURE_PGE & 31)) +#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) +#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) +#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) +#define NEED_LM (1<<(X86_FEATURE_LM & 31)) +#else +#define NEED_PSE 0 +#define NEED_MSR 0 +#define NEED_PGE 0 +#define NEED_FXSR 0 +#define NEED_XMM 0 +#define NEED_XMM2 0 +#define NEED_LM 0 +#endif + +#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ + NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ + NEED_XMM|NEED_XMM2) +#define SSE_MASK (NEED_XMM|NEED_XMM2) + +#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) + +#define REQUIRED_MASK2 0 +#define REQUIRED_MASK3 (NEED_NOPL) +#define REQUIRED_MASK4 0 +#define REQUIRED_MASK5 0 +#define REQUIRED_MASK6 0 +#define REQUIRED_MASK7 0 + +#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/resource.h b/arch/x86/include/asm/resource.h new file mode 100644 index 000000000000..04bc4db8921b --- /dev/null +++ b/arch/x86/include/asm/resource.h @@ -0,0 +1 @@ +#include <asm-generic/resource.h> diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h new file mode 100644 index 000000000000..3ff1c2cb1da5 --- /dev/null +++ b/arch/x86/include/asm/resume-trace.h @@ -0,0 +1,21 @@ +#ifndef _ASM_X86_RESUME_TRACE_H +#define _ASM_X86_RESUME_TRACE_H + +#include <asm/asm.h> + +#define TRACE_RESUME(user) \ +do { \ + if (pm_trace_enabled) { \ + const void *tracedata; \ + asm volatile(_ASM_MOV " $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n\t" \ + _ASM_PTR " %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_resume_trace(tracedata, user); \ + } \ +} while (0) + +#endif /* _ASM_X86_RESUME_TRACE_H */ diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h new file mode 100644 index 000000000000..97bab6388a92 --- /dev/null +++ b/arch/x86/include/asm/rio.h @@ -0,0 +1,63 @@ +/* + * Derived from include/asm-x86/mach-summit/mach_mpparse.h + * and include/asm-x86/mach-default/bios_ebda.h + * + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + */ + +#ifndef _ASM_X86_RIO_H +#define _ASM_X86_RIO_H + +#define RIO_TABLE_VERSION 3 + +struct rio_table_hdr { + u8 version; /* Version number of this data structure */ + u8 num_scal_dev; /* # of Scalability devices */ + u8 num_rio_dev; /* # of RIO I/O devices */ +} __attribute__((packed)); + +struct scal_detail { + u8 node_id; /* Scalability Node ID */ + u32 CBAR; /* Address of 1MB register space */ + u8 port0node; /* Node ID port connected to: 0xFF=None */ + u8 
port0port;	/* Port num port connected to: 0,1,2, or */
+				/* 0xFF=None */
+	u8	port1node;	/* Node ID port connected to: 0xFF = None */
+	u8	port1port;	/* Port num port connected to: 0,1,2, or */
+				/* 0xFF=None */
+	u8	port2node;	/* Node ID port connected to: 0xFF = None */
+	u8	port2port;	/* Port num port connected to: 0,1,2, or */
+				/* 0xFF=None */
+	u8	chassis_num;	/* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+	u8	node_id;	/* RIO Node ID */
+	u32	BBAR;		/* Address of 1MB register space */
+	u8	type;		/* Type of device */
+	u8	owner_id;	/* Node ID of Hurricane that owns this */
+				/* node */
+	u8	port0node;	/* Node ID port connected to: 0xFF=None */
+	u8	port0port;	/* Port num port connected to: 0,1,2, or */
+				/* 0xFF=None */
+	u8	port1node;	/* Node ID port connected to: 0xFF=None */
+	u8	port1port;	/* Port num port connected to: 0,1,2, or */
+				/* 0xFF=None */
+	u8	first_slot;	/* Lowest slot number below this Calgary */
+	u8	status;		/* Bit 0 = 1 : the XAPIC is used */
+				/*	 = 0 : the XAPIC is not used, ie: */
+				/*	   ints fwded to another XAPIC */
+				/*	   Bits1:7 Reserved */
+	u8	WP_index;	/* instance index - lower ones have */
+				/*   lower slot numbers/PCI bus numbers */
+	u8	chassis_num;	/* 1 based Chassis number */
+} __attribute__((packed));
+
+enum {
+	HURR_SCALABILTY	= 0,	/* Hurricane Scalability info */
+	HURR_RIOIB	= 2,	/* Hurricane RIOIB info */
+	COMPAT_CALGARY	= 4,	/* Compatibility Calgary */
+	ALT_CALGARY	= 5,	/* Second Planar Calgary */
+};
+
+#endif /* _ASM_X86_RIO_H */
diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h
new file mode 100644
index 000000000000..f71c3b0ed360
--- /dev/null
+++ b/arch/x86/include/asm/rtc.h
@@ -0,0 +1 @@
+#include <asm-generic/rtc.h>
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
new file mode 100644
index 000000000000..6a8c0d645108
--- /dev/null
+++ b/arch/x86/include/asm/rwlock.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_X86_RWLOCK_H
+#define _ASM_X86_RWLOCK_H
+
+#define RW_LOCK_BIAS		 0x01000000
+
+/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
+
+#endif /* _ASM_X86_RWLOCK_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
new file mode 100644
index 000000000000..ca7517d33776
--- /dev/null
+++ b/arch/x86/include/asm/rwsem.h
@@ -0,0 +1,265 @@
+/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ *
+ * Derived from asm-x86/semaphore.h
+ *
+ *
+ * The MSW of the count is the negated number of active writers and waiting
+ * lockers, and the LSW is the total number of active locks.
+ *
+ * The lock count is initialized to 0 (no active and no waiting lockers).
+ *
+ * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an
+ * uncontended lock.  This can be determined because XADD returns the old
+ * value.  Readers increment by 1 and see a positive value when uncontended,
+ * negative if there are writers (and maybe readers) waiting (in which case
+ * the reader goes to sleep).
+ *
+ * The value of WAITING_BIAS supports up to 32766 waiting processes.  This can
+ * be extended to 65534 by manually checking the whole MSW rather than relying
+ * on the S flag.
+ *
+ * The value of ACTIVE_BIAS supports up to 65535 active processes.
+ *
+ * This should be totally fair - if anything is waiting, a process that wants a
+ * lock will go to the back of the queue.
+ * When the currently active lock is released, if there's a writer at the
+ * front of the queue, then that and only that will be woken up; if there's
+ * a bunch of consecutive readers at the front, then they'll all be woken up,
+ * but no other readers will be.
+ */
+
+#ifndef _ASM_X86_RWSEM_H
+#define _ASM_X86_RWSEM_H
+
+#ifndef _LINUX_RWSEM_H
+#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
+#endif
+
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+
+struct rwsem_waiter;
+
+extern asmregparm struct rw_semaphore *
+ rwsem_down_read_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_down_write_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_wake(struct rw_semaphore *);
+extern asmregparm struct rw_semaphore *
+ rwsem_downgrade_wake(struct rw_semaphore *sem);
+
+/*
+ * the semaphore definition
+ */
+
+#define RWSEM_UNLOCKED_VALUE		0x00000000
+#define RWSEM_ACTIVE_BIAS		0x00000001
+#define RWSEM_ACTIVE_MASK		0x0000ffff
+#define RWSEM_WAITING_BIAS		(-0x00010000)
+#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+
+struct rw_semaphore {
+	signed long		count;
+	spinlock_t		wait_lock;
+	struct list_head	wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
+#else
+# define __RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
+
+#define __RWSEM_INITIALIZER(name)					\
+{									\
+	RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
+	LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name)	\
+}
+
+#define DECLARE_RWSEM(name)					\
+	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+
+extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
+			 struct lock_class_key *key);
+
+#define init_rwsem(sem)						\
+do {								\
+	static struct lock_class_key __key;			\
+								\
+	__init_rwsem((sem), #sem, &__key);			\
+} while (0)
+
+/*
+ * lock for reading
+ */
+static inline void __down_read(struct rw_semaphore *sem)
+{
+	asm volatile("# beginning down_read\n\t"
+		     LOCK_PREFIX " incl (%%eax)\n\t"
+		     /* adds 0x00000001, returns the old value */
+		     " jns 1f\n"
+		     " call call_rwsem_down_read_failed\n"
+		     "1:\n\t"
+		     "# ending down_read\n\t"
+		     : "+m" (sem->count)
+		     : "a" (sem)
+		     : "memory", "cc");
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+	__s32 result, tmp;
+	asm volatile("# beginning __down_read_trylock\n\t"
+		     " movl %0,%1\n\t"
+		     "1:\n\t"
+		     " movl %1,%2\n\t"
+		     " addl %3,%2\n\t"
+		     " jle 2f\n\t"
+		     LOCK_PREFIX " cmpxchgl %2,%0\n\t"
+		     " jnz 1b\n\t"
+		     "2:\n\t"
+		     "# ending __down_read_trylock\n\t"
+		     : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
+		     : "i" (RWSEM_ACTIVE_READ_BIAS)
+		     : "memory", "cc");
+	return result >= 0 ? 1 : 0;
+}
+
+/*
+ * lock for writing
+ */
+static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+	int tmp;
+
+	tmp = RWSEM_ACTIVE_WRITE_BIAS;
+	asm volatile("# beginning down_write\n\t"
+		     LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+		     /* subtract 0x0000ffff, returns the old value */
+		     " testl %%edx,%%edx\n\t"
+		     /* was the count 0 before?
*/ + " jz 1f\n" + " call call_rwsem_down_write_failed\n" + "1:\n" + "# ending down_write" + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (tmp) + : "memory", "cc"); +} + +static inline void __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + signed long ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + if (ret == RWSEM_UNLOCKED_VALUE) + return 1; + return 0; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ + __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + asm volatile("# beginning __up_read\n\t" + LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + /* subtracts 1, returns the old value */ + " jns 1f\n\t" + " call call_rwsem_wake\n" + "1:\n" + "# ending __up_read\n" + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (tmp) + : "memory", "cc"); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + asm volatile("# beginning __up_write\n\t" + " movl %2,%%edx\n\t" + LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" + /* tries to transition + 0xffff0001 -> 0x00000000 */ + " jz 1f\n" + " call call_rwsem_wake\n" + "1:\n\t" + "# ending __up_write\n" + : "+m" (sem->count) + : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) + : "memory", "cc", "edx"); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + asm volatile("# beginning __downgrade_write\n\t" + LOCK_PREFIX " addl %2,(%%eax)\n\t" + /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + " jns 1f\n\t" + " call call_rwsem_downgrade_wake\n" + "1:\n\t" + "# ending __downgrade_write\n" + : "+m" (sem->count) + : "a" (sem), "i" (-RWSEM_WAITING_BIAS) + : "memory", "cc"); +} + +/* + * implement atomic add functionality + */ +static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "+m" (sem->count) + : "ir" (delta)); +} + +/* + * implement exchange and add functionality + */ +static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +{ + int tmp = delta; + + asm volatile(LOCK_PREFIX "xadd %0,%1" + : "+r" (tmp), "+m" (sem->count) + : : "memory"); + + return tmp + delta; +} + +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + +#endif /* __KERNEL__ */ +#endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h new file mode 100644 index 000000000000..263d397d2eef --- /dev/null +++ b/arch/x86/include/asm/scatterlist.h @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_SCATTERLIST_H +#define _ASM_X86_SCATTERLIST_H + +#include <asm/types.h> + +struct scatterlist { +#ifdef CONFIG_DEBUG_SG + unsigned long sg_magic; +#endif + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; + unsigned int dma_length; +}; + +#define ARCH_HAS_SG_CHAIN +#define ISA_DMA_THRESHOLD (0x00ffffff) + +/* + * These macros should be used after a pci_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries pci_map_sg + * returns. 
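+ *
+ * Illustrative sketch (not part of this patch; setup_dma_channel()
+ * stands in for whatever actually programs the device):
+ *
+ *	int i, nents = pci_map_sg(pdev, sg, count, direction);
+ *
+ *	for (i = 0; i < nents; i++)
+ *		setup_dma_channel(sg_dma_address(&sg[i]),
+ *				  sg_dma_len(&sg[i]));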
+ */ +#define sg_dma_address(sg) ((sg)->dma_address) +#ifdef CONFIG_X86_32 +# define sg_dma_len(sg) ((sg)->length) +#else +# define sg_dma_len(sg) ((sg)->dma_length) +#endif + +#endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h new file mode 100644 index 000000000000..c62e58a5a90d --- /dev/null +++ b/arch/x86/include/asm/seccomp.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "seccomp_32.h" +#else +# include "seccomp_64.h" +#endif diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h new file mode 100644 index 000000000000..a6ad87b352c4 --- /dev/null +++ b/arch/x86/include/asm/seccomp_32.h @@ -0,0 +1,17 @@ +#ifndef _ASM_X86_SECCOMP_32_H +#define _ASM_X86_SECCOMP_32_H + +#include <linux/thread_info.h> + +#ifdef TIF_32BIT +#error "unexpected TIF_32BIT on i386" +#endif + +#include <linux/unistd.h> + +#define __NR_seccomp_read __NR_read +#define __NR_seccomp_write __NR_write +#define __NR_seccomp_exit __NR_exit +#define __NR_seccomp_sigreturn __NR_sigreturn + +#endif /* _ASM_X86_SECCOMP_32_H */ diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h new file mode 100644 index 000000000000..4171bb794e9e --- /dev/null +++ b/arch/x86/include/asm/seccomp_64.h @@ -0,0 +1,25 @@ +#ifndef _ASM_X86_SECCOMP_64_H +#define _ASM_X86_SECCOMP_64_H + +#include <linux/thread_info.h> + +#ifdef TIF_32BIT +#error "unexpected TIF_32BIT on x86_64" +#else +#define TIF_32BIT TIF_IA32 +#endif + +#include <linux/unistd.h> +#include <asm/ia32_unistd.h> + +#define __NR_seccomp_read __NR_read +#define __NR_seccomp_write __NR_write +#define __NR_seccomp_exit __NR_exit +#define __NR_seccomp_sigreturn __NR_rt_sigreturn + +#define __NR_seccomp_read_32 __NR_ia32_read +#define __NR_seccomp_write_32 __NR_ia32_write +#define __NR_seccomp_exit_32 __NR_ia32_exit +#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn + +#endif /* _ASM_X86_SECCOMP_64_H */ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h new file mode 100644 index 000000000000..2b8c5160388f --- /dev/null +++ b/arch/x86/include/asm/sections.h @@ -0,0 +1 @@ +#include <asm-generic/sections.h> diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h new file mode 100644 index 000000000000..1dc1b51ac623 --- /dev/null +++ b/arch/x86/include/asm/segment.h @@ -0,0 +1,209 @@ +#ifndef _ASM_X86_SEGMENT_H +#define _ASM_X86_SEGMENT_H + +/* Constructor for a conventional segment GDT (or LDT) entry */ +/* This is a macro so it can be used in initializers */ +#define GDT_ENTRY(flags, base, limit) \ + ((((base) & 0xff000000ULL) << (56-24)) | \ + (((flags) & 0x0000f0ffULL) << 40) | \ + (((limit) & 0x000f0000ULL) << (48-16)) | \ + (((base) & 0x00ffffffULL) << 16) | \ + (((limit) & 0x0000ffffULL))) + +/* Simple and small GDT entries for booting only */ + +#define GDT_ENTRY_BOOT_CS 2 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) + +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) + +#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) + +#ifdef CONFIG_X86_32 +/* + * The layout of the per-CPU GDT under Linux: + * + * 0 - null + * 1 - reserved + * 2 - reserved + * 3 - reserved + * + * 4 - unused <==== new cacheline + * 5 - unused + * + * ------- start of TLS (Thread-Local Storage) segments: + * + * 6 - TLS segment #1 [ glibc's TLS segment ] + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] + * 8 - TLS segment #3 + * 9 - reserved + * 10 - 
reserved + * 11 - reserved + * + * ------- start of kernel segments: + * + * 12 - kernel code segment <==== new cacheline + * 13 - kernel data segment + * 14 - default user CS + * 15 - default user DS + * 16 - TSS + * 17 - LDT + * 18 - PNPBIOS support (16->32 gate) + * 19 - PNPBIOS support + * 20 - PNPBIOS support + * 21 - PNPBIOS support + * 22 - PNPBIOS support + * 23 - APM BIOS support + * 24 - APM BIOS support + * 25 - APM BIOS support + * + * 26 - ESPFIX small SS + * 27 - per-cpu [ offset to per-cpu data area ] + * 28 - unused + * 29 - unused + * 30 - unused + * 31 - TSS for double fault handler + */ +#define GDT_ENTRY_TLS_MIN 6 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) + +#define GDT_ENTRY_DEFAULT_USER_CS 14 + +#define GDT_ENTRY_DEFAULT_USER_DS 15 + +#define GDT_ENTRY_KERNEL_BASE 12 + +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) + +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) + +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) + +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) + +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) + +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#ifdef CONFIG_SMP +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +#else +#define __KERNEL_PERCPU 0 +#endif + +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 + +/* + * The GDT has 32 entries + */ +#define GDT_ENTRIES 32 + +/* The PnP BIOS entries in the GDT */ +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) + +/* The PnP BIOS selectors */ +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ + +/* Bottom two bits of selector give the ring privilege level */ +#define SEGMENT_RPL_MASK 0x3 +/* Bit 2 is table indicator (LDT/GDT) */ +#define SEGMENT_TI_MASK 0x4 + +/* User mode is privilege level 3 */ +#define USER_RPL 0x3 +/* LDT segment has TI set, GDT has it cleared */ +#define SEGMENT_LDT 0x4 +#define SEGMENT_GDT 0x0 + +/* + * Matching rules for certain types of segments. + */ + +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) + + +#else +#include <asm/cache.h> + +#define GDT_ENTRY_KERNEL32_CS 1 +#define GDT_ENTRY_KERNEL_CS 2 +#define GDT_ENTRY_KERNEL_DS 3 + +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) + +/* + * we cannot use the same code segment descriptor for user and kernel + * -- not even in the long flat mode, because of different DPL /kkeil + * The segment offset needs to contain a RPL. Grr. 
-AK + * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) + */ +#define GDT_ENTRY_DEFAULT_USER32_CS 4 +#define GDT_ENTRY_DEFAULT_USER_DS 5 +#define GDT_ENTRY_DEFAULT_USER_CS 6 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3) +#define __USER32_DS __USER_DS + +#define GDT_ENTRY_TSS 8 /* needs two entries */ +#define GDT_ENTRY_LDT 10 /* needs two entries */ +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 + +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) + +/* TLS indexes for 64bit - hardcoded in arch_prctl */ +#define FS_TLS 0 +#define GS_TLS 1 + +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) + +#define GDT_ENTRIES 16 + +#endif + +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) +#ifndef CONFIG_PARAVIRT +#define get_kernel_rpl() 0 +#endif + +/* User mode is privilege level 3 */ +#define USER_RPL 0x3 +/* LDT segment has TI set, GDT has it cleared */ +#define SEGMENT_LDT 0x4 +#define SEGMENT_GDT 0x0 + +/* Bottom two bits of selector give the ring privilege level */ +#define SEGMENT_RPL_MASK 0x3 +/* Bit 2 is table indicator (LDT/GDT) */ +#define SEGMENT_TI_MASK 0x4 + +#define IDT_ENTRIES 256 +#define NUM_EXCEPTION_VECTORS 32 +#define GDT_SIZE (GDT_ENTRIES * 8) +#define GDT_ENTRY_TLS_ENTRIES 3 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) + +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ +extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10]; +#endif +#endif + +#endif /* _ASM_X86_SEGMENT_H */ diff --git a/arch/x86/include/asm/sembuf.h b/arch/x86/include/asm/sembuf.h new file mode 100644 index 000000000000..ee50c801f7b7 --- /dev/null +++ b/arch/x86/include/asm/sembuf.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_SEMBUF_H +#define _ASM_X86_SEMBUF_H + +/* + * The semid64_ds structure for x86 architecture. + * Note extra padding because this structure is passed back and forth + * between kernel and user space. + * + * Pad space is left for: + * - 64-bit time_t to solve y2038 problem + * - 2 miscellaneous 32-bit values + */ +struct semid64_ds { + struct ipc64_perm sem_perm; /* permissions .. see ipc.h */ + __kernel_time_t sem_otime; /* last semop time */ + unsigned long __unused1; + __kernel_time_t sem_ctime; /* last change time */ + unsigned long __unused2; + unsigned long sem_nsems; /* no. of semaphores in array */ + unsigned long __unused3; + unsigned long __unused4; +}; + +#endif /* _ASM_X86_SEMBUF_H */ diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h new file mode 100644 index 000000000000..628c801535ea --- /dev/null +++ b/arch/x86/include/asm/serial.h @@ -0,0 +1,29 @@ +#ifndef _ASM_X86_SERIAL_H +#define _ASM_X86_SERIAL_H + +/* + * This assumes you have a 1.8432 MHz clock for your UART. + * + * It'd be nice if someone built a serial card with a 24.576 MHz + * clock, since the 16550A is capable of handling a top speed of 1.5 + * megabits/second; but this requires the faster clock. 
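+ *
+ * (Worked example: 1843200 / 16 == 115200, so BASE_BAUD below is the
+ * familiar 115200 baud ceiling of a 16550A on the standard clock.)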
+ */ +#define BASE_BAUD ( 1843200 / 16 ) + +/* Standard COM flags (except for COM4, because of the 8514 problem) */ +#ifdef CONFIG_SERIAL_DETECT_IRQ +#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ) +#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ) +#else +#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST) +#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF +#endif + +#define SERIAL_PORT_DFNS \ + /* UART CLK PORT IRQ FLAGS */ \ + { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \ + { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \ + { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \ + { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */ + +#endif /* _ASM_X86_SERIAL_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h new file mode 100644 index 000000000000..f12d37237465 --- /dev/null +++ b/arch/x86/include/asm/setup.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_SETUP_H +#define _ASM_X86_SETUP_H + +#define COMMAND_LINE_SIZE 2048 + +#ifndef __ASSEMBLY__ + +/* Interrupt control for vSMPowered x86_64 systems */ +void vsmp_init(void); + +#ifdef CONFIG_X86_VISWS +extern void visws_early_detect(void); +extern int is_visws_box(void); +#else +static inline void visws_early_detect(void) { } +static inline int is_visws_box(void) { return 0; } +#endif + +/* + * Any setup quirks to be performed? + */ +struct mpc_config_processor; +struct mpc_config_bus; +struct mp_config_oemtable; +struct x86_quirks { + int (*arch_pre_time_init)(void); + int (*arch_time_init)(void); + int (*arch_pre_intr_init)(void); + int (*arch_intr_init)(void); + int (*arch_trap_init)(void); + char * (*arch_memory_setup)(void); + int (*mach_get_smp_config)(unsigned int early); + int (*mach_find_smp_config)(unsigned int reserve); + + int *mpc_record; + int (*mpc_apic_id)(struct mpc_config_processor *m); + void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name); + void (*mpc_oem_pci_bus)(struct mpc_config_bus *m); + void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, + unsigned short oemsize); + int (*setup_ioapic_ids)(void); +}; + +extern struct x86_quirks *x86_quirks; +extern unsigned long saved_video_mode; + +#ifndef CONFIG_PARAVIRT +#define paravirt_post_allocator_init() do {} while (0) +#endif +#endif /* __ASSEMBLY__ */ + +#ifdef __KERNEL__ + +#ifdef __i386__ + +#include <linux/pfn.h> +/* + * Reserved space for vmalloc and iomap - defined in asm/page.h + */ +#define MAXMEM_PFN PFN_DOWN(MAXMEM) +#define MAX_NONPAE_PFN (1 << 20) + +#endif /* __i386__ */ + +#define PARAM_SIZE 4096 /* sizeof(struct boot_params) */ + +#define OLD_CL_MAGIC 0xA33F +#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */ +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ + +#ifndef __ASSEMBLY__ +#include <asm/bootparam.h> + +#ifndef _SETUP + +/* + * This is set up by the setup-routine at boot-time + */ +extern struct boot_params boot_params; + +/* + * Do NOT EVER look at the BIOS memory size location. + * It does not work on many machines. 
+ */
+#define LOWMEMSIZE()	(0x9f000)
+
+#ifdef __i386__
+
+void __init i386_start_kernel(void);
+extern void probe_roms(void);
+
+extern unsigned long init_pg_tables_start;
+extern unsigned long init_pg_tables_end;
+
+#else
+void __init x86_64_init_pda(void);
+void __init x86_64_start_kernel(char *real_mode);
+void __init x86_64_start_reservations(char *real_mode_data);
+
+#endif /* __i386__ */
+#endif /* _SETUP */
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h
new file mode 100644
index 000000000000..b51413b74971
--- /dev/null
+++ b/arch/x86/include/asm/shmbuf.h
@@ -0,0 +1,51 @@
+#ifndef _ASM_X86_SHMBUF_H
+#define _ASM_X86_SHMBUF_H
+
+/*
+ * The shmid64_ds structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space on 32 bit is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ *
+ * Pad space on 64 bit is left for:
+ * - 2 miscellaneous 64-bit values
+ */
+
+struct shmid64_ds {
+	struct ipc64_perm	shm_perm;	/* operation perms */
+	size_t			shm_segsz;	/* size of segment (bytes) */
+	__kernel_time_t		shm_atime;	/* last attach time */
+#ifdef __i386__
+	unsigned long		__unused1;
+#endif
+	__kernel_time_t		shm_dtime;	/* last detach time */
+#ifdef __i386__
+	unsigned long		__unused2;
+#endif
+	__kernel_time_t		shm_ctime;	/* last change time */
+#ifdef __i386__
+	unsigned long		__unused3;
+#endif
+	__kernel_pid_t		shm_cpid;	/* pid of creator */
+	__kernel_pid_t		shm_lpid;	/* pid of last operator */
+	unsigned long		shm_nattch;	/* no. of current attaches */
+	unsigned long		__unused4;
+	unsigned long		__unused5;
+};
+
+struct shminfo64 {
+	unsigned long	shmmax;
+	unsigned long	shmmin;
+	unsigned long	shmmni;
+	unsigned long	shmseg;
+	unsigned long	shmall;
+	unsigned long	__unused1;
+	unsigned long	__unused2;
+	unsigned long	__unused3;
+	unsigned long	__unused4;
+};
+
+#endif /* _ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h
new file mode 100644
index 000000000000..0880cf0917b9
--- /dev/null
+++ b/arch/x86/include/asm/shmparam.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_SHMPARAM_H
+#define _ASM_X86_SHMPARAM_H
+
+#define SHMLBA PAGE_SIZE	 /* attach addr a multiple of this */
+
+#endif /* _ASM_X86_SHMPARAM_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
new file mode 100644
index 000000000000..0afcb5e58acc
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext.h
@@ -0,0 +1,284 @@
+#ifndef _ASM_X86_SIGCONTEXT_H
+#define _ASM_X86_SIGCONTEXT_H
+
+#include <linux/compiler.h>
+#include <asm/types.h>
+
+#define FP_XSTATE_MAGIC1	0x46505853U
+#define FP_XSTATE_MAGIC2	0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE	sizeof(FP_XSTATE_MAGIC2)
+
+/*
+ * Bytes 464..511 in the current 512-byte layout of the fxsave/fxrstor frame
+ * are reserved for SW usage.  On CPUs supporting xsave/xrstor, these bytes
+ * are used to extend the fpstate pointer in the sigcontext, which now
+ * includes the extended state information along with the fpstate information.
+ *
+ * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
+ * area and FP_XSTATE_MAGIC2 at the end of the memory layout
+ * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the
+ * extended state information in the memory layout pointed to by the fpstate
+ * pointer in the sigcontext.
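+ *
+ * A user-space consumer might verify both magics before trusting the
+ * extended layout (sketch only; fpstate is the pointer taken from the
+ * sigcontext):
+ *
+ *	sw = (struct _fpx_sw_bytes *)&fpstate->sw_reserved;
+ *	end = *(__u32 *)((void *)fpstate + sw->extended_size
+ *			 - FP_XSTATE_MAGIC2_SIZE);
+ *	if (sw->magic1 == FP_XSTATE_MAGIC1 && end == FP_XSTATE_MAGIC2)
+ *		... extended state is present ...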
+ */ +struct _fpx_sw_bytes { + __u32 magic1; /* FP_XSTATE_MAGIC1 */ + __u32 extended_size; /* total size of the layout referred by + * fpstate pointer in the sigcontext. + */ + __u64 xstate_bv; + /* feature bit mask (including fp/sse/extended + * state) that is present in the memory + * layout. + */ + __u32 xstate_size; /* actual xsave state size, based on the + * features saved in the layout. + * 'extended_size' will be greater than + * 'xstate_size'. + */ + __u32 padding[7]; /* for future use. */ +}; + +#ifdef __i386__ +/* + * As documented in the iBCS2 standard.. + * + * The first part of "struct _fpstate" is just the normal i387 + * hardware setup, the extra "status" word is used to save the + * coprocessor status word before entering the handler. + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * The FPU state data structure has had to grow to accommodate the + * extended FPU state required by the Streaming SIMD Extensions. + * There is no documented standard to accomplish this at the moment. + */ +struct _fpreg { + unsigned short significand[4]; + unsigned short exponent; +}; + +struct _fpxreg { + unsigned short significand[4]; + unsigned short exponent; + unsigned short padding[3]; +}; + +struct _xmmreg { + unsigned long element[4]; +}; + +struct _fpstate { + /* Regular FPU environment */ + unsigned long cw; + unsigned long sw; + unsigned long tag; + unsigned long ipoff; + unsigned long cssel; + unsigned long dataoff; + unsigned long datasel; + struct _fpreg _st[8]; + unsigned short status; + unsigned short magic; /* 0xffff = regular FPU data only */ + + /* FXSR FPU environment */ + unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ + unsigned long mxcsr; + unsigned long reserved; + struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ + struct _xmmreg _xmm[8]; + unsigned long padding1[44]; + + union { + unsigned long padding2[12]; + struct _fpx_sw_bytes sw_reserved; /* represents the extended + * state info */ + }; +}; + +#define X86_FXSR_MAGIC 0x0000 + +#ifdef __KERNEL__ +struct sigcontext { + unsigned short gs, __gsh; + unsigned short fs, __fsh; + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned long di; + unsigned long si; + unsigned long bp; + unsigned long sp; + unsigned long bx; + unsigned long dx; + unsigned long cx; + unsigned long ax; + unsigned long trapno; + unsigned long err; + unsigned long ip; + unsigned short cs, __csh; + unsigned long flags; + unsigned long sp_at_signal; + unsigned short ss, __ssh; + + /* + * fpstate is really (struct _fpstate *) or (struct _xstate *) + * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved + * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end + * of extended memory layout. 
See comments at the definition of
+	 * (struct _fpx_sw_bytes)
+	 */
+	void __user *fpstate;		/* zero when no FPU/extended context */
+	unsigned long oldmask;
+	unsigned long cr2;
+};
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+	unsigned short gs, __gsh;
+	unsigned short fs, __fsh;
+	unsigned short es, __esh;
+	unsigned short ds, __dsh;
+	unsigned long edi;
+	unsigned long esi;
+	unsigned long ebp;
+	unsigned long esp;
+	unsigned long ebx;
+	unsigned long edx;
+	unsigned long ecx;
+	unsigned long eax;
+	unsigned long trapno;
+	unsigned long err;
+	unsigned long eip;
+	unsigned short cs, __csh;
+	unsigned long eflags;
+	unsigned long esp_at_signal;
+	unsigned short ss, __ssh;
+	struct _fpstate __user *fpstate;
+	unsigned long oldmask;
+	unsigned long cr2;
+};
+#endif /* !__KERNEL__ */
+
+#else /* __i386__ */
+
+/* FXSAVE frame */
+/* Note: reserved1/2 may someday contain valuable data.  Always save/restore
+   them when you change signal frames. */
+struct _fpstate {
+	__u16	cwd;
+	__u16	swd;
+	__u16	twd;		/* Note this is not the same as the
+				   32-bit/x87/FSAVE twd */
+	__u16	fop;
+	__u64	rip;
+	__u64	rdp;
+	__u32	mxcsr;
+	__u32	mxcsr_mask;
+	__u32	st_space[32];	/* 8*16 bytes for each FP-reg */
+	__u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg */
+	__u32	reserved2[12];
+	union {
+		__u32	reserved3[12];
+		struct _fpx_sw_bytes sw_reserved; /* represents the extended
+						   * state information */
+	};
+};
+
+#ifdef __KERNEL__
+struct sigcontext {
+	unsigned long r8;
+	unsigned long r9;
+	unsigned long r10;
+	unsigned long r11;
+	unsigned long r12;
+	unsigned long r13;
+	unsigned long r14;
+	unsigned long r15;
+	unsigned long di;
+	unsigned long si;
+	unsigned long bp;
+	unsigned long bx;
+	unsigned long dx;
+	unsigned long ax;
+	unsigned long cx;
+	unsigned long sp;
+	unsigned long ip;
+	unsigned long flags;
+	unsigned short cs;
+	unsigned short gs;
+	unsigned short fs;
+	unsigned short __pad0;
+	unsigned long err;
+	unsigned long trapno;
+	unsigned long oldmask;
+	unsigned long cr2;
+
+	/*
+	 * fpstate is really (struct _fpstate *) or (struct _xstate *)
+	 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+	 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+	 * of extended memory layout.  See comments at the definition of
+	 * (struct _fpx_sw_bytes)
+	 */
+	void __user *fpstate;		/* zero when no FPU/extended context */
+	unsigned long reserved1[8];
+};
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+	unsigned long r8;
+	unsigned long r9;
+	unsigned long r10;
+	unsigned long r11;
+	unsigned long r12;
+	unsigned long r13;
+	unsigned long r14;
+	unsigned long r15;
+	unsigned long rdi;
+	unsigned long rsi;
+	unsigned long rbp;
+	unsigned long rbx;
+	unsigned long rdx;
+	unsigned long rax;
+	unsigned long rcx;
+	unsigned long rsp;
+	unsigned long rip;
+	unsigned long eflags;		/* RFLAGS */
+	unsigned short cs;
+	unsigned short gs;
+	unsigned short fs;
+	unsigned short __pad0;
+	unsigned long err;
+	unsigned long trapno;
+	unsigned long oldmask;
+	unsigned long cr2;
+	struct _fpstate __user *fpstate;	/* zero when no FPU context */
+	unsigned long reserved1[8];
+};
+#endif /* !__KERNEL__ */
+
+#endif /* !__i386__ */
+
+struct _xsave_hdr {
+	__u64 xstate_bv;
+	__u64 reserved1[2];
+	__u64 reserved2[5];
+};
+
+/*
+ * Extended state pointed to by the fpstate pointer in the sigcontext.
+ * In addition to the fpstate, information encoded in the xstate_hdr + * indicates the presence of other extended state information + * supported by the processor and OS. + */ +struct _xstate { + struct _fpstate fpstate; + struct _xsave_hdr xstate_hdr; + /* new processor state extensions go here */ +}; + +#endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/asm/sigcontext32.h new file mode 100644 index 000000000000..6126188cf3a9 --- /dev/null +++ b/arch/x86/include/asm/sigcontext32.h @@ -0,0 +1,75 @@ +#ifndef _ASM_X86_SIGCONTEXT32_H +#define _ASM_X86_SIGCONTEXT32_H + +/* signal context for 32bit programs. */ + +#define X86_FXSR_MAGIC 0x0000 + +struct _fpreg { + unsigned short significand[4]; + unsigned short exponent; +}; + +struct _fpxreg { + unsigned short significand[4]; + unsigned short exponent; + unsigned short padding[3]; +}; + +struct _xmmreg { + __u32 element[4]; +}; + +/* FSAVE frame with extensions */ +struct _fpstate_ia32 { + /* Regular FPU environment */ + __u32 cw; + __u32 sw; + __u32 tag; /* not compatible to 64bit twd */ + __u32 ipoff; + __u32 cssel; + __u32 dataoff; + __u32 datasel; + struct _fpreg _st[8]; + unsigned short status; + unsigned short magic; /* 0xffff = regular FPU data only */ + + /* FXSR FPU environment */ + __u32 _fxsr_env[6]; + __u32 mxcsr; + __u32 reserved; + struct _fpxreg _fxsr_st[8]; + struct _xmmreg _xmm[8]; /* It's actually 16 */ + __u32 padding[44]; + union { + __u32 padding2[12]; + struct _fpx_sw_bytes sw_reserved; + }; +}; + +struct sigcontext_ia32 { + unsigned short gs, __gsh; + unsigned short fs, __fsh; + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned int di; + unsigned int si; + unsigned int bp; + unsigned int sp; + unsigned int bx; + unsigned int dx; + unsigned int cx; + unsigned int ax; + unsigned int trapno; + unsigned int err; + unsigned int ip; + unsigned short cs, __csh; + unsigned int flags; + unsigned int sp_at_signal; + unsigned short ss, __ssh; + unsigned int fpstate; /* really (struct _fpstate_ia32 *) */ + unsigned int oldmask; + unsigned int cr2; +}; + +#endif /* _ASM_X86_SIGCONTEXT32_H */ diff --git a/arch/x86/include/asm/siginfo.h b/arch/x86/include/asm/siginfo.h new file mode 100644 index 000000000000..fc1aa5535646 --- /dev/null +++ b/arch/x86/include/asm/siginfo.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_SIGINFO_H +#define _ASM_X86_SIGINFO_H + +#ifdef __x86_64__ +# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) +#endif + +#include <asm-generic/siginfo.h> + +#endif /* _ASM_X86_SIGINFO_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h new file mode 100644 index 000000000000..96ac44f275da --- /dev/null +++ b/arch/x86/include/asm/signal.h @@ -0,0 +1,262 @@ +#ifndef _ASM_X86_SIGNAL_H +#define _ASM_X86_SIGNAL_H + +#ifndef __ASSEMBLY__ +#include <linux/types.h> +#include <linux/time.h> +#include <linux/compiler.h> + +/* Avoid too many header ordering problems. */ +struct siginfo; + +#ifdef __KERNEL__ +#include <linux/linkage.h> + +/* Most things should be clean enough to redefine this at will, if care + is taken to make libc match. */ + +#define _NSIG 64 + +#ifdef __i386__ +# define _NSIG_BPW 32 +#else +# define _NSIG_BPW 64 +#endif + +#define _NSIG_WORDS (_NSIG / _NSIG_BPW) + +typedef unsigned long old_sigset_t; /* at least 32 bits */ + +typedef struct { + unsigned long sig[_NSIG_WORDS]; +} sigset_t; + +#else +/* Here we must cater to libcs that poke about in kernel headers. 
*/ + +#define NSIG 32 +typedef unsigned long sigset_t; + +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ + +#define SIGHUP 1 +#define SIGINT 2 +#define SIGQUIT 3 +#define SIGILL 4 +#define SIGTRAP 5 +#define SIGABRT 6 +#define SIGIOT 6 +#define SIGBUS 7 +#define SIGFPE 8 +#define SIGKILL 9 +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGPIPE 13 +#define SIGALRM 14 +#define SIGTERM 15 +#define SIGSTKFLT 16 +#define SIGCHLD 17 +#define SIGCONT 18 +#define SIGSTOP 19 +#define SIGTSTP 20 +#define SIGTTIN 21 +#define SIGTTOU 22 +#define SIGURG 23 +#define SIGXCPU 24 +#define SIGXFSZ 25 +#define SIGVTALRM 26 +#define SIGPROF 27 +#define SIGWINCH 28 +#define SIGIO 29 +#define SIGPOLL SIGIO +/* +#define SIGLOST 29 +*/ +#define SIGPWR 30 +#define SIGSYS 31 +#define SIGUNUSED 31 + +/* These should not be considered constants from userland. */ +#define SIGRTMIN 32 +#define SIGRTMAX _NSIG + +/* + * SA_FLAGS values: + * + * SA_ONSTACK indicates that a registered stack_t will be used. + * SA_RESTART flag to get restarting signals (which were the default long ago) + * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. + * SA_RESETHAND clears the handler when the signal is delivered. + * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. + * SA_NODEFER prevents the current signal from being masked in the handler. + * + * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single + * Unix names RESETHAND and NODEFER respectively. + */ +#define SA_NOCLDSTOP 0x00000001u +#define SA_NOCLDWAIT 0x00000002u +#define SA_SIGINFO 0x00000004u +#define SA_ONSTACK 0x08000000u +#define SA_RESTART 0x10000000u +#define SA_NODEFER 0x40000000u +#define SA_RESETHAND 0x80000000u + +#define SA_NOMASK SA_NODEFER +#define SA_ONESHOT SA_RESETHAND + +#define SA_RESTORER 0x04000000 + +/* + * sigaltstack controls + */ +#define SS_ONSTACK 1 +#define SS_DISABLE 2 + +#define MINSIGSTKSZ 2048 +#define SIGSTKSZ 8192 + +#include <asm-generic/signal.h> + +#ifndef __ASSEMBLY__ + +#ifdef __i386__ +# ifdef __KERNEL__ +struct old_sigaction { + __sighandler_t sa_handler; + old_sigset_t sa_mask; + unsigned long sa_flags; + __sigrestore_t sa_restorer; +}; + +struct sigaction { + __sighandler_t sa_handler; + unsigned long sa_flags; + __sigrestore_t sa_restorer; + sigset_t sa_mask; /* mask last for extensibility */ +}; + +struct k_sigaction { + struct sigaction sa; +}; + +extern void do_notify_resume(struct pt_regs *, void *, __u32); + +# else /* __KERNEL__ */ +/* Here we must cater to libcs that poke about in kernel headers. */ + +struct sigaction { + union { + __sighandler_t _sa_handler; + void (*_sa_sigaction)(int, struct siginfo *, void *); + } _u; + sigset_t sa_mask; + unsigned long sa_flags; + void (*sa_restorer)(void); +}; + +#define sa_handler _u._sa_handler +#define sa_sigaction _u._sa_sigaction + +# endif /* ! __KERNEL__ */ +#else /* __i386__ */ + +struct sigaction { + __sighandler_t sa_handler; + unsigned long sa_flags; + __sigrestore_t sa_restorer; + sigset_t sa_mask; /* mask last for extensibility */ +}; + +struct k_sigaction { + struct sigaction sa; +}; + +#endif /* !__i386__ */ + +typedef struct sigaltstack { + void __user *ss_sp; + int ss_flags; + size_t ss_size; +} stack_t; + +#ifdef __KERNEL__ +#include <asm/sigcontext.h> + +#ifdef __i386__ + +#define __HAVE_ARCH_SIG_BITOPS + +#define sigaddset(set,sig) \ + (__builtin_constant_p(sig) \ + ? 
__const_sigaddset((set), (sig)) \ + : __gen_sigaddset((set), (sig))) + +static inline void __gen_sigaddset(sigset_t *set, int _sig) +{ + asm("btsl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc"); +} + +static inline void __const_sigaddset(sigset_t *set, int _sig) +{ + unsigned long sig = _sig - 1; + set->sig[sig / _NSIG_BPW] |= 1 << (sig % _NSIG_BPW); +} + +#define sigdelset(set, sig) \ + (__builtin_constant_p(sig) \ + ? __const_sigdelset((set), (sig)) \ + : __gen_sigdelset((set), (sig))) + + +static inline void __gen_sigdelset(sigset_t *set, int _sig) +{ + asm("btrl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc"); +} + +static inline void __const_sigdelset(sigset_t *set, int _sig) +{ + unsigned long sig = _sig - 1; + set->sig[sig / _NSIG_BPW] &= ~(1 << (sig % _NSIG_BPW)); +} + +static inline int __const_sigismember(sigset_t *set, int _sig) +{ + unsigned long sig = _sig - 1; + return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW)); +} + +static inline int __gen_sigismember(sigset_t *set, int _sig) +{ + int ret; + asm("btl %2,%1\n\tsbbl %0,%0" + : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); + return ret; +} + +#define sigismember(set, sig) \ + (__builtin_constant_p(sig) \ + ? __const_sigismember((set), (sig)) \ + : __gen_sigismember((set), (sig))) + +static inline int sigfindinword(unsigned long word) +{ + asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc"); + return word; +} + +struct pt_regs; + +#else /* __i386__ */ + +#undef __HAVE_ARCH_SIG_BITOPS + +#endif /* !__i386__ */ + +#define ptrace_signal_deliver(regs, cookie) do { } while (0) + +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SIGNAL_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h new file mode 100644 index 000000000000..2766021aef80 --- /dev/null +++ b/arch/x86/include/asm/smp.h @@ -0,0 +1,229 @@ +#ifndef _ASM_X86_SMP_H +#define _ASM_X86_SMP_H +#ifndef __ASSEMBLY__ +#include <linux/cpumask.h> +#include <linux/init.h> +#include <asm/percpu.h> + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifdef CONFIG_X86_LOCAL_APIC +# include <asm/mpspec.h> +# include <asm/apic.h> +# ifdef CONFIG_X86_IO_APIC +# include <asm/io_apic.h> +# endif +#endif +#include <asm/pda.h> +#include <asm/thread_info.h> + +extern cpumask_t cpu_callout_map; +extern cpumask_t cpu_initialized; +extern cpumask_t cpu_callin_map; + +extern void (*mtrr_hook)(void); +extern void zap_low_mappings(void); + +extern int __cpuinit get_local_pda(int cpu); + +extern int smp_num_siblings; +extern unsigned int num_processors; +extern cpumask_t cpu_initialized; + +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_t, cpu_core_map); +DECLARE_PER_CPU(u16, cpu_llc_id); +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(int, cpu_number); +#endif + +DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); + +/* Static state in head.S used to set up a CPU */ +extern struct { + void *sp; + unsigned short ss; +} stack_start; + +struct smp_ops { + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + void (*smp_cpus_done)(unsigned max_cpus); + + void (*smp_send_stop)(void); + void (*smp_send_reschedule)(int cpu); + + int (*cpu_up)(unsigned cpu); + int (*cpu_disable)(void); + void (*cpu_die)(unsigned int cpu); + void (*play_dead)(void); + + void (*send_call_func_ipi)(cpumask_t mask); + void (*send_call_func_single_ipi)(int cpu); +}; + +/* Globals due to paravirt */ +extern void set_cpu_sibling_map(int cpu); + +#ifdef CONFIG_SMP 
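+
+/*
+ * Illustrative sketch, not part of this patch: a paravirt backend
+ * installs its own table during early boot, e.g.
+ *
+ *	smp_ops = xen_smp_ops;
+ *
+ * after which the inline wrappers below transparently dispatch to the
+ * hypervisor-specific implementations.
+ */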
+#ifndef CONFIG_PARAVIRT +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0) +#endif +extern struct smp_ops smp_ops; + +static inline void smp_send_stop(void) +{ + smp_ops.smp_send_stop(); +} + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} + +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} + +static inline int __cpu_disable(void) +{ + return smp_ops.cpu_disable(); +} + +static inline void __cpu_die(unsigned int cpu) +{ + smp_ops.cpu_die(cpu); +} + +static inline void play_dead(void) +{ + smp_ops.play_dead(); +} + +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} + +static inline void arch_send_call_function_single_ipi(int cpu) +{ + smp_ops.send_call_func_single_ipi(cpu); +} + +static inline void arch_send_call_function_ipi(cpumask_t mask) +{ + smp_ops.send_call_func_ipi(mask); +} + +void cpu_disable_common(void); +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +void native_smp_cpus_done(unsigned int max_cpus); +int native_cpu_up(unsigned int cpunum); +int native_cpu_disable(void); +void native_cpu_die(unsigned int cpu); +void native_play_dead(void); +void play_dead_common(void); + +void native_send_call_func_ipi(cpumask_t mask); +void native_send_call_func_single_ipi(int cpu); + +extern void prefill_possible_map(void); + +void smp_store_cpu_info(int id); +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) + +/* We don't mark CPUs online until __cpu_up(), so we need another measure */ +static inline int num_booting_cpus(void) +{ + return cpus_weight(cpu_callout_map); +} +#else +static inline void prefill_possible_map(void) +{ +} +#endif /* CONFIG_SMP */ + +extern unsigned disabled_cpus __cpuinitdata; + +#ifdef CONFIG_X86_32_SMP +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. 
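The smp_ops wrappers above illustrate the kernel's ops-table pattern: a struct of function pointers behind thin inline wrappers, so a paravirt backend can replace the native SMP implementations at boot simply by rewriting the table. A minimal user-space sketch of that indirection follows; the demo_* names are invented for illustration and are not part of this patch.

#include <stdio.h>

/* Invented stand-in for struct smp_ops: one pointer per operation. */
struct demo_smp_ops {
    void (*send_reschedule)(int cpu);
};

static void native_send_reschedule(int cpu)
{
    printf("native: reschedule IPI to CPU %d\n", cpu);
}

/* Filled with the native implementations by default; a paravirt
 * backend would overwrite the table before secondary CPUs start. */
static struct demo_smp_ops demo_ops = {
    .send_reschedule = native_send_reschedule,
};

/* Thin wrapper, like smp_send_reschedule() above: callers never
 * see which backend is installed. */
static inline void demo_send_reschedule(int cpu)
{
    demo_ops.send_reschedule(cpu);
}

int main(void)
{
    demo_send_reschedule(1);
    return 0;
}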
+ */ +#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) +extern int safe_smp_processor_id(void); + +#elif defined(CONFIG_X86_64_SMP) +#define raw_smp_processor_id() read_pda(cpunumber) + +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#define safe_smp_processor_id() smp_processor_id() + +#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */ +#define cpu_physical_id(cpu) boot_cpu_physical_apicid +#define safe_smp_processor_id() 0 +#define stack_smp_processor_id() 0 +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + +#ifndef CONFIG_X86_64 +static inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); +} + +#include <mach_apicdef.h> +static inline unsigned int read_apic_id(void) +{ + unsigned int reg; + + reg = *(u32 *)(APIC_BASE + APIC_ID); + + return GET_APIC_ID(reg); +} +#endif + + +# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64) +extern int hard_smp_processor_id(void); +# else +#include <mach_apicdef.h> +static inline int hard_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return read_apic_id(); +} +# endif /* APIC_DEFINITION */ + +#else /* CONFIG_X86_LOCAL_APIC */ + +# ifndef CONFIG_SMP +# define hard_smp_processor_id() 0 +# endif + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h new file mode 100644 index 000000000000..8ab9cc8b2ecc --- /dev/null +++ b/arch/x86/include/asm/socket.h @@ -0,0 +1,57 @@ +#ifndef _ASM_X86_SOCKET_H +#define _ASM_X86_SOCKET_H + +#include <asm/sockios.h> + +/* For setsockopt(2) */ +#define SOL_SOCKET 1 + +#define SO_DEBUG 1 +#define SO_REUSEADDR 2 +#define SO_TYPE 3 +#define SO_ERROR 4 +#define SO_DONTROUTE 5 +#define SO_BROADCAST 6 +#define SO_SNDBUF 7 +#define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 +#define SO_KEEPALIVE 9 +#define SO_OOBINLINE 10 +#define SO_NO_CHECK 11 +#define SO_PRIORITY 12 +#define SO_LINGER 13 +#define SO_BSDCOMPAT 14 +/* To add :#define SO_REUSEPORT 15 */ +#define SO_PASSCRED 16 +#define SO_PEERCRED 17 +#define SO_RCVLOWAT 18 +#define SO_SNDLOWAT 19 +#define SO_RCVTIMEO 20 +#define SO_SNDTIMEO 21 + +/* Security levels - as per NRL IPv6 - don't actually do anything */ +#define SO_SECURITY_AUTHENTICATION 22 +#define SO_SECURITY_ENCRYPTION_TRANSPORT 23 +#define SO_SECURITY_ENCRYPTION_NETWORK 24 + +#define SO_BINDTODEVICE 25 + +/* Socket filtering */ +#define SO_ATTACH_FILTER 26 +#define SO_DETACH_FILTER 27 + +#define SO_PEERNAME 28 +#define SO_TIMESTAMP 29 +#define SCM_TIMESTAMP SO_TIMESTAMP + +#define SO_ACCEPTCONN 30 + +#define SO_PEERSEC 31 +#define SO_PASSSEC 34 +#define SO_TIMESTAMPNS 35 +#define SCM_TIMESTAMPNS SO_TIMESTAMPNS + +#define SO_MARK 36 + +#endif /* _ASM_X86_SOCKET_H */ diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h new file mode 100644 index 000000000000..49cc72b5d3c9 --- /dev/null +++ b/arch/x86/include/asm/sockios.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_SOCKIOS_H +#define _ASM_X86_SOCKIOS_H + +/* Socket-level I/O control calls. 
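For context on the SOL_SOCKET option numbers defined in asm/socket.h above: userspace reaches them through setsockopt(2) and getsockopt(2), where the level/option pair selects the entry. A self-contained POSIX example (ordinary socket API usage, not part of this patch):

#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int one = 1, err = 0;
    socklen_t len = sizeof(err);

    if (fd < 0)
        return 1;

    /* SO_REUSEADDR is option 2 at level SOL_SOCKET (1), as above. */
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)))
        perror("setsockopt");

    /* SO_ERROR (4) fetches and clears the pending socket error. */
    if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0)
        printf("pending error: %d\n", err);
    return 0;
}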
*/ +#define FIOSETOWN 0x8901 +#define SIOCSPGRP 0x8902 +#define FIOGETOWN 0x8903 +#define SIOCGPGRP 0x8904 +#define SIOCATMARK 0x8905 +#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ + +#endif /* _ASM_X86_SOCKIOS_H */ diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h new file mode 100644 index 000000000000..be44f7dab395 --- /dev/null +++ b/arch/x86/include/asm/sparsemem.h @@ -0,0 +1,34 @@ +#ifndef _ASM_X86_SPARSEMEM_H +#define _ASM_X86_SPARSEMEM_H + +#ifdef CONFIG_SPARSEMEM +/* + * generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the flags + * field of the struct page + * + * SECTION_SIZE_BITS 2^n: size of each section + * MAX_PHYSADDR_BITS 2^n: max size of physical address space + * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space + * + */ + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define SECTION_SIZE_BITS 29 +# define MAX_PHYSADDR_BITS 36 +# define MAX_PHYSMEM_BITS 36 +# else +# define SECTION_SIZE_BITS 26 +# define MAX_PHYSADDR_BITS 32 +# define MAX_PHYSMEM_BITS 32 +# endif +#else /* CONFIG_X86_32 */ +# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ +# define MAX_PHYSADDR_BITS 44 +# define MAX_PHYSMEM_BITS 44 +#endif + +#endif /* CONFIG_SPARSEMEM */ +#endif /* _ASM_X86_SPARSEMEM_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h new file mode 100644 index 000000000000..d17c91981da2 --- /dev/null +++ b/arch/x86/include/asm/spinlock.h @@ -0,0 +1,364 @@ +#ifndef _ASM_X86_SPINLOCK_H +#define _ASM_X86_SPINLOCK_H + +#include <asm/atomic.h> +#include <asm/rwlock.h> +#include <asm/page.h> +#include <asm/processor.h> +#include <linux/compiler.h> +#include <asm/paravirt.h> +/* + * Your basic SMP spinlocks, allowing only a single CPU anywhere + * + * Simple spin lock operations. There are two variants, one clears IRQ's + * on the local processor, one does not. + * + * These are fair FIFO ticket locks, which are currently limited to 256 + * CPUs. + * + * (the type definitions are in asm/spinlock_types.h) + */ + +#ifdef CONFIG_X86_32 +# define LOCK_PTR_REG "a" +# define REG_PTR_MODE "k" +#else +# define LOCK_PTR_REG "D" +# define REG_PTR_MODE "q" +#endif + +#if defined(CONFIG_X86_32) && \ + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)) +/* + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock + * (PPro errata 66, 92) + */ +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX +#else +# define UNLOCK_LOCK_PREFIX +#endif + +/* + * Ticket locks are conceptually two parts, one indicating the current head of + * the queue, and the other indicating the current tail. The lock is acquired + * by atomically noting the tail and incrementing it by one (thus adding + * ourselves to the queue and noting our position), then waiting until the head + * becomes equal to the initial value of the tail. + * + * We use an xadd covering *both* parts of the lock, to increment the tail and + * also load the position of the head, which takes care of memory ordering + * issues and should be optimal for the uncontended case. Note the tail must be + * in the high part, because a wide xadd increment of the low part would carry + * up and contaminate the high part. + * + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to + * save some instructions and make the code more elegant.
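A C11 model may help make the ticket scheme concrete; it is only a sketch, not the kernel's code. For clarity it keeps head and tail in separate words, whereas the real lock packs them into one value precisely so that a single xadd can both take a ticket and observe the head at once.

#include <stdatomic.h>
#include <assert.h>

typedef struct {
    _Atomic unsigned int head;   /* ticket now being served */
    _Atomic unsigned int tail;   /* next ticket to hand out */
} demo_ticket_lock;

static void demo_ticket_lock_acquire(demo_ticket_lock *l)
{
    /* Take a number: one atomic add both enqueues us and tells us
     * where the queue currently ends. */
    unsigned int ticket = atomic_fetch_add(&l->tail, 1);

    /* FIFO wait until the head reaches our ticket. */
    while (atomic_load(&l->head) != ticket)
        ;
}

static void demo_ticket_lock_release(demo_ticket_lock *l)
{
    /* Serve the next waiter, like the incb/incw unlock variants. */
    atomic_fetch_add(&l->head, 1);
}

int main(void)
{
    demo_ticket_lock l = { 0, 0 };

    demo_ticket_lock_acquire(&l);
    demo_ticket_lock_release(&l);
    assert(atomic_load(&l.head) == atomic_load(&l.tail));
    return 0;
}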
There really isn't + * much between them in performance though, especially as locks are out of line. + */ +#if (NR_CPUS < 256) +#define TICKET_SHIFT 8 + +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) +{ + short inc = 0x0100; + + asm volatile ( + LOCK_PREFIX "xaddw %w0, %1\n" + "1:\t" + "cmpb %h0, %b0\n\t" + "je 2f\n\t" + "rep ; nop\n\t" + "movb %1, %b0\n\t" + /* don't need lfence here, because loads are in-order */ + "jmp 1b\n" + "2:" + : "+Q" (inc), "+m" (lock->slock) + : + : "memory", "cc"); +} + +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +{ + int tmp, new; + + asm volatile("movzwl %2, %0\n\t" + "cmpb %h0,%b0\n\t" + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgw %w1,%2\n\t" + "1:" + "sete %b1\n\t" + "movzbl %b1,%0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + return tmp; +} + +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +{ + asm volatile(UNLOCK_LOCK_PREFIX "incb %0" + : "+m" (lock->slock) + : + : "memory", "cc"); +} +#else +#define TICKET_SHIFT 16 + +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) +{ + int inc = 0x00010000; + int tmp; + + asm volatile(LOCK_PREFIX "xaddl %0, %1\n" + "movzwl %w0, %2\n\t" + "shrl $16, %0\n\t" + "1:\t" + "cmpl %0, %2\n\t" + "je 2f\n\t" + "rep ; nop\n\t" + "movzwl %1, %2\n\t" + /* don't need lfence here, because loads are in-order */ + "jmp 1b\n" + "2:" + : "+r" (inc), "+m" (lock->slock), "=&r" (tmp) + : + : "memory", "cc"); +} + +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +{ + int tmp; + int new; + + asm volatile("movl %2,%0\n\t" + "movl %0,%1\n\t" + "roll $16, %0\n\t" + "cmpl %0,%1\n\t" + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgl %1,%2\n\t" + "1:" + "sete %b1\n\t" + "movzbl %b1,%0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + return tmp; +} + +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +{ + asm volatile(UNLOCK_LOCK_PREFIX "incw %0" + : "+m" (lock->slock) + : + : "memory", "cc"); +} +#endif + +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) +{ + int tmp = ACCESS_ONCE(lock->slock); + + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); +} + +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) +{ + int tmp = ACCESS_ONCE(lock->slock); + + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; +} + +#ifdef CONFIG_PARAVIRT +/* + * Define virtualization-friendly old-style lock byte lock, for use in + * pv_lock_ops if desired. + * + * This differs from the pre-2.6.24 spinlock by always using xchgb + * rather than decb to take the lock; this allows it to use a + * zero-initialized lock structure. It also maintains a 1-byte + * contention counter, so that we can implement + * __byte_spin_is_contended. 
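A rough C11 model of this xchg-based byte lock, with invented demo_* names (the kernel version below uses locked incb/decb on the one-byte spinners field and rep;nop while spinning):

#include <stdatomic.h>

/* Zero-initialized lock byte plus a contention counter, as described. */
struct demo_byte_lock {
    atomic_char lock;
    atomic_char spinners;
};

static void demo_byte_lock_acquire(struct demo_byte_lock *l)
{
    for (;;) {
        /* xchgb: atomically swap in 1; an old value of 0 means we own it. */
        if (atomic_exchange(&l->lock, 1) == 0)
            return;
        /* Contended: count ourselves while we spin read-only. */
        atomic_fetch_add(&l->spinners, 1);
        while (atomic_load(&l->lock))
            ;
        atomic_fetch_sub(&l->spinners, 1);
    }
}

static void demo_byte_lock_release(struct demo_byte_lock *l)
{
    /* seq_cst store here; the kernel pairs smp_wmb() with a plain store. */
    atomic_store(&l->lock, 0);
}

int main(void)
{
    struct demo_byte_lock l = { 0, 0 };

    demo_byte_lock_acquire(&l);
    demo_byte_lock_release(&l);
    return atomic_load(&l.lock);    /* 0 on success */
}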
+ */ +struct __byte_spinlock { + s8 lock; + s8 spinners; +}; + +static inline int __byte_spin_is_locked(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + return bl->lock != 0; +} + +static inline int __byte_spin_is_contended(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + return bl->spinners != 0; +} + +static inline void __byte_spin_lock(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + s8 val = 1; + + asm("1: xchgb %1, %0\n" + " test %1,%1\n" + " jz 3f\n" + " " LOCK_PREFIX "incb %2\n" + "2: rep;nop\n" + " cmpb $1, %0\n" + " je 2b\n" + " " LOCK_PREFIX "decb %2\n" + " jmp 1b\n" + "3:" + : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory"); +} + +static inline int __byte_spin_trylock(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + u8 old = 1; + + asm("xchgb %1,%0" + : "+m" (bl->lock), "+q" (old) : : "memory"); + + return old == 0; +} + +static inline void __byte_spin_unlock(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + smp_wmb(); + bl->lock = 0; +} +#else /* !CONFIG_PARAVIRT */ +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + return __ticket_spin_is_locked(lock); +} + +static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +{ + return __ticket_spin_is_contended(lock); +} + +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +{ + __ticket_spin_lock(lock); +} + +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +{ + return __ticket_spin_trylock(lock); +} + +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +{ + __ticket_spin_unlock(lock); +} + +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) +{ + __raw_spin_lock(lock); +} + +#endif /* CONFIG_PARAVIRT */ + +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +{ + while (__raw_spin_is_locked(lock)) + cpu_relax(); +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! It is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get an + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + * + * On x86, we implement read-write locks as a 32-bit counter + * with the high bit (sign) being the "contended" bit. + */ + +/** + * read_can_lock - would read_trylock() succeed? + * @lock: the rwlock in question.
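The biased-counter scheme can likewise be modeled in C11: the counter starts at RW_LOCK_BIAS, each reader subtracts 1, and a writer subtracts the whole bias, matching the trylock paths further down. A sketch under those assumptions (demo_* names are invented):

#include <stdatomic.h>

#define RW_LOCK_BIAS 0x01000000

typedef struct { _Atomic int count; } demo_rwlock;  /* init to RW_LOCK_BIAS */

static int demo_read_trylock(demo_rwlock *rw)
{
    /* A reader takes one unit of the bias; new count >= 0 means no
     * writer held the lock. */
    if (atomic_fetch_sub(&rw->count, 1) - 1 >= 0)
        return 1;
    atomic_fetch_add(&rw->count, 1);   /* writer held it: undo */
    return 0;
}

static int demo_write_trylock(demo_rwlock *rw)
{
    /* A writer takes the entire bias; success only if the count was
     * untouched, i.e. no readers and no writer. */
    if (atomic_fetch_sub(&rw->count, RW_LOCK_BIAS) == RW_LOCK_BIAS)
        return 1;
    atomic_fetch_add(&rw->count, RW_LOCK_BIAS);
    return 0;
}

int main(void)
{
    demo_rwlock rw = { RW_LOCK_BIAS };

    /* A held read lock must exclude writers. Returns 0 on success. */
    return !(demo_read_trylock(&rw) && !demo_write_trylock(&rw));
}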
+ */ +static inline int __raw_write_can_lock(raw_rwlock_t *lock) +{ + return (lock)->lock == RW_LOCK_BIAS; +} + +static inline void __raw_read_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw) : "memory"); +} + +static inline void __raw_write_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" + "jz 1f\n" + "call __write_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); +} + +static inline int __raw_read_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + atomic_dec(count); + if (atomic_read(count) >= 0) + return 1; + atomic_inc(count); + return 0; +} + +static inline int __raw_write_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + if (atomic_sub_and_test(RW_LOCK_BIAS, count)) + return 1; + atomic_add(RW_LOCK_BIAS, count); + return 0; +} + +static inline void __raw_read_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); +} + +static inline void __raw_write_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "addl %1, %0" + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); +} + +#define _raw_spin_relax(lock) cpu_relax() +#define _raw_read_relax(lock) cpu_relax() +#define _raw_write_relax(lock) cpu_relax() + +#endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h new file mode 100644 index 000000000000..845f81c87091 --- /dev/null +++ b/arch/x86/include/asm/spinlock_types.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_SPINLOCK_TYPES_H +#define _ASM_X86_SPINLOCK_TYPES_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +# error "please don't include this file directly" +#endif + +typedef struct raw_spinlock { + unsigned int slock; +} raw_spinlock_t; + +#define __RAW_SPIN_LOCK_UNLOCKED { 0 } + +typedef struct { + unsigned int lock; +} raw_rwlock_t; + +#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } + +#endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h new file mode 100644 index 000000000000..b508d639d1a7 --- /dev/null +++ b/arch/x86/include/asm/srat.h @@ -0,0 +1,39 @@ +/* + * Some of the code in this file has been gleaned from the 64 bit + * discontigmem support code base. + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Send feedback to Pat Gaughen <gone@us.ibm.com> + */ + +#ifndef _ASM_X86_SRAT_H +#define _ASM_X86_SRAT_H + +#ifdef CONFIG_ACPI_NUMA +extern int get_memcfg_from_srat(void); +#else +static inline int get_memcfg_from_srat(void) +{ + return 0; +} +#endif + +#endif /* _ASM_X86_SRAT_H */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h new file mode 100644 index 000000000000..f517944b2b17 --- /dev/null +++ b/arch/x86/include/asm/stacktrace.h @@ -0,0 +1,21 @@ +#ifndef _ASM_X86_STACKTRACE_H +#define _ASM_X86_STACKTRACE_H + +extern int kstack_depth_to_print; + +/* Generic stack tracer with callbacks */ + +struct stacktrace_ops { + void (*warning)(void *data, char *msg); + /* msg must contain %s for the symbol */ + void (*warning_symbol)(void *data, char *msg, unsigned long symbol); + void (*address)(void *data, unsigned long address, int reliable); + /* On negative return stop dumping */ + int (*stack)(void *data, char *name); +}; + +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data); + +#endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h new file mode 100644 index 000000000000..e0b1d9bbcbc6 --- /dev/null +++ b/arch/x86/include/asm/stat.h @@ -0,0 +1,114 @@ +#ifndef _ASM_X86_STAT_H +#define _ASM_X86_STAT_H + +#define STAT_HAVE_NSEC 1 + +#ifdef __i386__ +struct stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused4; + unsigned long __unused5; +}; + +#define STAT64_HAS_BROKEN_ST_INO 1 + +/* This matches struct stat64 in glibc2.1, hence the absolutely + * insane amounts of padding around dev_t's. + */ +struct stat64 { + unsigned long long st_dev; + unsigned char __pad0[4]; + + unsigned long __st_ino; + + unsigned int st_mode; + unsigned int st_nlink; + + unsigned long st_uid; + unsigned long st_gid; + + unsigned long long st_rdev; + unsigned char __pad3[4]; + + long long st_size; + unsigned long st_blksize; + + /* Number 512-byte blocks allocated. */ + unsigned long long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + + unsigned long st_mtime; + unsigned int st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + + unsigned long long st_ino; +}; + +#else /* __i386__ */ + +struct stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + + unsigned int st_mode; + unsigned int st_uid; + unsigned int st_gid; + unsigned int __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + long st_blocks; /* Number 512-byte blocks allocated. 
*/ + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + long __unused[3]; +}; +#endif + +/* for 32bit emulation and 32 bit kernels */ +struct __old_kernel_stat { + unsigned short st_dev; + unsigned short st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; +#ifdef __i386__ + unsigned long st_size; + unsigned long st_atime; + unsigned long st_mtime; + unsigned long st_ctime; +#else + unsigned int st_size; + unsigned int st_atime; + unsigned int st_mtime; + unsigned int st_ctime; +#endif +}; + +#endif /* _ASM_X86_STAT_H */ diff --git a/arch/x86/include/asm/statfs.h b/arch/x86/include/asm/statfs.h new file mode 100644 index 000000000000..2d0adbf99a8e --- /dev/null +++ b/arch/x86/include/asm/statfs.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_STATFS_H +#define _ASM_X86_STATFS_H + +/* + * We need compat_statfs64 to be packed, because the i386 ABI won't + * add padding at the end to bring it to a multiple of 8 bytes, but + * the x86_64 ABI will. + */ +#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4))) + +#include <asm-generic/statfs.h> +#endif /* _ASM_X86_STATFS_H */ diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h new file mode 100644 index 000000000000..6dfd6d9373a0 --- /dev/null +++ b/arch/x86/include/asm/string.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "string_32.h" +#else +# include "string_64.h" +#endif diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h new file mode 100644 index 000000000000..0e0e3ba827f7 --- /dev/null +++ b/arch/x86/include/asm/string_32.h @@ -0,0 +1,326 @@ +#ifndef _ASM_X86_STRING_32_H +#define _ASM_X86_STRING_32_H + +#ifdef __KERNEL__ + +/* Let gcc decide whether to inline or use the out of line functions */ + +#define __HAVE_ARCH_STRCPY +extern char *strcpy(char *dest, const char *src); + +#define __HAVE_ARCH_STRNCPY +extern char *strncpy(char *dest, const char *src, size_t count); + +#define __HAVE_ARCH_STRCAT +extern char *strcat(char *dest, const char *src); + +#define __HAVE_ARCH_STRNCAT +extern char *strncat(char *dest, const char *src, size_t count); + +#define __HAVE_ARCH_STRCMP +extern int strcmp(const char *cs, const char *ct); + +#define __HAVE_ARCH_STRNCMP +extern int strncmp(const char *cs, const char *ct, size_t count); + +#define __HAVE_ARCH_STRCHR +extern char *strchr(const char *s, int c); + +#define __HAVE_ARCH_STRLEN +extern size_t strlen(const char *s); + +static __always_inline void *__memcpy(void *to, const void *from, size_t n) +{ + int d0, d1, d2; + asm volatile("rep ; movsl\n\t" + "movl %4,%%ecx\n\t" + "andl $3,%%ecx\n\t" + "jz 1f\n\t" + "rep ; movsb\n\t" + "1:" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from) + : "memory"); + return to; +} + +/* + * This looks ugly, but the compiler can optimize it totally, + * as the count is constant. 
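The compile-time dispatch that makes this pay off is __builtin_constant_p(), which the memcpy and memset macros in this header use to route constant-size calls to fully foldable variants. A toy version of the idiom (GCC/Clang builtin; demo_copy and both helpers are invented):

#include <stdio.h>
#include <string.h>

/* Stand-ins for __constant_memcpy and __memcpy: in the real header
 * the constant-size variant is a switch the compiler folds away. */
static inline void *copy_const(void *to, const void *from, size_t n)
{
    return memcpy(to, from, n);
}

static void *copy_generic(void *to, const void *from, size_t n)
{
    return memcpy(to, from, n);
}

#define demo_copy(t, f, n)              \
    (__builtin_constant_p(n)            \
     ? copy_const((t), (f), (n))        \
     : copy_generic((t), (f), (n)))

int main(int argc, char **argv)
{
    char dst[8];
    size_t n = (size_t)argc * 4;   /* not a compile-time constant */

    (void)argv;
    demo_copy(dst, "abcdefg", sizeof(dst));   /* constant-size path */
    demo_copy(dst, "wxyz", n > 4 ? 4 : n);    /* generic path */
    printf("%.7s\n", dst);
    return 0;
}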
+ */ +static __always_inline void *__constant_memcpy(void *to, const void *from, + size_t n) +{ + long esi, edi; + if (!n) + return to; + + switch (n) { + case 1: + *(char *)to = *(char *)from; + return to; + case 2: + *(short *)to = *(short *)from; + return to; + case 4: + *(int *)to = *(int *)from; + return to; + + case 3: + *(short *)to = *(short *)from; + *((char *)to + 2) = *((char *)from + 2); + return to; + case 5: + *(int *)to = *(int *)from; + *((char *)to + 4) = *((char *)from + 4); + return to; + case 6: + *(int *)to = *(int *)from; + *((short *)to + 2) = *((short *)from + 2); + return to; + case 8: + *(int *)to = *(int *)from; + *((int *)to + 1) = *((int *)from + 1); + return to; + } + + esi = (long)from; + edi = (long)to; + if (n >= 5 * 4) { + /* large block: use rep prefix */ + int ecx; + asm volatile("rep ; movsl" + : "=&c" (ecx), "=&D" (edi), "=&S" (esi) + : "0" (n / 4), "1" (edi), "2" (esi) + : "memory" + ); + } else { + /* small block: don't clobber ecx + smaller code */ + if (n >= 4 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + if (n >= 3 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + if (n >= 2 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + if (n >= 1 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + } + switch (n % 4) { + /* tail */ + case 0: + return to; + case 1: + asm volatile("movsb" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + return to; + case 2: + asm volatile("movsw" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + return to; + default: + asm volatile("movsw\n\tmovsb" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + return to; + } +} + +#define __HAVE_ARCH_MEMCPY + +#ifdef CONFIG_X86_USE_3DNOW + +#include <asm/mmx.h> + +/* + * This CPU favours 3DNow strongly (eg AMD Athlon) + */ + +static inline void *__constant_memcpy3d(void *to, const void *from, size_t len) +{ + if (len < 512) + return __constant_memcpy(to, from, len); + return _mmx_memcpy(to, from, len); +} + +static inline void *__memcpy3d(void *to, const void *from, size_t len) +{ + if (len < 512) + return __memcpy(to, from, len); + return _mmx_memcpy(to, from, len); +} + +#define memcpy(t, f, n) \ + (__builtin_constant_p((n)) \ + ? __constant_memcpy3d((t), (f), (n)) \ + : __memcpy3d((t), (f), (n))) + +#else + +/* + * No 3D Now! + */ + +#define memcpy(t, f, n) \ + (__builtin_constant_p((n)) \ + ? __constant_memcpy((t), (f), (n)) \ + : __memcpy((t), (f), (n))) + +#endif + +#define __HAVE_ARCH_MEMMOVE +void *memmove(void *dest, const void *src, size_t n); + +#define memcmp __builtin_memcmp + +#define __HAVE_ARCH_MEMCHR +extern void *memchr(const void *cs, int c, size_t count); + +static inline void *__memset_generic(void *s, char c, size_t count) +{ + int d0, d1; + asm volatile("rep\n\t" + "stosb" + : "=&c" (d0), "=&D" (d1) + : "a" (c), "1" (s), "0" (count) + : "memory"); + return s; +} + +/* we might want to write optimized versions of these later */ +#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count)) + +/* + * memset(x, 0, y) is a reasonably common thing to do, so we want to fill + * things 32 bits at a time even when we don't know the size of the + * area at compile-time.. 
+ */ +static __always_inline +void *__constant_c_memset(void *s, unsigned long c, size_t count) +{ + int d0, d1; + asm volatile("rep ; stosl\n\t" + "testb $2,%b3\n\t" + "je 1f\n\t" + "stosw\n" + "1:\ttestb $1,%b3\n\t" + "je 2f\n\t" + "stosb\n" + "2:" + : "=&c" (d0), "=&D" (d1) + : "a" (c), "q" (count), "0" (count/4), "1" ((long)s) + : "memory"); + return s; +} + +/* Added by Gertjan van Wingerde to make minix and sysv module work */ +#define __HAVE_ARCH_STRNLEN +extern size_t strnlen(const char *s, size_t count); +/* end of additional stuff */ + +#define __HAVE_ARCH_STRSTR +extern char *strstr(const char *cs, const char *ct); + +/* + * This looks horribly ugly, but the compiler can optimize it totally, + * as by now we know that both pattern and count are constant. + */ +static __always_inline +void *__constant_c_and_count_memset(void *s, unsigned long pattern, + size_t count) +{ + switch (count) { + case 0: + return s; + case 1: + *(unsigned char *)s = pattern & 0xff; + return s; + case 2: + *(unsigned short *)s = pattern & 0xffff; + return s; + case 3: + *(unsigned short *)s = pattern & 0xffff; + *((unsigned char *)s + 2) = pattern & 0xff; + return s; + case 4: + *(unsigned long *)s = pattern; + return s; + } + +#define COMMON(x) \ + asm volatile("rep ; stosl" \ + x \ + : "=&c" (d0), "=&D" (d1) \ + : "a" (eax), "0" (count/4), "1" ((long)s) \ + : "memory") + + { + int d0, d1; +#if __GNUC__ == 4 && __GNUC_MINOR__ == 0 + /* Workaround for broken gcc 4.0 */ + register unsigned long eax asm("%eax") = pattern; +#else + unsigned long eax = pattern; +#endif + + switch (count % 4) { + case 0: + COMMON(""); + return s; + case 1: + COMMON("\n\tstosb"); + return s; + case 2: + COMMON("\n\tstosw"); + return s; + default: + COMMON("\n\tstosw\n\tstosb"); + return s; + } + } + +#undef COMMON +} + +#define __constant_c_x_memset(s, c, count) \ + (__builtin_constant_p(count) \ + ? __constant_c_and_count_memset((s), (c), (count)) \ + : __constant_c_memset((s), (c), (count))) + +#define __memset(s, c, count) \ + (__builtin_constant_p(count) \ + ? __constant_count_memset((s), (c), (count)) \ + : __memset_generic((s), (c), (count))) + +#define __HAVE_ARCH_MEMSET +#define memset(s, c, count) \ + (__builtin_constant_p(c) \ + ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ + (count)) \ + : __memset((s), (c), (count))) + +/* + * find the first occurrence of byte 'c', or 1 past the area if none + */ +#define __HAVE_ARCH_MEMSCAN +extern void *memscan(void *addr, int c, size_t size); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_STRING_32_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h new file mode 100644 index 000000000000..2afe164bf1e6 --- /dev/null +++ b/arch/x86/include/asm/string_64.h @@ -0,0 +1,60 @@ +#ifndef _ASM_X86_STRING_64_H +#define _ASM_X86_STRING_64_H + +#ifdef __KERNEL__ + +/* Written 2002 by Andi Kleen */ + +/* Only used for special circumstances. Stolen from i386/string.h */ +static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n) +{ + unsigned long d0, d1, d2; + asm volatile("rep ; movsl\n\t" + "testb $2,%b4\n\t" + "je 1f\n\t" + "movsw\n" + "1:\ttestb $1,%b4\n\t" + "je 2f\n\t" + "movsb\n" + "2:" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from) + : "memory"); + return to; +} + +/* Even with __builtin_ the compiler may decide to use the out-of-line + function.
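One detail of the memset() macro above is worth spelling out: multiplying the byte by 0x01010101UL replicates it into all four lanes of a 32-bit word, so the fill can run word-at-a-time with only a small tail. A portable sketch of that strategy (demo_memset is invented; the kernel uses rep stosl plus stosw/stosb instead of the loops):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void *demo_memset(void *s, int c, size_t n)
{
    /* 0xAB becomes 0xABABABAB: the byte copied into every lane. */
    uint32_t pattern = 0x01010101UL * (unsigned char)c;
    unsigned char *p = s;

    while (n >= 4) {               /* word stores, like rep ; stosl */
        memcpy(p, &pattern, 4);    /* memcpy avoids alignment traps */
        p += 4;
        n -= 4;
    }
    while (n--)                    /* byte tail, like stosb */
        *p++ = (unsigned char)c;
    return s;
}

int main(void)
{
    unsigned char buf[11];

    demo_memset(buf, 0xAB, sizeof(buf));
    printf("%02x %02x\n", buf[0], buf[10]);   /* ab ab */
    return 0;
}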
*/ + +#define __HAVE_ARCH_MEMCPY 1 +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 +extern void *memcpy(void *to, const void *from, size_t len); +#else +extern void *__memcpy(void *to, const void *from, size_t len); +#define memcpy(dst, src, len) \ +({ \ + size_t __len = (len); \ + void *__ret; \ + if (__builtin_constant_p(len) && __len >= 64) \ + __ret = __memcpy((dst), (src), __len); \ + else \ + __ret = __builtin_memcpy((dst), (src), __len); \ + __ret; \ +}) +#endif + +#define __HAVE_ARCH_MEMSET +void *memset(void *s, int c, size_t n); + +#define __HAVE_ARCH_MEMMOVE +void *memmove(void *dest, const void *src, size_t count); + +int memcmp(const void *cs, const void *ct, size_t count); +size_t strlen(const char *s); +char *strcpy(char *dest, const char *src); +char *strcat(char *dest, const char *src); +int strcmp(const char *cs, const char *ct); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h new file mode 100644 index 000000000000..9b3070f1c2ac --- /dev/null +++ b/arch/x86/include/asm/summit/apic.h @@ -0,0 +1,184 @@ +#ifndef __ASM_SUMMIT_APIC_H +#define __ASM_SUMMIT_APIC_H + +#include <asm/smp.h> + +#define esr_disable (1) +#define NO_BALANCE_IRQ (0) + +/* In clustered mode, the high nibble of APIC ID is a cluster number. + * The low nibble is a 4-bit bitmap. */ +#define XAPIC_DEST_CPUS_SHIFT 4 +#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) +#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) + +#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) + +static inline cpumask_t target_cpus(void) +{ + /* CPU_MASK_ALL (0xff) has undefined behaviour with + * dest_LowestPrio mode logical clustered apic interrupt routing + * Just start on cpu 0. IRQ balancing will spread load + */ + return cpumask_of_cpu(0); +} + +#define INT_DELIVERY_MODE (dest_LowestPrio) +#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ + +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return 0; +} + +/* we don't use the phys_cpu_present_map to indicate apicid presence */ +static inline unsigned long check_apicid_present(int bit) +{ + return 1; +} + +#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK) + +extern u8 cpu_2_logical_apicid[]; + +static inline void init_apic_ldr(void) +{ + unsigned long val, id; + int count = 0; + u8 my_id = (u8)hard_smp_processor_id(); + u8 my_cluster = (u8)apicid_cluster(my_id); +#ifdef CONFIG_SMP + u8 lid; + int i; + + /* Create logical APIC IDs by counting CPUs already in cluster. */ + for (count = 0, i = NR_CPUS; --i >= 0; ) { + lid = cpu_2_logical_apicid[i]; + if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster) + ++count; + } +#endif + /* We only have a 4 wide bitmap in cluster mode. If a deranged + * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ + BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); + id = my_cluster | (1UL << count); + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write(APIC_LDR, val); +} + +static inline int multi_timer_check(int apic, int irq) +{ + return 0; +} + +static inline int apic_id_registered(void) +{ + return 1; +} + +static inline void setup_apic_routing(void) +{ + printk("Enabling APIC mode: Summit. 
Using %d I/O APICs\n", + nr_ioapics); +} + +static inline int apicid_to_node(int logical_apicid) +{ +#ifdef CONFIG_SMP + return apicid_2_node[hard_smp_processor_id()]; +#else + return 0; +#endif +} + +/* Mapping from cpu number to logical apicid */ +static inline int cpu_to_logical_apicid(int cpu) +{ +#ifdef CONFIG_SMP + if (cpu >= NR_CPUS) + return BAD_APICID; + return (int)cpu_2_logical_apicid[cpu]; +#else + return logical_smp_processor_id(); +#endif +} + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < NR_CPUS) + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; +} + +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map) +{ + /* For clustered we don't have a good way to do this yet - hack */ + return physids_promote(0x0F); +} + +static inline physid_mask_t apicid_to_cpu_present(int apicid) +{ + return physid_mask_of_physid(0); +} + +static inline void setup_portio_remap(void) +{ +} + +static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +{ + return 1; +} + +static inline void enable_apic_mode(void) +{ +} + +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid; + + num_bits_set = cpus_weight(cpumask); + /* Return id to all */ + if (num_bits_set == NR_CPUS) + return (int) 0xFF; + /* + * The cpus in the mask must all be on the same apic cluster. If they + * are not, return the default value of TARGET_CPUS. + */ + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); + return 0xFF; + } + apicid = apicid | new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +/* cpuid returns the value latched in the HW at reset, not the APIC ID + * register's value. For any box whose BIOS changes APIC IDs, like + * clustered APIC systems, we must use hard_smp_processor_id. + * + * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
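To make the clustered-ID arithmetic concrete: in the logical IDs built by init_apic_ldr() above, the high nibble names the cluster and the low nibble is a one-hot slot within it, which is why at most four CPUs fit per cluster. A small standalone illustration reusing the masks from this header:

#include <stdio.h>

#define XAPIC_DEST_CPUS_SHIFT   4
#define XAPIC_DEST_CPUS_MASK    ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)

int main(void)
{
    unsigned int lid = 0x32;   /* a hypothetical logical APIC ID */

    unsigned int cluster = (lid & XAPIC_DEST_CLUSTER_MASK)
                            >> XAPIC_DEST_CPUS_SHIFT;
    unsigned int bitmap  =  lid & XAPIC_DEST_CPUS_MASK;

    /* 0x32 decodes to cluster 3, bitmap 0010b: the second slot. */
    printf("cluster %u, bitmap 0x%x\n", cluster, bitmap);
    return 0;
}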
+ */ +static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +#endif /* __ASM_SUMMIT_APIC_H */ diff --git a/arch/x86/include/asm/summit/apicdef.h b/arch/x86/include/asm/summit/apicdef.h new file mode 100644 index 000000000000..f3fbca1f61c1 --- /dev/null +++ b/arch/x86/include/asm/summit/apicdef.h @@ -0,0 +1,13 @@ +#ifndef __ASM_SUMMIT_APICDEF_H +#define __ASM_SUMMIT_APICDEF_H + +#define APIC_ID_MASK (0xFF<<24) + +static inline unsigned get_apic_id(unsigned long x) +{ + return (x>>24)&0xFF; +} + +#define GET_APIC_ID(x) get_apic_id(x) + +#endif diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h new file mode 100644 index 000000000000..53bd1e7bd7b4 --- /dev/null +++ b/arch/x86/include/asm/summit/ipi.h @@ -0,0 +1,25 @@ +#ifndef __ASM_SUMMIT_IPI_H +#define __ASM_SUMMIT_IPI_H + +void send_IPI_mask_sequence(cpumask_t mask, int vector); + +static inline void send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static inline void send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + send_IPI_mask(mask, vector); +} + +static inline void send_IPI_all(int vector) +{ + send_IPI_mask(cpu_online_map, vector); +} + +#endif /* __ASM_SUMMIT_IPI_H */ diff --git a/arch/x86/include/asm/summit/mpparse.h b/arch/x86/include/asm/summit/mpparse.h new file mode 100644 index 000000000000..013ce6fab2d5 --- /dev/null +++ b/arch/x86/include/asm/summit/mpparse.h @@ -0,0 +1,109 @@ +#ifndef __ASM_SUMMIT_MPPARSE_H +#define __ASM_SUMMIT_MPPARSE_H + +#include <asm/tsc.h> + +extern int use_cyclone; + +#ifdef CONFIG_X86_SUMMIT_NUMA +extern void setup_summit(void); +#else +#define setup_summit() {} +#endif + +static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (!strncmp(oem, "IBM ENSW", 8) && + (!strncmp(productid, "VIGIL SMP", 9) + || !strncmp(productid, "EXA", 3) + || !strncmp(productid, "RUTHLESS SMP", 12))){ + mark_tsc_unstable("Summit based system"); + use_cyclone = 1; /*enable cyclone-timer*/ + setup_summit(); + return 1; + } + return 0; +} + +/* Hook from generic ACPI tables.c */ +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strncmp(oem_id, "IBM", 3) && + (!strncmp(oem_table_id, "SERVIGIL", 8) + || !strncmp(oem_table_id, "EXA", 3))){ + mark_tsc_unstable("Summit based system"); + use_cyclone = 1; /*enable cyclone-timer*/ + setup_summit(); + return 1; + } + return 0; +} + +struct rio_table_hdr { + unsigned char version; /* Version number of this data structure */ + /* Version 3 adds chassis_num & WP_index */ + unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ + unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ +} __attribute__((packed)); + +struct scal_detail { + unsigned char node_id; /* Scalability Node ID */ + unsigned long CBAR; /* Address of 1MB register space */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF = None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port2node; /* Node ID port connected to: 0xFF = None */ + unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char chassis_num; /* 1 based Chassis number (1 
= boot node) */ +} __attribute__((packed)); + +struct rio_detail { + unsigned char node_id; /* RIO Node ID */ + unsigned long BBAR; /* Address of 1MB register space */ + unsigned char type; /* Type of device */ + unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ + /* For CYC: Node ID of Twister that owns this CYC */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF=None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ + /* For CYC: 0 */ + unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ + /* = 0 : the XAPIC is not used, ie:*/ + /* ints fwded to another XAPIC */ + /* Bits1:7 Reserved */ + /* For CYC: Bits0:7 Reserved */ + unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ + /* lower slot numbers/PCI bus numbers */ + /* For CYC: No meaning */ + unsigned char chassis_num; /* 1 based Chassis number */ + /* For LookOut WPEGs this field indicates the */ + /* Expansion Chassis #, enumerated from Boot */ + /* Node WPEG external port, then Boot Node CYC */ + /* external port, then Next Vigil chassis WPEG */ + /* external port, etc. */ + /* Shared Lookouts have only 1 chassis number (the */ + /* first one assigned) */ +} __attribute__((packed)); + + +typedef enum { + CompatTwister = 0, /* Compatibility Twister */ + AltTwister = 1, /* Alternate Twister of internal 8-way */ + CompatCyclone = 2, /* Compatibility Cyclone */ + AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ + CompatWPEG = 4, /* Compatibility WPEG */ + AltWPEG = 5, /* Second Planar WPEG */ + LookOutAWPEG = 6, /* LookOut WPEG */ + LookOutBWPEG = 7, /* LookOut WPEG */ +} node_type; + +static inline int is_WPEG(struct rio_detail *rio){ + return (rio->type == CompatWPEG || rio->type == AltWPEG || + rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); +} + +#endif /* __ASM_SUMMIT_MPPARSE_H */ diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h new file mode 100644 index 000000000000..9bd521fe4570 --- /dev/null +++ b/arch/x86/include/asm/suspend.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "suspend_32.h" +#else +# include "suspend_64.h" +#endif diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h new file mode 100644 index 000000000000..a5074bd0f8be --- /dev/null +++ b/arch/x86/include/asm/suspend_32.h @@ -0,0 +1,51 @@ +/* + * Copyright 2001-2002 Pavel Machek <pavel@suse.cz> + * Based on code + * Copyright 2001 Patrick Mochel <mochel@osdl.org> + */ +#ifndef _ASM_X86_SUSPEND_32_H +#define _ASM_X86_SUSPEND_32_H + +#include <asm/desc.h> +#include <asm/i387.h> + +static inline int arch_prepare_suspend(void) { return 0; } + +/* image of the saved processor state */ +struct saved_context { + u16 es, fs, gs, ss; + unsigned long cr0, cr2, cr3, cr4; + struct desc_ptr gdt; + struct desc_ptr idt; + u16 ldt; + u16 tss; + unsigned long tr; + unsigned long safety; + unsigned long return_address; +} __attribute__((packed)); + +#ifdef CONFIG_ACPI +extern unsigned long saved_eip; +extern unsigned long saved_esp; +extern unsigned long saved_ebp; +extern unsigned long saved_ebx; +extern unsigned long saved_esi; +extern unsigned long saved_edi; + +static inline void acpi_save_register_state(unsigned long return_point) +{ + saved_eip = 
return_point; + asm volatile("movl %%esp,%0" : "=m" (saved_esp)); + asm volatile("movl %%ebp,%0" : "=m" (saved_ebp)); + asm volatile("movl %%ebx,%0" : "=m" (saved_ebx)); + asm volatile("movl %%edi,%0" : "=m" (saved_edi)); + asm volatile("movl %%esi,%0" : "=m" (saved_esi)); +} + +#define acpi_restore_register_state() do {} while (0) + +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +#endif + +#endif /* _ASM_X86_SUSPEND_32_H */ diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h new file mode 100644 index 000000000000..06284f42b759 --- /dev/null +++ b/arch/x86/include/asm/suspend_64.h @@ -0,0 +1,52 @@ +/* + * Copyright 2001-2003 Pavel Machek <pavel@suse.cz> + * Based on code + * Copyright 2001 Patrick Mochel <mochel@osdl.org> + */ +#ifndef _ASM_X86_SUSPEND_64_H +#define _ASM_X86_SUSPEND_64_H + +#include <asm/desc.h> +#include <asm/i387.h> + +static inline int arch_prepare_suspend(void) +{ + return 0; +} + +/* + * Image of the saved processor state, used by the low level ACPI suspend to + * RAM code and by the low level hibernation code. + * + * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that + * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c, + * still work as required. + */ +struct saved_context { + struct pt_regs regs; + u16 ds, es, fs, gs, ss; + unsigned long gs_base, gs_kernel_base, fs_base; + unsigned long cr0, cr2, cr3, cr4, cr8; + unsigned long efer; + u16 gdt_pad; + u16 gdt_limit; + unsigned long gdt_base; + u16 idt_pad; + u16 idt_limit; + unsigned long idt_base; + u16 ldt; + u16 tss; + unsigned long tr; + unsigned long safety; + unsigned long return_address; +} __attribute__((packed)); + +#define loaddebug(thread,register) \ + set_debugreg((thread)->debugreg##register, register) + +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern char core_restore_code; +extern char restore_registers; + +#endif /* _ASM_X86_SUSPEND_64_H */ diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h new file mode 100644 index 000000000000..51fb2c76ad74 --- /dev/null +++ b/arch/x86/include/asm/swiotlb.h @@ -0,0 +1,58 @@ +#ifndef _ASM_X86_SWIOTLB_H +#define _ASM_X86_SWIOTLB_H + +#include <asm/dma-mapping.h> + +/* SWIOTLB interface */ + +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, + size_t size, int dir); +extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags); +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir); +extern void swiotlb_sync_single_for_cpu(struct device *hwdev, + dma_addr_t dev_addr, + size_t size, int dir); +extern void swiotlb_sync_single_for_device(struct device *hwdev, + dma_addr_t dev_addr, + size_t size, int dir); +extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev, + dma_addr_t dev_addr, + unsigned long offset, + size_t size, int dir); +extern void swiotlb_sync_single_range_for_device(struct device *hwdev, + dma_addr_t dev_addr, + unsigned long offset, + size_t size, int dir); +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev, + struct scatterlist *sg, int nelems, + int dir); +extern void swiotlb_sync_sg_for_device(struct device *hwdev, + struct scatterlist *sg, int nelems, + int dir); +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, int direction); +extern void swiotlb_unmap_sg(struct device *hwdev, struct 
scatterlist *sg, + int nents, int direction); +extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); +extern void swiotlb_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle); +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); +extern void swiotlb_init(void); + +extern int swiotlb_force; + +#ifdef CONFIG_SWIOTLB +extern int swiotlb; +extern void pci_swiotlb_init(void); +#else +#define swiotlb 0 +static inline void pci_swiotlb_init(void) +{ +} +#endif + +static inline void dma_mark_clean(void *addr, size_t size) {} + +#endif /* _ASM_X86_SWIOTLB_H */ diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h new file mode 100644 index 000000000000..9d09b4073b60 --- /dev/null +++ b/arch/x86/include/asm/sync_bitops.h @@ -0,0 +1,130 @@ +#ifndef _ASM_X86_SYNC_BITOPS_H +#define _ASM_X86_SYNC_BITOPS_H + +/* + * Copyright 1992, Linus Torvalds. + */ + +/* + * These have to be done with inline assembly: that way the bit-setting + * is guaranteed to be atomic. All bit operations return 0 if the bit + * was cleared before the operation and != 0 if it was not. + * + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). + */ + +#define ADDR (*(volatile long *)addr) + +/** + * sync_set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void sync_set_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("lock; btsl %1,%0" + : "+m" (ADDR) + : "Ir" (nr) + : "memory"); +} + +/** + * sync_clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * sync_clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void sync_clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("lock; btrl %1,%0" + : "+m" (ADDR) + : "Ir" (nr) + : "memory"); +} + +/** + * sync_change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * sync_change_bit() is atomic and may not be reordered. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void sync_change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("lock; btcl %1,%0" + : "+m" (ADDR) + : "Ir" (nr) + : "memory"); +} + +/** + * sync_test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) + : "Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. 
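The lock-prefixed bts/btr/btc instructions used above atomically update one bit and return its previous value through the carry flag. A C11 model of the same contract for sync_test_and_set_bit (illustrative only; the demo_* names are invented):

#include <stdatomic.h>
#include <stdio.h>

#define LONG_BITS (8 * sizeof(unsigned long))

/* Same contract as sync_test_and_set_bit(): set the bit, report
 * whether it was already set. atomic_fetch_or stands in for the
 * lock-prefixed btsl; testing the old value replaces the sbbl trick. */
static int demo_test_and_set_bit(unsigned int nr, _Atomic unsigned long *addr)
{
    unsigned long mask = 1UL << (nr % LONG_BITS);
    unsigned long old = atomic_fetch_or(&addr[nr / LONG_BITS], mask);

    return (old & mask) != 0;
}

int main(void)
{
    _Atomic unsigned long map[2] = { 0, 0 };

    printf("%d\n", demo_test_and_set_bit(33, map));  /* 0: was clear */
    printf("%d\n", demo_test_and_set_bit(33, map));  /* 1: already set */
    return 0;
}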
+ */ +static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) + : "Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) + : "Ir" (nr) : "memory"); + return oldbit; +} + +#define sync_test_bit(nr, addr) test_bit(nr, addr) + +#undef ADDR + +#endif /* _ASM_X86_SYNC_BITOPS_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h new file mode 100644 index 000000000000..d82f39bb7905 --- /dev/null +++ b/arch/x86/include/asm/syscall.h @@ -0,0 +1,213 @@ +/* + * Access to user system call parameters and results + * + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * See asm-generic/syscall.h for descriptions of what we must do here. + */ + +#ifndef _ASM_X86_SYSCALL_H +#define _ASM_X86_SYSCALL_H + +#include <linux/sched.h> +#include <linux/err.h> + +static inline long syscall_get_nr(struct task_struct *task, + struct pt_regs *regs) +{ + /* + * We always sign-extend a -1 value being set here, + * so this is always either -1L or a syscall number. + */ + return regs->orig_ax; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->ax = regs->orig_ax; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + unsigned long error = regs->ax; +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. + */ + if (task_thread_info(task)->status & TS_COMPAT) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long) (int) error; +#endif + return IS_ERR_VALUE(error) ? 
error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ax; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->ax = (long) error ?: val; +} + +#ifdef CONFIG_X86_32 + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(args, ®s->bx + i, n * sizeof(args[0])); +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(®s->bx + i, args, n * sizeof(args[0])); +} + +#else /* CONFIG_X86_64 */ + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task_thread_info(task)->status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; + *args++ = regs->bx; + case 1: + if (!n--) break; + *args++ = regs->cx; + case 2: + if (!n--) break; + *args++ = regs->dx; + case 3: + if (!n--) break; + *args++ = regs->si; + case 4: + if (!n--) break; + *args++ = regs->di; + case 5: + if (!n--) break; + *args++ = regs->bp; + case 6: + if (!n--) break; + default: + BUG(); + break; + } + else +# endif + switch (i) { + case 0: + if (!n--) break; + *args++ = regs->di; + case 1: + if (!n--) break; + *args++ = regs->si; + case 2: + if (!n--) break; + *args++ = regs->dx; + case 3: + if (!n--) break; + *args++ = regs->r10; + case 4: + if (!n--) break; + *args++ = regs->r8; + case 5: + if (!n--) break; + *args++ = regs->r9; + case 6: + if (!n--) break; + default: + BUG(); + break; + } +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task_thread_info(task)->status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; + regs->bx = *args++; + case 1: + if (!n--) break; + regs->cx = *args++; + case 2: + if (!n--) break; + regs->dx = *args++; + case 3: + if (!n--) break; + regs->si = *args++; + case 4: + if (!n--) break; + regs->di = *args++; + case 5: + if (!n--) break; + regs->bp = *args++; + case 6: + if (!n--) break; + default: + BUG(); + break; + } + else +# endif + switch (i) { + case 0: + if (!n--) break; + regs->di = *args++; + case 1: + if (!n--) break; + regs->si = *args++; + case 2: + if (!n--) break; + regs->dx = *args++; + case 3: + if (!n--) break; + regs->r10 = *args++; + case 4: + if (!n--) break; + regs->r8 = *args++; + case 5: + if (!n--) break; + regs->r9 = *args++; + case 6: + if (!n--) break; + default: + BUG(); + break; + } +} + +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h new file mode 100644 index 000000000000..87803da44010 --- /dev/null +++ b/arch/x86/include/asm/syscalls.h @@ -0,0 +1,93 @@ +/* + * syscalls.h - Linux syscall interfaces (arch-specific) + * + * Copyright (c) 2008 Jaswinder Singh + * + * This file is released under the GPLv2. + * See the file COPYING for more details. 
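For reference, the register order those fall-through switches in asm/syscall.h walk: 64-bit syscall arguments live in di, si, dx, r10, r8 and r9, while the compat path uses the 32-bit order bx, cx, dx, si, di, bp. The same gather can be sketched without the fall-through, using an invented cut-down pt_regs; treat this as illustration, not a drop-in:

#include <stddef.h>
#include <string.h>

/* Invented stand-in for pt_regs with just the argument registers. */
struct demo_regs {
    unsigned long di, si, dx, r10, r8, r9;
};

static void demo_get_arguments(const struct demo_regs *regs,
                               unsigned int i, unsigned int n,
                               unsigned long *args)
{
    /* Same order the case fall-through above produces. */
    const unsigned long all[6] = {
        regs->di, regs->si, regs->dx, regs->r10, regs->r8, regs->r9,
    };

    if (i + n > 6)
        return;    /* the kernel BUG()s here instead */
    memcpy(args, &all[i], n * sizeof(args[0]));
}

int main(void)
{
    struct demo_regs regs = { .di = 1, .si = 2, .dx = 3 };
    unsigned long args[3];

    demo_get_arguments(&regs, 0, 3, args);
    return (int)args[2] - 3;    /* 0 on success */
}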
+ */ + +#ifndef _ASM_X86_SYSCALLS_H +#define _ASM_X86_SYSCALLS_H + +#include <linux/compiler.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/signal.h> + +/* Common in X86_32 and X86_64 */ +/* kernel/ioport.c */ +asmlinkage long sys_ioperm(unsigned long, unsigned long, int); + +/* X86_32 only */ +#ifdef CONFIG_X86_32 +/* kernel/process_32.c */ +asmlinkage int sys_fork(struct pt_regs); +asmlinkage int sys_clone(struct pt_regs); +asmlinkage int sys_vfork(struct pt_regs); +asmlinkage int sys_execve(struct pt_regs); + +/* kernel/signal_32.c */ +asmlinkage int sys_sigsuspend(int, int, old_sigset_t); +asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, + struct old_sigaction __user *); +asmlinkage int sys_sigaltstack(unsigned long); +asmlinkage unsigned long sys_sigreturn(unsigned long); +asmlinkage int sys_rt_sigreturn(unsigned long); + +/* kernel/ioport.c */ +asmlinkage long sys_iopl(unsigned long); + +/* kernel/ldt.c */ +asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); + +/* kernel/sys_i386_32.c */ +asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, + unsigned long, unsigned long, unsigned long); +struct mmap_arg_struct; +asmlinkage int old_mmap(struct mmap_arg_struct __user *); +struct sel_arg_struct; +asmlinkage int old_select(struct sel_arg_struct __user *); +asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); +struct old_utsname; +asmlinkage int sys_uname(struct old_utsname __user *); +struct oldold_utsname; +asmlinkage int sys_olduname(struct oldold_utsname __user *); + +/* kernel/tls.c */ +asmlinkage int sys_set_thread_area(struct user_desc __user *); +asmlinkage int sys_get_thread_area(struct user_desc __user *); + +/* kernel/vm86_32.c */ +asmlinkage int sys_vm86old(struct pt_regs); +asmlinkage int sys_vm86(struct pt_regs); + +#else /* CONFIG_X86_32 */ + +/* X86_64 only */ +/* kernel/process_64.c */ +asmlinkage long sys_fork(struct pt_regs *); +asmlinkage long sys_clone(unsigned long, unsigned long, + void __user *, void __user *, + struct pt_regs *); +asmlinkage long sys_vfork(struct pt_regs *); +asmlinkage long sys_execve(char __user *, char __user * __user *, + char __user * __user *, + struct pt_regs *); + +/* kernel/ioport.c */ +asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + +/* kernel/signal_64.c */ +asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, + struct pt_regs *); +asmlinkage long sys_rt_sigreturn(struct pt_regs *); + +/* kernel/sys_x86_64.c */ +asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, + unsigned long, unsigned long, unsigned long); +struct new_utsname; +asmlinkage long sys_uname(struct new_utsname __user *); + +#endif /* CONFIG_X86_32 */ +#endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h new file mode 100644 index 000000000000..2ed3f0f44ff7 --- /dev/null +++ b/arch/x86/include/asm/system.h @@ -0,0 +1,425 @@ +#ifndef _ASM_X86_SYSTEM_H +#define _ASM_X86_SYSTEM_H + +#include <asm/asm.h> +#include <asm/segment.h> +#include <asm/cpufeature.h> +#include <asm/cmpxchg.h> +#include <asm/nops.h> + +#include <linux/kernel.h> +#include <linux/irqflags.h> + +/* entries in ARCH_DLINFO: */ +#ifdef CONFIG_IA32_EMULATION +# define AT_VECTOR_SIZE_ARCH 2 +#else +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +#ifdef CONFIG_X86_32 + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct 
task_struct *next); + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. \ + * (EAX and EBP are not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#else +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +/* Save and restore flags to clear and handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + ".globl thread_return\n" \ + "thread_return:\n\t" \ + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "jc ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [tif_fork] "i" (TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ + : "memory", "cc" __EXTRA_CLOBBER) +#endif + +#ifdef __KERNEL__ +#define _set_base(addr, base) do { unsigned long __pr; \ +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ + "rorl $16,%%edx\n\t" \ + "movb %%dl,%2\n\t" \ + "movb %%dh,%3" \ + :"=&d" (__pr) \ + :"m" (*((addr)+2)), \ + "m" (*((addr)+4)), \ + "m" (*((addr)+7)), \ + "0" (base) \ + ); } while (0) + +#define _set_limit(addr, limit) do { unsigned long __lr; \ +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ + "rorl $16,%%edx\n\t" \ + "movb %2,%%dh\n\t" \ + "andb $0xf0,%%dh\n\t" \ + "orb %%dh,%%dl\n\t" \ + "movb %%dl,%2" \ + :"=&d" (__lr) \ + :"m" (*(addr)), \ + "m" (*((addr)+6)), \ + "0" (limit) \ + ); } while
(0) + +#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) +#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) + +extern void native_load_gs_index(unsigned); + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong. + */ +#define loadsegment(seg, value) \ + asm volatile("\n" \ + "1:\t" \ + "movl %k0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ + "movl %k1, %%" #seg "\n\t" \ + "jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b,3b) \ + : :"r" (value), "r" (0) : "memory") + + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +static inline void native_clts(void) +{ + asm volatile("clts"); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads and stores around it, which can hurt performance. The solution + * is to use a variable and mimic reads and writes to it to enforce + * serialization. + */ +static unsigned long __force_order; + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr2(unsigned long val) +{ + asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr3(unsigned long val) +{ + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist. On x86_64, %cr4 always + * exists, so it will never fail.
*/ +#ifdef CONFIG_X86_32 + asm volatile("1: mov %%cr4, %0\n" + "2:\n" + _ASM_EXTABLE(1b, 2b) + : "=r" (val), "=m" (__force_order) : "0" (0)); +#else + val = native_read_cr4(); +#endif + return val; +} + +static inline void native_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long native_read_cr8(void) +{ + unsigned long cr8; + asm volatile("movq %%cr8,%0" : "=r" (cr8)); + return cr8; +} + +static inline void native_write_cr8(unsigned long val) +{ + asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); +} +#endif + +static inline void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define read_cr0() (native_read_cr0()) +#define write_cr0(x) (native_write_cr0(x)) +#define read_cr2() (native_read_cr2()) +#define write_cr2(x) (native_write_cr2(x)) +#define read_cr3() (native_read_cr3()) +#define write_cr3(x) (native_write_cr3(x)) +#define read_cr4() (native_read_cr4()) +#define read_cr4_safe() (native_read_cr4_safe()) +#define write_cr4(x) (native_write_cr4(x)) +#define wbinvd() (native_wbinvd()) +#ifdef CONFIG_X86_64 +#define read_cr8() (native_read_cr8()) +#define write_cr8(x) (native_write_cr8(x)) +#define load_gs_index native_load_gs_index +#endif + +/* Clear the 'TS' bit */ +#define clts() (native_clts()) + +#endif /* CONFIG_PARAVIRT */ + +#define stts() write_cr0(read_cr0() | X86_CR0_TS) + +#endif /* __KERNEL__ */ + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void default_idle(void); + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequent reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data returned by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information.
+ * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * <programlisting> + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * </programlisting> + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * <programlisting> + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * </programlisting> + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif /* _ASM_X86_SYSTEM_H */ diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h new file mode 100644 index 000000000000..1159e091ad09 --- /dev/null +++ b/arch/x86/include/asm/system_64.h @@ -0,0 +1,22 @@ +#ifndef _ASM_X86_SYSTEM_64_H +#define _ASM_X86_SYSTEM_64_H + +#include <asm/segment.h> +#include <asm/cmpxchg.h> + + +static inline unsigned long read_cr8(void) +{ + unsigned long cr8; + asm volatile("movq %%cr8,%0" : "=r" (cr8)); + return cr8; +} + +static inline void write_cr8(unsigned long val) +{ + asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); +} + +#include <linux/irqflags.h> + +#endif /* _ASM_X86_SYSTEM_64_H */ diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h new file mode 100644 index 000000000000..7a6677c1a715 --- /dev/null +++ b/arch/x86/include/asm/tce.h @@ -0,0 +1,48 @@ +/* + * This file is derived from asm-powerpc/tce.h. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Muli Ben-Yehuda <muli@il.ibm.com> + * Author: Jon Mason <jdmason@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_TCE_H +#define _ASM_X86_TCE_H + +extern unsigned int specified_table_size; +struct iommu_table; + +#define TCE_ENTRY_SIZE 8 /* in bytes */ + +#define TCE_READ_SHIFT 0 +#define TCE_WRITE_SHIFT 1 +#define TCE_HUBID_SHIFT 2 /* unused */ +#define TCE_RSVD_SHIFT 8 /* unused */ +#define TCE_RPN_SHIFT 12 +#define TCE_UNUSED_SHIFT 48 /* unused */ + +#define TCE_RPN_MASK 0x0000fffffffff000ULL + +extern void tce_build(struct iommu_table *tbl, unsigned long index, + unsigned int npages, unsigned long uaddr, int direction); +extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages); +extern void * __init alloc_tce_table(void); +extern void __init free_tce_table(void *tbl); +extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar); + +#endif /* _ASM_X86_TCE_H */ diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h new file mode 100644 index 000000000000..af1b70ea440f --- /dev/null +++ b/arch/x86/include/asm/termbits.h @@ -0,0 +1,198 @@ +#ifndef _ASM_X86_TERMBITS_H +#define _ASM_X86_TERMBITS_H + +#include <linux/posix_types.h> + +typedef unsigned char cc_t; +typedef unsigned int speed_t; +typedef unsigned int tcflag_t; + +#define NCCS 19 +struct termios { + tcflag_t c_iflag; /* input mode flags */ + tcflag_t c_oflag; /* output mode flags */ + tcflag_t c_cflag; /* control mode flags */ + tcflag_t c_lflag; /* local mode flags */ + cc_t c_line; /* line discipline */ + cc_t c_cc[NCCS]; /* control characters */ +}; + +struct termios2 { + tcflag_t c_iflag; /* input mode flags */ + tcflag_t c_oflag; /* output mode flags */ + tcflag_t c_cflag; /* control mode flags */ + tcflag_t c_lflag; /* local mode flags */ + cc_t c_line; /* line discipline */ + cc_t c_cc[NCCS]; /* control characters */ + speed_t c_ispeed; /* input speed */ + speed_t c_ospeed; /* output speed */ +}; + +struct ktermios { + tcflag_t c_iflag; /* input mode flags */ + tcflag_t c_oflag; /* output mode flags */ + tcflag_t c_cflag; /* control mode flags */ + tcflag_t c_lflag; /* local mode flags */ + cc_t c_line; /* line discipline */ + cc_t c_cc[NCCS]; /* control characters */ + speed_t c_ispeed; /* input speed */ + speed_t c_ospeed; /* output speed */ +}; + +/* c_cc characters */ +#define VINTR 0 +#define VQUIT 1 +#define VERASE 2 +#define VKILL 3 +#define VEOF 4 +#define VTIME 5 +#define VMIN 6 +#define VSWTC 7 +#define VSTART 8 +#define VSTOP 9 +#define VSUSP 10 +#define VEOL 11 +#define VREPRINT 12 +#define VDISCARD 13 +#define VWERASE 14 +#define VLNEXT 15 +#define VEOL2 16 + +/* c_iflag bits */ +#define IGNBRK 0000001 +#define BRKINT 0000002 +#define IGNPAR 0000004 +#define PARMRK 0000010 +#define INPCK 0000020 +#define ISTRIP 0000040 +#define INLCR 0000100 +#define IGNCR 0000200 +#define ICRNL 0000400 +#define IUCLC 0001000 +#define IXON 0002000 +#define IXANY 0004000 +#define IXOFF 0010000 +#define IMAXBEL 0020000 +#define IUTF8 0040000 + +/* c_oflag bits */ +#define OPOST 0000001 +#define OLCUC 0000002 +#define ONLCR 0000004 +#define OCRNL 0000010 +#define ONOCR 0000020 +#define ONLRET 0000040 +#define OFILL 0000100 +#define OFDEL 0000200 +#define NLDLY 0000400 +#define NL0 0000000 +#define NL1 0000400 +#define CRDLY 0003000 +#define CR0 0000000 +#define CR1 0001000 +#define CR2 0002000 +#define CR3 0003000 +#define TABDLY 0014000 +#define 
TAB0 0000000 +#define TAB1 0004000 +#define TAB2 0010000 +#define TAB3 0014000 +#define XTABS 0014000 +#define BSDLY 0020000 +#define BS0 0000000 +#define BS1 0020000 +#define VTDLY 0040000 +#define VT0 0000000 +#define VT1 0040000 +#define FFDLY 0100000 +#define FF0 0000000 +#define FF1 0100000 + +/* c_cflag bit meaning */ +#define CBAUD 0010017 +#define B0 0000000 /* hang up */ +#define B50 0000001 +#define B75 0000002 +#define B110 0000003 +#define B134 0000004 +#define B150 0000005 +#define B200 0000006 +#define B300 0000007 +#define B600 0000010 +#define B1200 0000011 +#define B1800 0000012 +#define B2400 0000013 +#define B4800 0000014 +#define B9600 0000015 +#define B19200 0000016 +#define B38400 0000017 +#define EXTA B19200 +#define EXTB B38400 +#define CSIZE 0000060 +#define CS5 0000000 +#define CS6 0000020 +#define CS7 0000040 +#define CS8 0000060 +#define CSTOPB 0000100 +#define CREAD 0000200 +#define PARENB 0000400 +#define PARODD 0001000 +#define HUPCL 0002000 +#define CLOCAL 0004000 +#define CBAUDEX 0010000 +#define BOTHER 0010000 /* non standard rate */ +#define B57600 0010001 +#define B115200 0010002 +#define B230400 0010003 +#define B460800 0010004 +#define B500000 0010005 +#define B576000 0010006 +#define B921600 0010007 +#define B1000000 0010010 +#define B1152000 0010011 +#define B1500000 0010012 +#define B2000000 0010013 +#define B2500000 0010014 +#define B3000000 0010015 +#define B3500000 0010016 +#define B4000000 0010017 +#define CIBAUD 002003600000 /* input baud rate */ +#define CMSPAR 010000000000 /* mark or space (stick) parity */ +#define CRTSCTS 020000000000 /* flow control */ + +#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ + +/* c_lflag bits */ +#define ISIG 0000001 +#define ICANON 0000002 +#define XCASE 0000004 +#define ECHO 0000010 +#define ECHOE 0000020 +#define ECHOK 0000040 +#define ECHONL 0000100 +#define NOFLSH 0000200 +#define TOSTOP 0000400 +#define ECHOCTL 0001000 +#define ECHOPRT 0002000 +#define ECHOKE 0004000 +#define FLUSHO 0010000 +#define PENDIN 0040000 +#define IEXTEN 0100000 + +/* tcflow() and TCXONC use these */ +#define TCOOFF 0 +#define TCOON 1 +#define TCIOFF 2 +#define TCION 3 + +/* tcflush() and TCFLSH use these */ +#define TCIFLUSH 0 +#define TCOFLUSH 1 +#define TCIOFLUSH 2 + +/* tcsetattr uses these */ +#define TCSANOW 0 +#define TCSADRAIN 1 +#define TCSAFLUSH 2 + +#endif /* _ASM_X86_TERMBITS_H */ diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h new file mode 100644 index 000000000000..f72956331c49 --- /dev/null +++ b/arch/x86/include/asm/termios.h @@ -0,0 +1,113 @@ +#ifndef _ASM_X86_TERMIOS_H +#define _ASM_X86_TERMIOS_H + +#include <asm/termbits.h> +#include <asm/ioctls.h> + +struct winsize { + unsigned short ws_row; + unsigned short ws_col; + unsigned short ws_xpixel; + unsigned short ws_ypixel; +}; + +#define NCC 8 +struct termio { + unsigned short c_iflag; /* input mode flags */ + unsigned short c_oflag; /* output mode flags */ + unsigned short c_cflag; /* control mode flags */ + unsigned short c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[NCC]; /* control characters */ +}; + +/* modem lines */ +#define TIOCM_LE 0x001 +#define TIOCM_DTR 0x002 +#define TIOCM_RTS 0x004 +#define TIOCM_ST 0x008 +#define TIOCM_SR 0x010 +#define TIOCM_CTS 0x020 +#define TIOCM_CAR 0x040 +#define TIOCM_RNG 0x080 +#define TIOCM_DSR 0x100 +#define TIOCM_CD TIOCM_CAR +#define TIOCM_RI TIOCM_RNG +#define TIOCM_OUT1 0x2000 +#define TIOCM_OUT2 0x4000 +#define TIOCM_LOOP 0x8000 
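As an aside, the BOTHER/CBAUDEX encoding defined in termbits.h above is what lets struct termios2 carry an arbitrary line rate. A minimal userspace sketch (illustrative only; fd is assumed to be an open tty, and the TCGETS2/TCSETS2 ioctls come from <asm/ioctls.h>):

	#include <asm/termbits.h>
	#include <sys/ioctl.h>

	struct termios2 tio;

	if (ioctl(fd, TCGETS2, &tio) < 0)
		return -1;
	tio.c_cflag &= ~CBAUD;
	tio.c_cflag |= BOTHER;			/* "non standard rate" */
	tio.c_ispeed = 31250;			/* e.g. MIDI */
	tio.c_ospeed = 31250;
	if (ioctl(fd, TCSETS2, &tio) < 0)
		return -1;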
+ +/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ + +#ifdef __KERNEL__ + +#include <asm/uaccess.h> + +/* intr=^C quit=^\ erase=del kill=^U + eof=^D vtime=\0 vmin=\1 sxtc=\0 + start=^Q stop=^S susp=^Z eol=\0 + reprint=^R discard=^U werase=^W lnext=^V + eol2=\0 +*/ +#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" + +/* + * Translate a "termio" structure into a "termios". Ugh. + */ +#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \ + unsigned short __tmp; \ + get_user(__tmp,&(termio)->x); \ + *(unsigned short *) &(termios)->x = __tmp; \ +} + +static inline int user_termio_to_kernel_termios(struct ktermios *termios, + struct termio __user *termio) +{ + SET_LOW_TERMIOS_BITS(termios, termio, c_iflag); + SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); + SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); + SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); + return copy_from_user(termios->c_cc, termio->c_cc, NCC); +} + +/* + * Translate a "termios" structure into a "termio". Ugh. + */ +static inline int kernel_termios_to_user_termio(struct termio __user *termio, + struct ktermios *termios) +{ + put_user((termios)->c_iflag, &(termio)->c_iflag); + put_user((termios)->c_oflag, &(termio)->c_oflag); + put_user((termios)->c_cflag, &(termio)->c_cflag); + put_user((termios)->c_lflag, &(termio)->c_lflag); + put_user((termios)->c_line, &(termio)->c_line); + return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC); +} + +static inline int user_termios_to_kernel_termios(struct ktermios *k, + struct termios2 __user *u) +{ + return copy_from_user(k, u, sizeof(struct termios2)); +} + +static inline int kernel_termios_to_user_termios(struct termios2 __user *u, + struct ktermios *k) +{ + return copy_to_user(u, k, sizeof(struct termios2)); +} + +static inline int user_termios_to_kernel_termios_1(struct ktermios *k, + struct termios __user *u) +{ + return copy_from_user(k, u, sizeof(struct termios)); +} + +static inline int kernel_termios_to_user_termios_1(struct termios __user *u, + struct ktermios *k) +{ + return copy_to_user(u, k, sizeof(struct termios)); +} + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_TERMIOS_H */ diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h new file mode 100644 index 000000000000..c62349ee7860 --- /dev/null +++ b/arch/x86/include/asm/therm_throt.h @@ -0,0 +1,9 @@ +#ifndef _ASM_X86_THERM_THROT_H +#define _ASM_X86_THERM_THROT_H + +#include <asm/atomic.h> + +extern atomic_t therm_throt_en; +int therm_throt_process(int curr); + +#endif /* _ASM_X86_THERM_THROT_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h new file mode 100644 index 000000000000..e44d379faad2 --- /dev/null +++ b/arch/x86/include/asm/thread_info.h @@ -0,0 +1,264 @@ +/* thread_info.h: low-level thread information + * + * Copyright (C) 2002 David Howells (dhowells@redhat.com) + * - Incorporating suggestions made by Linus Torvalds and Dave Miller + */ + +#ifndef _ASM_X86_THREAD_INFO_H +#define _ASM_X86_THREAD_INFO_H + +#include <linux/compiler.h> +#include <asm/page.h> +#include <asm/types.h> + +/* + * low level task data that entry.S needs immediate access to + * - this struct should fit entirely inside of one cache line + * - this struct shares the supervisor stack pages + */ +#ifndef __ASSEMBLY__ +struct task_struct; +struct exec_domain; +#include <asm/processor.h> + +struct thread_info { + struct task_struct *task; /* main task structure */ + struct exec_domain *exec_domain; /* execution domain 
*/ + unsigned long flags; /* low level flags */ + __u32 status; /* thread synchronous flags */ + __u32 cpu; /* current CPU */ + int preempt_count; /* 0 => preemptable, + <0 => BUG */ + mm_segment_t addr_limit; + struct restart_block restart_block; + void __user *sysenter_return; +#ifdef CONFIG_X86_32 + unsigned long previous_esp; /* ESP of the previous stack in + case of nested (IRQ) stacks + */ + __u8 supervisor_stack[0]; +#endif +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ +} + +#define init_thread_info (init_thread_union.thread_info) +#define init_stack (init_thread_union.stack) + +#else /* !__ASSEMBLY__ */ + +#include <asm/asm-offsets.h> + +#endif + +/* + * thread information flags + * - these are process state flags that various assembly files + * may need to access + * - pending work-to-be-done flags are in LSW + * - other flags in MSW + * Warning: layout of LSW is hardcoded in entry.S + */ +#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ +#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ +#define TIF_SIGPENDING 2 /* signal pending */ +#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +#define TIF_IRET 5 /* force IRET */ +#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ +#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ +#define TIF_SECCOMP 8 /* secure computing */ +#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ +#define TIF_NOTSC 16 /* TSC is not accessible in userland */ +#define TIF_IA32 17 /* 32bit process */ +#define TIF_FORK 18 /* ret_from_fork */ +#define TIF_ABI_PENDING 19 +#define TIF_MEMDIE 20 +#define TIF_DEBUG 21 /* uses debug registers */ +#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +#define TIF_FREEZE 23 /* is freezing for suspend */ +#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ +#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ +#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ + +#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_IRET (1 << TIF_IRET) +#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) +#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_NOTSC (1 << TIF_NOTSC) +#define _TIF_IA32 (1 << TIF_IA32) +#define _TIF_FORK (1 << TIF_FORK) +#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) +#define _TIF_DEBUG (1 << TIF_DEBUG) +#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) +#define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) +#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) +#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) + +/* work to do in syscall_trace_enter() */ +#define _TIF_WORK_SYSCALL_ENTRY \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \ + _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) + +/* work to do in syscall_trace_leave() */ +#define _TIF_WORK_SYSCALL_EXIT \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP) + 
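These masks are consumed by the assembly entry paths; C code manipulates the individual flags through the generic helpers from <linux/thread_info.h>. A minimal sketch (illustrative only):

	/* ask for a reschedule of the current task at the next opportunity */
	set_thread_flag(TIF_NEED_RESCHED);

	/* bail out of a long-running operation if a signal is pending */
	if (test_thread_flag(TIF_SIGPENDING))
		return -ERESTARTSYS;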
+/* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK \ + (0x0000FFFF & \ + ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ + _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) + +/* work to do on any return to user space */ +#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) + +/* Only used for 64 bit */ +#define _TIF_DO_NOTIFY_MASK \ + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + +/* flags to check in __switch_to() */ +#define _TIF_WORK_CTXSW \ + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ + _TIF_NOTSC) + +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) + +#define PREEMPT_ACTIVE 0x10000000 + +/* thread information allocation */ +#ifdef CONFIG_DEBUG_STACK_USAGE +#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) +#else +#define THREAD_FLAGS GFP_KERNEL +#endif + +#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR + +#define alloc_thread_info(tsk) \ + ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) + +#ifdef CONFIG_X86_32 + +#define STACK_WARN (THREAD_SIZE/8) +/* + * macros/functions for gaining access to the thread information structure + * + * preempt_count needs to be 1 initially, until the scheduler is functional. + */ +#ifndef __ASSEMBLY__ + + +/* how to get the current stack pointer from C */ +register unsigned long current_stack_pointer asm("esp") __used; + +/* how to get the thread information struct from C */ +static inline struct thread_info *current_thread_info(void) +{ + return (struct thread_info *) + (current_stack_pointer & ~(THREAD_SIZE - 1)); +} + +#else /* !__ASSEMBLY__ */ + +/* how to get the thread information struct from ASM */ +#define GET_THREAD_INFO(reg) \ + movl $-THREAD_SIZE, reg; \ + andl %esp, reg + +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg + +#endif + +#else /* X86_32 */ + +#include <asm/pda.h> + +/* + * macros/functions for gaining access to the thread information structure + * preempt_count needs to be 1 initially, until the scheduler is functional. + */ +#ifndef __ASSEMBLY__ +static inline struct thread_info *current_thread_info(void) +{ + struct thread_info *ti; + ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE); + return ti; +} + +/* do not use in interrupt context */ +static inline struct thread_info *stack_thread_info(void) +{ + struct thread_info *ti; + asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1))); + return ti; +} + +#else /* !__ASSEMBLY__ */ + +/* how to get the thread information struct from ASM */ +#define GET_THREAD_INFO(reg) \ + movq %gs:pda_kernelstack,reg ; \ + subq $(THREAD_SIZE-PDA_STACKOFFSET),reg + +#endif + +#endif /* !X86_32 */ + +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. 
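 *
 * For example, set_restore_sigmask() below updates ->status with a
 * plain "ti->status |= TS_RESTORE_SIGMASK" rather than an atomic
 * set_bit(), precisely because only the owning task ever writes to
 * its own ->status.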
+ */ +#define TS_USEDFPU 0x0001 /* FPU was used by this task + this quantum (SMP) */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ +#define TS_POLLING 0x0004 /* true if in idle loop + and not sleeping */ +#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ +#define TS_XSAVE 0x0010 /* Use xsave/xrstor */ + +#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) + +#ifndef __ASSEMBLY__ +#define HAVE_SET_RESTORE_SIGMASK 1 +static inline void set_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + ti->status |= TS_RESTORE_SIGMASK; + set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); +} +#endif /* !__ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ +extern void arch_task_cache_init(void); +extern void free_thread_info(struct thread_info *ti); +extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); +#define arch_task_cache_init arch_task_cache_init +#endif +#endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h new file mode 100644 index 000000000000..50c733aac421 --- /dev/null +++ b/arch/x86/include/asm/time.h @@ -0,0 +1,63 @@ +#ifndef _ASM_X86_TIME_H +#define _ASM_X86_TIME_H + +extern void hpet_time_init(void); + +#include <asm/mc146818rtc.h> +#ifdef CONFIG_X86_32 +#include <linux/efi.h> + +static inline unsigned long native_get_wallclock(void) +{ + unsigned long retval; + + if (efi_enabled) + retval = efi_get_time(); + else + retval = mach_get_cmos_time(); + + return retval; +} + +static inline int native_set_wallclock(unsigned long nowtime) +{ + int retval; + + if (efi_enabled) + retval = efi_set_rtc_mmss(nowtime); + else + retval = mach_set_rtc_mmss(nowtime); + + return retval; +} + +#else +extern void native_time_init_hook(void); + +static inline unsigned long native_get_wallclock(void) +{ + return mach_get_cmos_time(); +} + +static inline int native_set_wallclock(unsigned long nowtime) +{ + return mach_set_rtc_mmss(nowtime); +} + +#endif + +extern void time_init(void); + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else /* !CONFIG_PARAVIRT */ + +#define get_wallclock() native_get_wallclock() +#define set_wallclock(x) native_set_wallclock(x) +#define choose_time_init() hpet_time_init + +#endif /* CONFIG_PARAVIRT */ + +extern unsigned long __init calibrate_cpu(void); + +#endif /* _ASM_X86_TIME_H */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h new file mode 100644 index 000000000000..2bb6a835c453 --- /dev/null +++ b/arch/x86/include/asm/timer.h @@ -0,0 +1,66 @@ +#ifndef _ASM_X86_TIMER_H +#define _ASM_X86_TIMER_H +#include <linux/init.h> +#include <linux/pm.h> +#include <linux/percpu.h> + +#define TICK_SIZE (tick_nsec / 1000) + +unsigned long long native_sched_clock(void); +unsigned long native_calibrate_tsc(void); + +#ifdef CONFIG_X86_32 +extern int timer_ack; +extern int recalibrate_cpu_khz(void); +#endif /* CONFIG_X86_32 */ + +extern int no_timer_check; + +#ifndef CONFIG_PARAVIRT +#define calibrate_tsc() native_calibrate_tsc() +#endif + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can 
convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +DECLARE_PER_CPU(unsigned long, cyc2ns); + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline unsigned long long __cycles_2_ns(unsigned long long cyc) +{ + return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + unsigned long flags; + + local_irq_save(flags); + ns = __cycles_2_ns(cyc); + local_irq_restore(flags); + + return ns; +} + +#endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h new file mode 100644 index 000000000000..1287dc1347d6 --- /dev/null +++ b/arch/x86/include/asm/timex.h @@ -0,0 +1,19 @@ +/* x86 architecture timex specifications */ +#ifndef _ASM_X86_TIMEX_H +#define _ASM_X86_TIMEX_H + +#include <asm/processor.h> +#include <asm/tsc.h> + +#ifdef CONFIG_X86_ELAN +# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */ +#elif defined(CONFIG_X86_RDC321X) +# define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */ +#else +# define PIT_TICK_RATE 1193182 /* Underlying HZ */ +#endif +#define CLOCK_TICK_RATE PIT_TICK_RATE + +#define ARCH_HAS_READ_CURRENT_TIMER + +#endif /* _ASM_X86_TIMEX_H */ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h new file mode 100644 index 000000000000..829215fef9ee --- /dev/null +++ b/arch/x86/include/asm/tlb.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_TLB_H +#define _ASM_X86_TLB_H + +#define tlb_start_vma(tlb, vma) do { } while (0) +#define tlb_end_vma(tlb, vma) do { } while (0) +#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) + +#include <asm-generic/tlb.h> + +#endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h new file mode 100644 index 000000000000..0e7bbb549116 --- /dev/null +++ b/arch/x86/include/asm/tlbflush.h @@ -0,0 +1,178 @@ +#ifndef _ASM_X86_TLBFLUSH_H +#define _ASM_X86_TLBFLUSH_H + +#include <linux/mm.h> +#include <linux/sched.h> + +#include <asm/processor.h> +#include <asm/system.h> + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define __flush_tlb() __native_flush_tlb() +#define __flush_tlb_global() __native_flush_tlb_global() +#define __flush_tlb_single(addr) __native_flush_tlb_single(addr) +#endif + +static inline void __native_flush_tlb(void) +{ + write_cr3(read_cr3()); +} + +static inline void __native_flush_tlb_global(void) +{ + unsigned long flags; + unsigned long cr4; + + /* + * Read-modify-write to CR4 - protect it from preemption and + * from interrupts. (Use the raw variant because this code can + * be called from deep inside debugging code.) 
+ */ + raw_local_irq_save(flags); + + cr4 = read_cr4(); + /* clear PGE */ + write_cr4(cr4 & ~X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + write_cr4(cr4); + + raw_local_irq_restore(flags); +} + +static inline void __native_flush_tlb_single(unsigned long addr) +{ + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); +} + +static inline void __flush_tlb_all(void) +{ + if (cpu_has_pge) + __flush_tlb_global(); + else + __flush_tlb(); +} + +static inline void __flush_tlb_one(unsigned long addr) +{ + if (cpu_has_invlpg) + __flush_tlb_single(addr); + else + __flush_tlb(); +} + +#ifdef CONFIG_X86_32 +# define TLB_FLUSH_ALL 0xffffffff +#else +# define TLB_FLUSH_ALL -1ULL +#endif + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular flushes are available only on i486 and up. + * + * x86-64 can only flush individual pages or full VMs. For a range flush + * we always do the full VM. Might be worth trying if for a small + * range a few INVLPGs in a row are a win. + */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +static inline void native_flush_tlb_others(const cpumask_t *cpumask, + struct mm_struct *mm, + unsigned long va) +{ +} + +static inline void reset_lazy_tlbstate(void) +{ +} + +#else /* SMP */ + +#include <asm/smp.h> + +#define local_flush_tlb() __flush_tlb() + +extern void flush_tlb_all(void); +extern void flush_tlb_current_task(void); +extern void flush_tlb_mm(struct mm_struct *); +extern void flush_tlb_page(struct vm_area_struct *, unsigned long); + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm, + unsigned long va); + +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +#ifdef CONFIG_X86_32 +struct tlb_state { + struct mm_struct *active_mm; + int state; + char __cacheline_padding[L1_CACHE_BYTES-8]; +}; +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); + +void reset_lazy_tlbstate(void); +#else +static inline void reset_lazy_tlbstate(void) +{ +} +#endif + +#endif /* SMP */ + +#ifndef CONFIG_PARAVIRT +#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va) +#endif + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} + +#endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h new file mode 100644 index 
000000000000..90ac7718469a --- /dev/null +++ b/arch/x86/include/asm/topology.h @@ -0,0 +1,258 @@ +/* + * Written by: Matthew Dobson, IBM Corporation + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to <colpatch@us.ibm.com> + */ +#ifndef _ASM_X86_TOPOLOGY_H +#define _ASM_X86_TOPOLOGY_H + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT +# define ENABLE_TOPO_DEFINES +# endif +#else +# ifdef CONFIG_SMP +# define ENABLE_TOPO_DEFINES +# endif +#endif + +/* Node not present */ +#define NUMA_NO_NODE (-1) + +#ifdef CONFIG_NUMA +#include <linux/cpumask.h> +#include <asm/mpspec.h> + +#ifdef CONFIG_X86_32 + +/* Mappings between node number and cpus on that node. */ +extern cpumask_t node_to_cpumask_map[]; + +/* Mappings between logical cpu number and node number */ +extern int cpu_to_node_map[]; + +/* Returns the number of the node containing CPU 'cpu' */ +static inline int cpu_to_node(int cpu) +{ + return cpu_to_node_map[cpu]; +} +#define early_cpu_to_node(cpu) cpu_to_node(cpu) + +/* Returns a bitmask of CPUs on Node 'node'. + * + * Side note: this function creates the returned cpumask on the stack + * so with a high NR_CPUS count, excessive stack space is used. The + * node_to_cpumask_ptr function should be used whenever possible. + */ +static inline cpumask_t node_to_cpumask(int node) +{ + return node_to_cpumask_map[node]; +} + +#else /* CONFIG_X86_64 */ + +/* Mappings between node number and cpus on that node. */ +extern cpumask_t *node_to_cpumask_map; + +/* Mappings between logical cpu number and node number */ +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); + +/* Returns the number of the current Node. */ +#define numa_node_id() read_pda(nodenumber) + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +extern int cpu_to_node(int cpu); +extern int early_cpu_to_node(int cpu); +extern const cpumask_t *_node_to_cpumask_ptr(int node); +extern cpumask_t node_to_cpumask(int node); + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +/* Returns the number of the node containing CPU 'cpu' */ +static inline int cpu_to_node(int cpu) +{ + return per_cpu(x86_cpu_to_node_map, cpu); +} + +/* Same function but used if called before per_cpu areas are setup */ +static inline int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + return per_cpu(x86_cpu_to_node_map, cpu); +} + +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +static inline const cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &node_to_cpumask_map[node]; +} + +/* Returns a bitmask of CPUs on Node 'node'. 
*/ +static inline cpumask_t node_to_cpumask(int node) +{ + return node_to_cpumask_map[node]; +} + +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + const cpumask_t *v = _node_to_cpumask_ptr(node) + +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) + +#endif /* CONFIG_X86_64 */ + +/* + * Returns the number of the node containing Node 'node'. This + * architecture is flat, so it is a pretty simple function! + */ +#define parent_node(node) (node) + +#define pcibus_to_node(bus) __pcibus_to_node(bus) +#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus) + +#ifdef CONFIG_X86_32 +extern unsigned long node_start_pfn[]; +extern unsigned long node_end_pfn[]; +extern unsigned long node_remap_size[]; +#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) + +# define SD_CACHE_NICE_TRIES 1 +# define SD_IDLE_IDX 1 +# define SD_NEWIDLE_IDX 2 +# define SD_FORKEXEC_IDX 0 + +#else + +# define SD_CACHE_NICE_TRIES 2 +# define SD_IDLE_IDX 2 +# define SD_NEWIDLE_IDX 2 +# define SD_FORKEXEC_IDX 1 + +#endif + +/* sched_domains SD_NODE_INIT for NUMAQ machines */ +#define SD_NODE_INIT (struct sched_domain) { \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_nice_tries = SD_CACHE_NICE_TRIES, \ + .busy_idx = 3, \ + .idle_idx = SD_IDLE_IDX, \ + .newidle_idx = SD_NEWIDLE_IDX, \ + .wake_idx = 1, \ + .forkexec_idx = SD_FORKEXEC_IDX, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_EXEC \ + | SD_BALANCE_FORK \ + | SD_SERIALIZE \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ +} + +#ifdef CONFIG_X86_64_ACPI_NUMA +extern int __node_distance(int, int); +#define node_distance(a, b) __node_distance(a, b) +#endif + +#else /* !CONFIG_NUMA */ + +#define numa_node_id() 0 +#define cpu_to_node(cpu) 0 +#define early_cpu_to_node(cpu) 0 + +static inline const cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &cpu_online_map; +} +static inline cpumask_t node_to_cpumask(int node) +{ + return cpu_online_map; +} +static inline int node_to_first_cpu(int node) +{ + return first_cpu(cpu_online_map); +} + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + const cpumask_t *v = _node_to_cpumask_ptr(node) + +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) +#endif + +#include <asm-generic/topology.h> + +#ifdef CONFIG_NUMA +/* Returns the number of the first CPU on Node 'node'. 
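 *
 * (Sketch of the accessor pattern, for illustration only;
 * for_each_cpu_mask_nr() comes from <linux/cpumask.h> and
 * bring_up_cpu() is a hypothetical helper:
 *
 *	node_to_cpumask_ptr(mask, nid);
 *	for_each_cpu_mask_nr(cpu, *mask)
 *		bring_up_cpu(cpu);
 *
 * This avoids copying a whole cpumask_t onto the stack, which the
 * comment above node_to_cpumask() warns about.)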
*/ +static inline int node_to_first_cpu(int node) +{ + node_to_cpumask_ptr(mask, node); + return first_cpu(*mask); +} +#endif + +extern cpumask_t cpu_coregroup_map(int cpu); + +#ifdef ENABLE_TOPO_DEFINES +#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) +#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) +#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) + +/* indicates that pointers to the topology cpumask_t maps are valid */ +#define arch_provides_topology_pointers yes +#endif + +static inline void arch_fix_phys_package_id(int num, u32 slot) +{ +} + +struct pci_bus; +void set_pci_bus_resources_arch_default(struct pci_bus *b); + +#ifdef CONFIG_SMP +#define mc_capable() (boot_cpu_data.x86_max_cores > 1) +#define smt_capable() (smp_num_siblings > 1) +#endif + +#ifdef CONFIG_NUMA +extern int get_mp_bus_to_node(int busnum); +extern void set_mp_bus_to_node(int busnum, int node); +#else +static inline int get_mp_bus_to_node(int busnum) +{ + return 0; +} +static inline void set_mp_bus_to_node(int busnum, int node) +{ +} +#endif + +#endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h new file mode 100644 index 000000000000..fa0d79facdbc --- /dev/null +++ b/arch/x86/include/asm/trampoline.h @@ -0,0 +1,21 @@ +#ifndef _ASM_X86_TRAMPOLINE_H +#define _ASM_X86_TRAMPOLINE_H + +#ifndef __ASSEMBLY__ + +/* + * Trampoline 80x86 program as an array. + */ +extern const unsigned char trampoline_data []; +extern const unsigned char trampoline_end []; +extern unsigned char *trampoline_base; + +extern unsigned long init_rsp; +extern unsigned long initial_code; + +#define TRAMPOLINE_BASE 0x6000 +extern unsigned long setup_trampoline(void); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_TRAMPOLINE_H */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h new file mode 100644 index 000000000000..45dee286e45c --- /dev/null +++ b/arch/x86/include/asm/traps.h @@ -0,0 +1,81 @@ +#ifndef _ASM_X86_TRAPS_H +#define _ASM_X86_TRAPS_H + +#include <asm/debugreg.h> + +#ifdef CONFIG_X86_32 +#define dotraplinkage +#else +#define dotraplinkage asmlinkage +#endif + +asmlinkage void divide_error(void); +asmlinkage void debug(void); +asmlinkage void nmi(void); +asmlinkage void int3(void); +asmlinkage void overflow(void); +asmlinkage void bounds(void); +asmlinkage void invalid_op(void); +asmlinkage void device_not_available(void); +#ifdef CONFIG_X86_64 +asmlinkage void double_fault(void); +#endif +asmlinkage void coprocessor_segment_overrun(void); +asmlinkage void invalid_TSS(void); +asmlinkage void segment_not_present(void); +asmlinkage void stack_segment(void); +asmlinkage void general_protection(void); +asmlinkage void page_fault(void); +asmlinkage void spurious_interrupt_bug(void); +asmlinkage void coprocessor_error(void); +asmlinkage void alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void simd_coprocessor_error(void); + +dotraplinkage void do_divide_error(struct pt_regs *, long); +dotraplinkage void do_debug(struct pt_regs *, long); +dotraplinkage void do_nmi(struct pt_regs *, long); +dotraplinkage void do_int3(struct pt_regs *, long); +dotraplinkage void do_overflow(struct pt_regs *, long); +dotraplinkage void do_bounds(struct pt_regs *, long); +dotraplinkage void do_invalid_op(struct pt_regs *, long); +dotraplinkage void do_device_not_available(struct 
pt_regs *, long); +dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); +dotraplinkage void do_invalid_TSS(struct pt_regs *, long); +dotraplinkage void do_segment_not_present(struct pt_regs *, long); +dotraplinkage void do_stack_segment(struct pt_regs *, long); +dotraplinkage void do_general_protection(struct pt_regs *, long); +dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); +dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); +dotraplinkage void do_coprocessor_error(struct pt_regs *, long); +dotraplinkage void do_alignment_check(struct pt_regs *, long); +#ifdef CONFIG_X86_MCE +dotraplinkage void do_machine_check(struct pt_regs *, long); +#endif +dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); +#ifdef CONFIG_X86_32 +dotraplinkage void do_iret_error(struct pt_regs *, long); +#endif + +static inline int get_si_code(unsigned long condition) +{ + if (condition & DR_STEP) + return TRAP_TRACE; + else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) + return TRAP_HWBKPT; + else + return TRAP_BRKPT; +} + +extern int panic_on_unrecovered_nmi; +extern int kstack_depth_to_print; + +#ifdef CONFIG_X86_32 +void math_error(void __user *); +unsigned long patch_espfix_desc(unsigned long, unsigned long); +asmlinkage void math_emulate(long); +#endif + +#endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h new file mode 100644 index 000000000000..38ae163cc91b --- /dev/null +++ b/arch/x86/include/asm/tsc.h @@ -0,0 +1,62 @@ +/* + * x86 TSC related functions + */ +#ifndef _ASM_X86_TSC_H +#define _ASM_X86_TSC_H + +#include <asm/processor.h> + +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitrarily chosen */ + +/* + * Standard way to access the cycle counter. + */ +typedef unsigned long long cycles_t; + +extern unsigned int cpu_khz; +extern unsigned int tsc_khz; + +extern void disable_TSC(void); + +static inline cycles_t get_cycles(void) +{ + unsigned long long ret = 0; + +#ifndef CONFIG_X86_TSC + if (!cpu_has_tsc) + return 0; +#endif + rdtscll(ret); + + return ret; +} + +static __always_inline cycles_t vget_cycles(void) +{ + /* + * We only do VDSOs on TSC capable CPUs, so this shouldn't + * access boot_cpu_data (which is not VDSO-safe): + */ +#ifndef CONFIG_X86_TSC + if (!cpu_has_tsc) + return 0; +#endif + return (cycles_t)__native_read_tsc(); +} + +extern void tsc_init(void); +extern void mark_tsc_unstable(char *reason); +extern int unsynchronized_tsc(void); +int check_tsc_unstable(void); + +/* + * Boot-time check whether the TSCs are synchronized across + * all CPUs/cores: + */ +extern void check_tsc_sync_source(int cpu); +extern void check_tsc_sync_target(void); + +extern int notsc_setup(char *); + +#endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h new file mode 100644 index 000000000000..e6f736320077 --- /dev/null +++ b/arch/x86/include/asm/types.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_TYPES_H +#define _ASM_X86_TYPES_H + +#include <asm-generic/int-ll64.h> + +#ifndef __ASSEMBLY__ + +typedef unsigned short umode_t; + +#endif /* __ASSEMBLY__ */ + +/* + * These aren't exported outside the kernel to avoid name space clashes + */ +#ifdef __KERNEL__ + +#ifdef CONFIG_X86_32 +# define BITS_PER_LONG 32 +#else +# define BITS_PER_LONG 64 +#endif + +#ifndef __ASSEMBLY__ + +typedef u64 dma64_addr_t; +#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) +/* DMA addresses come in 32-bit and 64-bit flavours.
*/ +typedef u64 dma_addr_t; +#else +typedef u32 dma_addr_t; +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h new file mode 100644 index 000000000000..35c54921b2e4 --- /dev/null +++ b/arch/x86/include/asm/uaccess.h @@ -0,0 +1,454 @@ +#ifndef _ASM_X86_UACCESS_H +#define _ASM_X86_UACCESS_H +/* + * User space memory access functions + */ +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/thread_info.h> +#include <linux/prefetch.h> +#include <linux/string.h> +#include <asm/asm.h> +#include <asm/page.h> + +#define VERIFY_READ 0 +#define VERIFY_WRITE 1 + +/* + * The fs value determines whether argument validity checking should be + * performed or not. If get_fs() == USER_DS, checking is performed, with + * get_fs() == KERNEL_DS, checking is bypassed. + * + * For historical reasons, these macros are grossly misnamed. + */ + +#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) + +#define KERNEL_DS MAKE_MM_SEG(-1UL) +#define USER_DS MAKE_MM_SEG(PAGE_OFFSET) + +#define get_ds() (KERNEL_DS) +#define get_fs() (current_thread_info()->addr_limit) +#define set_fs(x) (current_thread_info()->addr_limit = (x)) + +#define segment_eq(a, b) ((a).seg == (b).seg) + +#define __addr_ok(addr) \ + ((unsigned long __force)(addr) < \ + (current_thread_info()->addr_limit.seg)) + +/* + * Test whether a block of memory is a valid user space address. + * Returns 0 if the range is valid, nonzero otherwise. + * + * This is equivalent to the following test: + * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) + * + * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... + */ + +#define __range_not_ok(addr, size) \ +({ \ + unsigned long flag, roksum; \ + __chk_user_ptr(addr); \ + asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ + : "=&r" (flag), "=r" (roksum) \ + : "1" (addr), "g" ((long)(size)), \ + "rm" (current_thread_info()->addr_limit.seg)); \ + flag; \ +}) + +/** + * access_ok: - Checks if a user space pointer is valid + * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that + * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe + * to write to a block, it is always safe to read from it. + * @addr: User space pointer to start of block to check + * @size: Size of block to check + * + * Context: User context only. This function may sleep. + * + * Checks if a pointer to a block of memory in user space is valid. + * + * Returns true (nonzero) if the memory block may be valid, false (zero) + * if it is definitely invalid. + * + * Note that, depending on architecture, this function probably just + * checks that the pointer is in the user space range - after calling + * this function, memory access functions may still return -EFAULT. + */ +#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) + +/* + * The exception table consists of pairs of addresses: the first is the + * address of an instruction that is allowed to fault, and the second is + * the address at which the program should continue. No registers are + * modified, so it is entirely up to the continuation code to figure out + * what to do. + * + * All the routines below use bits of fixup code that are out of line + * with the main instruction path. This means when everything is well, + * we don't even have to jump over them. Further, they do not intrude + * on our cache or tlb entries. 
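 *
 * The resulting pattern, as used by loadsegment() in <asm/system.h>
 * earlier in this patch, looks roughly like (sketch):
 *
 *	1:	<instruction that may fault>
 *	2:	...
 *	.section .fixup,"ax"
 *	3:	<recovery code>
 *		jmp 2b
 *	.previous
 *	_ASM_EXTABLE(1b, 3b)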
+ */ + +struct exception_table_entry { + unsigned long insn, fixup; +}; + +extern int fixup_exception(struct pt_regs *regs); + +/* + * These are the main single-value transfer routines. They automatically + * use the right size if we just have the right pointer type. + * + * This gets kind of ugly. We want to return _two_ values in "get_user()" + * and yet we don't want to do any pointers, because that is too much + * of a performance impact. Thus we have a few rather ugly macros here, + * and hide all the ugliness from the user. + * + * The "__xxx" versions of the user access functions are versions that + * do not verify the address space, that must have been done previously + * with a separate "access_ok()" call (this is used when we do multiple + * accesses to the same area of user memory). + */ + +extern int __get_user_1(void); +extern int __get_user_2(void); +extern int __get_user_4(void); +extern int __get_user_8(void); +extern int __get_user_bad(void); + +#define __get_user_x(size, ret, x, ptr) \ + asm volatile("call __get_user_" #size \ + : "=a" (ret),"=d" (x) \ + : "0" (ptr)) \ + +/* Careful: we have to cast the result to the type of the pointer + * for sign reasons */ + +/** + * get_user: - Get a simple variable from user space. + * @x: Variable to store result. + * @ptr: Source address, in user space. + * + * Context: User context only. This function may sleep. + * + * This macro copies a single simple variable from user space to kernel + * space. It supports simple types like char and int, but not larger + * data types like structures or arrays. + * + * @ptr must have pointer-to-simple-variable type, and the result of + * dereferencing @ptr must be assignable to @x without a cast. + * + * Returns zero on success, or -EFAULT on error. + * On error, the variable @x is set to zero. 
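+ *
+ * Editor's note: a minimal usage sketch; the ioctl-style 'arg'
+ * pointer is hypothetical:
+ *
+ *	int val;
+ *
+ *	if (get_user(val, (int __user *)arg))
+ *		return -EFAULT;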
+ */ +#ifdef CONFIG_X86_32 +#define __get_user_8(__ret_gu, __val_gu, ptr) \ + __get_user_x(X, __ret_gu, __val_gu, ptr) +#else +#define __get_user_8(__ret_gu, __val_gu, ptr) \ + __get_user_x(8, __ret_gu, __val_gu, ptr) +#endif + +#define get_user(x, ptr) \ +({ \ + int __ret_gu; \ + unsigned long __val_gu; \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: \ + __get_user_x(1, __ret_gu, __val_gu, ptr); \ + break; \ + case 2: \ + __get_user_x(2, __ret_gu, __val_gu, ptr); \ + break; \ + case 4: \ + __get_user_x(4, __ret_gu, __val_gu, ptr); \ + break; \ + case 8: \ + __get_user_8(__ret_gu, __val_gu, ptr); \ + break; \ + default: \ + __get_user_x(X, __ret_gu, __val_gu, ptr); \ + break; \ + } \ + (x) = (__typeof__(*(ptr)))__val_gu; \ + __ret_gu; \ +}) + +#define __put_user_x(size, x, ptr, __ret_pu) \ + asm volatile("call __put_user_" #size : "=a" (__ret_pu) \ + :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") + + + +#ifdef CONFIG_X86_32 +#define __put_user_u64(x, addr, err) \ + asm volatile("1: movl %%eax,0(%2)\n" \ + "2: movl %%edx,4(%2)\n" \ + "3:\n" \ + ".section .fixup,\"ax\"\n" \ + "4: movl %3,%0\n" \ + " jmp 3b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ + : "=r" (err) \ + : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __put_user_x8(x, ptr, __ret_pu) \ + asm volatile("call __put_user_8" : "=a" (__ret_pu) \ + : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") +#else +#define __put_user_u64(x, ptr, retval) \ + __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT) +#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) +#endif + +extern void __put_user_bad(void); + +/* + * Strange magic calling convention: pointer in %ecx, + * value in %eax(:%edx), return value in %eax. clobbers %rbx + */ +extern void __put_user_1(void); +extern void __put_user_2(void); +extern void __put_user_4(void); +extern void __put_user_8(void); + +#ifdef CONFIG_X86_WP_WORKS_OK + +/** + * put_user: - Write a simple value into user space. + * @x: Value to copy to user space. + * @ptr: Destination address, in user space. + * + * Context: User context only. This function may sleep. + * + * This macro copies a single simple value from kernel space to user + * space. It supports simple types like char and int, but not larger + * data types like structures or arrays. + * + * @ptr must have pointer-to-simple-variable type, and @x must be assignable + * to the result of dereferencing @ptr. + * + * Returns zero on success, or -EFAULT on error. 
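+ *
+ * Editor's note: the mirror image of the get_user() sketch above,
+ * again with a hypothetical 'arg':
+ *
+ *	if (put_user(val, (int __user *)arg))
+ *		return -EFAULT;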
+ */ +#define put_user(x, ptr) \ +({ \ + int __ret_pu; \ + __typeof__(*(ptr)) __pu_val; \ + __chk_user_ptr(ptr); \ + __pu_val = x; \ + switch (sizeof(*(ptr))) { \ + case 1: \ + __put_user_x(1, __pu_val, ptr, __ret_pu); \ + break; \ + case 2: \ + __put_user_x(2, __pu_val, ptr, __ret_pu); \ + break; \ + case 4: \ + __put_user_x(4, __pu_val, ptr, __ret_pu); \ + break; \ + case 8: \ + __put_user_x8(__pu_val, ptr, __ret_pu); \ + break; \ + default: \ + __put_user_x(X, __pu_val, ptr, __ret_pu); \ + break; \ + } \ + __ret_pu; \ +}) + +#define __put_user_size(x, ptr, size, retval, errret) \ +do { \ + retval = 0; \ + __chk_user_ptr(ptr); \ + switch (size) { \ + case 1: \ + __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \ + break; \ + case 2: \ + __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \ + break; \ + case 4: \ + __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\ + break; \ + case 8: \ + __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \ + break; \ + default: \ + __put_user_bad(); \ + } \ +} while (0) + +#else + +#define __put_user_size(x, ptr, size, retval, errret) \ +do { \ + __typeof__(*(ptr))__pus_tmp = x; \ + retval = 0; \ + \ + if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \ + retval = errret; \ +} while (0) + +#define put_user(x, ptr) \ +({ \ + int __ret_pu; \ + __typeof__(*(ptr))__pus_tmp = x; \ + __ret_pu = 0; \ + if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \ + sizeof(*(ptr))) != 0)) \ + __ret_pu = -EFAULT; \ + __ret_pu; \ +}) +#endif + +#ifdef CONFIG_X86_32 +#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() +#else +#define __get_user_asm_u64(x, ptr, retval, errret) \ + __get_user_asm(x, ptr, retval, "q", "", "=r", errret) +#endif + +#define __get_user_size(x, ptr, size, retval, errret) \ +do { \ + retval = 0; \ + __chk_user_ptr(ptr); \ + switch (size) { \ + case 1: \ + __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \ + break; \ + case 2: \ + __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \ + break; \ + case 4: \ + __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \ + break; \ + case 8: \ + __get_user_asm_u64(x, ptr, retval, errret); \ + break; \ + default: \ + (x) = __get_user_bad(); \ + } \ +} while (0) + +#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ + asm volatile("1: mov"itype" %2,%"rtype"1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: mov %3,%0\n" \ + " xor"itype" %"rtype"1,%"rtype"1\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "=r" (err), ltype(x) \ + : "m" (__m(addr)), "i" (errret), "0" (err)) + +#define __put_user_nocheck(x, ptr, size) \ +({ \ + long __pu_err; \ + __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __pu_err; \ +}) + +#define __get_user_nocheck(x, ptr, size) \ +({ \ + long __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __gu_err; \ +}) + +/* FIXME: this hack is definitely wrong -AK */ +struct __large_struct { unsigned long buf[100]; }; +#define __m(x) (*(struct __large_struct __user *)(x)) + +/* + * Tell gcc we read from memory instead of writing: this is because + * we do not write to any memory gcc knows about, so there are no + * aliasing issues. 
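+ *
+ * Editor's note: the __large_struct cast above serves the same end
+ * from the other direction: it hands gcc an "m" operand wide enough
+ * to cover the real access, so __put_user_asm() needs no blanket
+ * "memory" clobber.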
+ */ +#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ + asm volatile("1: mov"itype" %"rtype"1,%2\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: mov %3,%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "=r"(err) \ + : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) +/** + * __get_user: - Get a simple variable from user space, with less checking. + * @x: Variable to store result. + * @ptr: Source address, in user space. + * + * Context: User context only. This function may sleep. + * + * This macro copies a single simple variable from user space to kernel + * space. It supports simple types like char and int, but not larger + * data types like structures or arrays. + * + * @ptr must have pointer-to-simple-variable type, and the result of + * dereferencing @ptr must be assignable to @x without a cast. + * + * Caller must check the pointer with access_ok() before calling this + * function. + * + * Returns zero on success, or -EFAULT on error. + * On error, the variable @x is set to zero. + */ + +#define __get_user(x, ptr) \ + __get_user_nocheck((x), (ptr), sizeof(*(ptr))) +/** + * __put_user: - Write a simple value into user space, with less checking. + * @x: Value to copy to user space. + * @ptr: Destination address, in user space. + * + * Context: User context only. This function may sleep. + * + * This macro copies a single simple value from kernel space to user + * space. It supports simple types like char and int, but not larger + * data types like structures or arrays. + * + * @ptr must have pointer-to-simple-variable type, and @x must be assignable + * to the result of dereferencing @ptr. + * + * Caller must check the pointer with access_ok() before calling this + * function. + * + * Returns zero on success, or -EFAULT on error. + */ + +#define __put_user(x, ptr) \ + __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) + +#define __get_user_unaligned __get_user +#define __put_user_unaligned __put_user + +/* + * movsl can be slow when source and dest are not both 8-byte aligned + */ +#ifdef CONFIG_X86_INTEL_USERCOPY +extern struct movsl_mask { + int mask; +} ____cacheline_aligned_in_smp movsl_mask; +#endif + +#define ARCH_HAS_NOCACHE_UACCESS 1 + +#ifdef CONFIG_X86_32 +# include "uaccess_32.h" +#else +# define ARCH_HAS_SEARCH_EXTABLE +# include "uaccess_64.h" +#endif + +#endif /* _ASM_X86_UACCESS_H */ + diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h new file mode 100644 index 000000000000..d095a3aeea1b --- /dev/null +++ b/arch/x86/include/asm/uaccess_32.h @@ -0,0 +1,218 @@ +#ifndef _ASM_X86_UACCESS_32_H +#define _ASM_X86_UACCESS_32_H + +/* + * User space memory access functions + */ +#include <linux/errno.h> +#include <linux/thread_info.h> +#include <linux/prefetch.h> +#include <linux/string.h> +#include <asm/asm.h> +#include <asm/page.h> + +unsigned long __must_check __copy_to_user_ll + (void __user *to, const void *from, unsigned long n); +unsigned long __must_check __copy_from_user_ll + (void *to, const void __user *from, unsigned long n); +unsigned long __must_check __copy_from_user_ll_nozero + (void *to, const void __user *from, unsigned long n); +unsigned long __must_check __copy_from_user_ll_nocache + (void *to, const void __user *from, unsigned long n); +unsigned long __must_check __copy_from_user_ll_nocache_nozero + (void *to, const void __user *from, unsigned long n); + +/** + * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. 
+ * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. + * + * Copy data from kernel space to user space. Caller must check + * the specified block with access_ok() before calling this function. + * The caller should also make sure the user space address is pinned + * so that we don't take a page fault and sleep. + * + * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault + * we return the initial request size (1, 2 or 4), as copy_*_user should do. + * If a store crosses a page boundary and gets a fault, the x86 will not write + * anything, so this is accurate. + */ + +static __always_inline unsigned long __must_check +__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) +{ + if (__builtin_constant_p(n)) { + unsigned long ret; + + switch (n) { + case 1: + __put_user_size(*(u8 *)from, (u8 __user *)to, + 1, ret, 1); + return ret; + case 2: + __put_user_size(*(u16 *)from, (u16 __user *)to, + 2, ret, 2); + return ret; + case 4: + __put_user_size(*(u32 *)from, (u32 __user *)to, + 4, ret, 4); + return ret; + } + } + return __copy_to_user_ll(to, from, n); +} + +/** + * __copy_to_user: - Copy a block of data into user space, with less checking. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep. + * + * Copy data from kernel space to user space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + */ +static __always_inline unsigned long __must_check +__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + might_sleep(); + return __copy_to_user_inatomic(to, from, n); +} + +static __always_inline unsigned long +__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) +{ + /* Avoid zeroing the tail if the copy fails. + * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, + * but as the zeroing behaviour is only significant when n is not + * constant, that shouldn't be a problem. + */ + if (__builtin_constant_p(n)) { + unsigned long ret; + + switch (n) { + case 1: + __get_user_size(*(u8 *)to, from, 1, ret, 1); + return ret; + case 2: + __get_user_size(*(u16 *)to, from, 2, ret, 2); + return ret; + case 4: + __get_user_size(*(u32 *)to, from, 4, ret, 4); + return ret; + } + } + return __copy_from_user_ll_nozero(to, from, n); +} + +/** + * __copy_from_user: - Copy a block of data from user space, with less checking. + * @to: Destination address, in kernel space. + * @from: Source address, in user space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep. + * + * Copy data from user space to kernel space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + * + * If some data could not be copied, this function will pad the copied + * data to the requested size using zero bytes. + * + * An alternate version - __copy_from_user_inatomic() - may be called from + * atomic context and will fail rather than sleep. In this case the + * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h + * for explanation of why this is needed. 
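+ *
+ * Editor's note: a minimal sketch of the constant-size fast path;
+ * with a compile-time-constant size the switch below collapses to a
+ * single inlined __get_user_size(). 'uptr' is assumed to have been
+ * checked with access_ok() already:
+ *
+ *	u32 v;
+ *
+ *	if (__copy_from_user(&v, uptr, sizeof(v)))
+ *		return -EFAULT;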
+ */ +static __always_inline unsigned long +__copy_from_user(void *to, const void __user *from, unsigned long n) +{ + might_sleep(); + if (__builtin_constant_p(n)) { + unsigned long ret; + + switch (n) { + case 1: + __get_user_size(*(u8 *)to, from, 1, ret, 1); + return ret; + case 2: + __get_user_size(*(u16 *)to, from, 2, ret, 2); + return ret; + case 4: + __get_user_size(*(u32 *)to, from, 4, ret, 4); + return ret; + } + } + return __copy_from_user_ll(to, from, n); +} + +static __always_inline unsigned long __copy_from_user_nocache(void *to, + const void __user *from, unsigned long n) +{ + might_sleep(); + if (__builtin_constant_p(n)) { + unsigned long ret; + + switch (n) { + case 1: + __get_user_size(*(u8 *)to, from, 1, ret, 1); + return ret; + case 2: + __get_user_size(*(u16 *)to, from, 2, ret, 2); + return ret; + case 4: + __get_user_size(*(u32 *)to, from, 4, ret, 4); + return ret; + } + } + return __copy_from_user_ll_nocache(to, from, n); +} + +static __always_inline unsigned long +__copy_from_user_inatomic_nocache(void *to, const void __user *from, + unsigned long n) +{ + return __copy_from_user_ll_nocache_nozero(to, from, n); +} + +unsigned long __must_check copy_to_user(void __user *to, + const void *from, unsigned long n); +unsigned long __must_check copy_from_user(void *to, + const void __user *from, + unsigned long n); +long __must_check strncpy_from_user(char *dst, const char __user *src, + long count); +long __must_check __strncpy_from_user(char *dst, + const char __user *src, long count); + +/** + * strlen_user: - Get the size of a string in user space. + * @str: The string to measure. + * + * Context: User context only. This function may sleep. + * + * Get the size of a NUL-terminated string in user space. + * + * Returns the size of the string INCLUDING the terminating NUL. + * On exception, returns 0. + * + * If there is a limit on the length of a valid string, you may wish to + * consider using strnlen_user() instead. 
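+ *
+ * Editor's note: a bounded sketch using strncpy_from_user(); the
+ * buffer size and 'ustr' are hypothetical. On success the return
+ * value is the string length, which excludes the NUL:
+ *
+ *	char buf[64];
+ *	long len = strncpy_from_user(buf, ustr, sizeof(buf) - 1);
+ *
+ *	if (len < 0)
+ *		return len;
+ *	buf[len] = '\0';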
+ */ +#define strlen_user(str) strnlen_user(str, LONG_MAX) + +long strnlen_user(const char __user *str, long n); +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); + +#endif /* _ASM_X86_UACCESS_32_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h new file mode 100644 index 000000000000..664f15280f14 --- /dev/null +++ b/arch/x86/include/asm/uaccess_64.h @@ -0,0 +1,202 @@ +#ifndef _ASM_X86_UACCESS_64_H +#define _ASM_X86_UACCESS_64_H + +/* + * User space memory access functions + */ +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/prefetch.h> +#include <linux/lockdep.h> +#include <asm/page.h> + +/* + * Copy To/From Userspace + */ + +/* Handles exceptions in both to and from, but doesn't do access_ok */ +__must_check unsigned long +copy_user_generic(void *to, const void *from, unsigned len); + +__must_check unsigned long +copy_to_user(void __user *to, const void *from, unsigned len); +__must_check unsigned long +copy_from_user(void *to, const void __user *from, unsigned len); +__must_check unsigned long +copy_in_user(void __user *to, const void __user *from, unsigned len); + +static __always_inline __must_check +int __copy_from_user(void *dst, const void __user *src, unsigned size) +{ + int ret = 0; + if (!__builtin_constant_p(size)) + return copy_user_generic(dst, (__force void *)src, size); + switch (size) { + case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + ret, "b", "b", "=q", 1); + return ret; + case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + ret, "w", "w", "=r", 2); + return ret; + case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + ret, "l", "k", "=r", 4); + return ret; + case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 8); + return ret; + case 10: + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 16); + if (unlikely(ret)) + return ret; + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + (char __user *)src), + ret, "w", "w", "=r", 2); + return ret; + case 16: + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 16); + if (unlikely(ret)) + return ret; + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + return ret; + default: + return copy_user_generic(dst, (__force void *)src, size); + } +} + +static __always_inline __must_check +int __copy_to_user(void __user *dst, const void *src, unsigned size) +{ + int ret = 0; + if (!__builtin_constant_p(size)) + return copy_user_generic((__force void *)dst, src, size); + switch (size) { + case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + ret, "b", "b", "iq", 1); + return ret; + case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + ret, "w", "w", "ir", 2); + return ret; + case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + ret, "l", "k", "ir", 4); + return ret; + case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + ret, "q", "", "ir", 8); + return ret; + case 10: + __put_user_asm(*(u64 *)src, (u64 __user *)dst, + ret, "q", "", "ir", 10); + if (unlikely(ret)) + return ret; + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + return ret; + case 16: + __put_user_asm(*(u64 *)src, (u64 __user *)dst, + ret, "q", "", "ir", 16); + if (unlikely(ret)) + return ret; + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "ir", 8); + return 
ret; + default: + return copy_user_generic((__force void *)dst, src, size); + } +} + +static __always_inline __must_check +int __copy_in_user(void __user *dst, const void __user *src, unsigned size) +{ + int ret = 0; + if (!__builtin_constant_p(size)) + return copy_user_generic((__force void *)dst, + (__force void *)src, size); + switch (size) { + case 1: { + u8 tmp; + __get_user_asm(tmp, (u8 __user *)src, + ret, "b", "b", "=q", 1); + if (likely(!ret)) + __put_user_asm(tmp, (u8 __user *)dst, + ret, "b", "b", "iq", 1); + return ret; + } + case 2: { + u16 tmp; + __get_user_asm(tmp, (u16 __user *)src, + ret, "w", "w", "=r", 2); + if (likely(!ret)) + __put_user_asm(tmp, (u16 __user *)dst, + ret, "w", "w", "ir", 2); + return ret; + } + + case 4: { + u32 tmp; + __get_user_asm(tmp, (u32 __user *)src, + ret, "l", "k", "=r", 4); + if (likely(!ret)) + __put_user_asm(tmp, (u32 __user *)dst, + ret, "l", "k", "ir", 4); + return ret; + } + case 8: { + u64 tmp; + __get_user_asm(tmp, (u64 __user *)src, + ret, "q", "", "=r", 8); + if (likely(!ret)) + __put_user_asm(tmp, (u64 __user *)dst, + ret, "q", "", "ir", 8); + return ret; + } + default: + return copy_user_generic((__force void *)dst, + (__force void *)src, size); + } +} + +__must_check long +strncpy_from_user(char *dst, const char __user *src, long count); +__must_check long +__strncpy_from_user(char *dst, const char __user *src, long count); +__must_check long strnlen_user(const char __user *str, long n); +__must_check long __strnlen_user(const char __user *str, long n); +__must_check long strlen_user(const char __user *str); +__must_check unsigned long clear_user(void __user *mem, unsigned long len); +__must_check unsigned long __clear_user(void __user *mem, unsigned long len); + +__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, + unsigned size); + +static __must_check __always_inline int +__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) +{ + return copy_user_generic((__force void *)dst, src, size); +} + +extern long __copy_user_nocache(void *dst, const void __user *src, + unsigned size, int zerorest); + +static inline int __copy_from_user_nocache(void *dst, const void __user *src, + unsigned size) +{ + might_sleep(); + return __copy_user_nocache(dst, src, size, 1); +} + +static inline int __copy_from_user_inatomic_nocache(void *dst, + const void __user *src, + unsigned size) +{ + return __copy_user_nocache(dst, src, size, 0); +} + +unsigned long +copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest); + +#endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h new file mode 100644 index 000000000000..87324cf439d9 --- /dev/null +++ b/arch/x86/include/asm/ucontext.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_UCONTEXT_H +#define _ASM_X86_UCONTEXT_H + +#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state + * information in the memory layout pointed + * by the fpstate pointer in the ucontext's + * sigcontext struct (uc_mcontext). 
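+ *
+ * Editor's note: a hypothetical user-space probe for this flag from
+ * an SA_SIGINFO handler (signature per sigaction(2)); here
+ * handle_xstate() is an invented helper:
+ *
+ *	void handler(int sig, siginfo_t *info, void *ctx)
+ *	{
+ *		struct ucontext *uc = ctx;
+ *
+ *		if (uc->uc_flags & UC_FP_XSTATE)
+ *			handle_xstate(&uc->uc_mcontext);
+ *	}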
+ */ + +struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + struct sigcontext uc_mcontext; + sigset_t uc_sigmask; /* mask last for extensibility */ +}; + +#endif /* _ASM_X86_UCONTEXT_H */ diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h new file mode 100644 index 000000000000..a7bd416b4763 --- /dev/null +++ b/arch/x86/include/asm/unaligned.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_UNALIGNED_H +#define _ASM_X86_UNALIGNED_H + +/* + * The x86 can do unaligned accesses itself. + */ + +#include <linux/unaligned/access_ok.h> +#include <linux/unaligned/generic.h> + +#define get_unaligned __get_unaligned_le +#define put_unaligned __put_unaligned_le + +#endif /* _ASM_X86_UNALIGNED_H */ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h new file mode 100644 index 000000000000..2a58ed3e51d8 --- /dev/null +++ b/arch/x86/include/asm/unistd.h @@ -0,0 +1,13 @@ +#ifdef __KERNEL__ +# ifdef CONFIG_X86_32 +# include "unistd_32.h" +# else +# include "unistd_64.h" +# endif +#else +# ifdef __i386__ +# include "unistd_32.h" +# else +# include "unistd_64.h" +# endif +#endif diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h new file mode 100644 index 000000000000..f2bba78430a4 --- /dev/null +++ b/arch/x86/include/asm/unistd_32.h @@ -0,0 +1,379 @@ +#ifndef _ASM_X86_UNISTD_32_H +#define _ASM_X86_UNISTD_32_H + +/* + * This file contains the system call numbers. + */ + +#define __NR_restart_syscall 0 +#define __NR_exit 1 +#define __NR_fork 2 +#define __NR_read 3 +#define __NR_write 4 +#define __NR_open 5 +#define __NR_close 6 +#define __NR_waitpid 7 +#define __NR_creat 8 +#define __NR_link 9 +#define __NR_unlink 10 +#define __NR_execve 11 +#define __NR_chdir 12 +#define __NR_time 13 +#define __NR_mknod 14 +#define __NR_chmod 15 +#define __NR_lchown 16 +#define __NR_break 17 +#define __NR_oldstat 18 +#define __NR_lseek 19 +#define __NR_getpid 20 +#define __NR_mount 21 +#define __NR_umount 22 +#define __NR_setuid 23 +#define __NR_getuid 24 +#define __NR_stime 25 +#define __NR_ptrace 26 +#define __NR_alarm 27 +#define __NR_oldfstat 28 +#define __NR_pause 29 +#define __NR_utime 30 +#define __NR_stty 31 +#define __NR_gtty 32 +#define __NR_access 33 +#define __NR_nice 34 +#define __NR_ftime 35 +#define __NR_sync 36 +#define __NR_kill 37 +#define __NR_rename 38 +#define __NR_mkdir 39 +#define __NR_rmdir 40 +#define __NR_dup 41 +#define __NR_pipe 42 +#define __NR_times 43 +#define __NR_prof 44 +#define __NR_brk 45 +#define __NR_setgid 46 +#define __NR_getgid 47 +#define __NR_signal 48 +#define __NR_geteuid 49 +#define __NR_getegid 50 +#define __NR_acct 51 +#define __NR_umount2 52 +#define __NR_lock 53 +#define __NR_ioctl 54 +#define __NR_fcntl 55 +#define __NR_mpx 56 +#define __NR_setpgid 57 +#define __NR_ulimit 58 +#define __NR_oldolduname 59 +#define __NR_umask 60 +#define __NR_chroot 61 +#define __NR_ustat 62 +#define __NR_dup2 63 +#define __NR_getppid 64 +#define __NR_getpgrp 65 +#define __NR_setsid 66 +#define __NR_sigaction 67 +#define __NR_sgetmask 68 +#define __NR_ssetmask 69 +#define __NR_setreuid 70 +#define __NR_setregid 71 +#define __NR_sigsuspend 72 +#define __NR_sigpending 73 +#define __NR_sethostname 74 +#define __NR_setrlimit 75 +#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */ +#define __NR_getrusage 77 +#define __NR_gettimeofday 78 +#define __NR_settimeofday 79 +#define __NR_getgroups 80 +#define __NR_setgroups 81 +#define __NR_select 82 +#define __NR_symlink 83 
+#define __NR_oldlstat 84 +#define __NR_readlink 85 +#define __NR_uselib 86 +#define __NR_swapon 87 +#define __NR_reboot 88 +#define __NR_readdir 89 +#define __NR_mmap 90 +#define __NR_munmap 91 +#define __NR_truncate 92 +#define __NR_ftruncate 93 +#define __NR_fchmod 94 +#define __NR_fchown 95 +#define __NR_getpriority 96 +#define __NR_setpriority 97 +#define __NR_profil 98 +#define __NR_statfs 99 +#define __NR_fstatfs 100 +#define __NR_ioperm 101 +#define __NR_socketcall 102 +#define __NR_syslog 103 +#define __NR_setitimer 104 +#define __NR_getitimer 105 +#define __NR_stat 106 +#define __NR_lstat 107 +#define __NR_fstat 108 +#define __NR_olduname 109 +#define __NR_iopl 110 +#define __NR_vhangup 111 +#define __NR_idle 112 +#define __NR_vm86old 113 +#define __NR_wait4 114 +#define __NR_swapoff 115 +#define __NR_sysinfo 116 +#define __NR_ipc 117 +#define __NR_fsync 118 +#define __NR_sigreturn 119 +#define __NR_clone 120 +#define __NR_setdomainname 121 +#define __NR_uname 122 +#define __NR_modify_ldt 123 +#define __NR_adjtimex 124 +#define __NR_mprotect 125 +#define __NR_sigprocmask 126 +#define __NR_create_module 127 +#define __NR_init_module 128 +#define __NR_delete_module 129 +#define __NR_get_kernel_syms 130 +#define __NR_quotactl 131 +#define __NR_getpgid 132 +#define __NR_fchdir 133 +#define __NR_bdflush 134 +#define __NR_sysfs 135 +#define __NR_personality 136 +#define __NR_afs_syscall 137 /* Syscall for Andrew File System */ +#define __NR_setfsuid 138 +#define __NR_setfsgid 139 +#define __NR__llseek 140 +#define __NR_getdents 141 +#define __NR__newselect 142 +#define __NR_flock 143 +#define __NR_msync 144 +#define __NR_readv 145 +#define __NR_writev 146 +#define __NR_getsid 147 +#define __NR_fdatasync 148 +#define __NR__sysctl 149 +#define __NR_mlock 150 +#define __NR_munlock 151 +#define __NR_mlockall 152 +#define __NR_munlockall 153 +#define __NR_sched_setparam 154 +#define __NR_sched_getparam 155 +#define __NR_sched_setscheduler 156 +#define __NR_sched_getscheduler 157 +#define __NR_sched_yield 158 +#define __NR_sched_get_priority_max 159 +#define __NR_sched_get_priority_min 160 +#define __NR_sched_rr_get_interval 161 +#define __NR_nanosleep 162 +#define __NR_mremap 163 +#define __NR_setresuid 164 +#define __NR_getresuid 165 +#define __NR_vm86 166 +#define __NR_query_module 167 +#define __NR_poll 168 +#define __NR_nfsservctl 169 +#define __NR_setresgid 170 +#define __NR_getresgid 171 +#define __NR_prctl 172 +#define __NR_rt_sigreturn 173 +#define __NR_rt_sigaction 174 +#define __NR_rt_sigprocmask 175 +#define __NR_rt_sigpending 176 +#define __NR_rt_sigtimedwait 177 +#define __NR_rt_sigqueueinfo 178 +#define __NR_rt_sigsuspend 179 +#define __NR_pread64 180 +#define __NR_pwrite64 181 +#define __NR_chown 182 +#define __NR_getcwd 183 +#define __NR_capget 184 +#define __NR_capset 185 +#define __NR_sigaltstack 186 +#define __NR_sendfile 187 +#define __NR_getpmsg 188 /* some people actually want streams */ +#define __NR_putpmsg 189 /* some people actually want streams */ +#define __NR_vfork 190 +#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */ +#define __NR_mmap2 192 +#define __NR_truncate64 193 +#define __NR_ftruncate64 194 +#define __NR_stat64 195 +#define __NR_lstat64 196 +#define __NR_fstat64 197 +#define __NR_lchown32 198 +#define __NR_getuid32 199 +#define __NR_getgid32 200 +#define __NR_geteuid32 201 +#define __NR_getegid32 202 +#define __NR_setreuid32 203 +#define __NR_setregid32 204 +#define __NR_getgroups32 205 +#define __NR_setgroups32 206 +#define __NR_fchown32 207 
+#define __NR_setresuid32 208 +#define __NR_getresuid32 209 +#define __NR_setresgid32 210 +#define __NR_getresgid32 211 +#define __NR_chown32 212 +#define __NR_setuid32 213 +#define __NR_setgid32 214 +#define __NR_setfsuid32 215 +#define __NR_setfsgid32 216 +#define __NR_pivot_root 217 +#define __NR_mincore 218 +#define __NR_madvise 219 +#define __NR_madvise1 219 /* delete when C lib stub is removed */ +#define __NR_getdents64 220 +#define __NR_fcntl64 221 +/* 223 is unused */ +#define __NR_gettid 224 +#define __NR_readahead 225 +#define __NR_setxattr 226 +#define __NR_lsetxattr 227 +#define __NR_fsetxattr 228 +#define __NR_getxattr 229 +#define __NR_lgetxattr 230 +#define __NR_fgetxattr 231 +#define __NR_listxattr 232 +#define __NR_llistxattr 233 +#define __NR_flistxattr 234 +#define __NR_removexattr 235 +#define __NR_lremovexattr 236 +#define __NR_fremovexattr 237 +#define __NR_tkill 238 +#define __NR_sendfile64 239 +#define __NR_futex 240 +#define __NR_sched_setaffinity 241 +#define __NR_sched_getaffinity 242 +#define __NR_set_thread_area 243 +#define __NR_get_thread_area 244 +#define __NR_io_setup 245 +#define __NR_io_destroy 246 +#define __NR_io_getevents 247 +#define __NR_io_submit 248 +#define __NR_io_cancel 249 +#define __NR_fadvise64 250 +/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */ +#define __NR_exit_group 252 +#define __NR_lookup_dcookie 253 +#define __NR_epoll_create 254 +#define __NR_epoll_ctl 255 +#define __NR_epoll_wait 256 +#define __NR_remap_file_pages 257 +#define __NR_set_tid_address 258 +#define __NR_timer_create 259 +#define __NR_timer_settime (__NR_timer_create+1) +#define __NR_timer_gettime (__NR_timer_create+2) +#define __NR_timer_getoverrun (__NR_timer_create+3) +#define __NR_timer_delete (__NR_timer_create+4) +#define __NR_clock_settime (__NR_timer_create+5) +#define __NR_clock_gettime (__NR_timer_create+6) +#define __NR_clock_getres (__NR_timer_create+7) +#define __NR_clock_nanosleep (__NR_timer_create+8) +#define __NR_statfs64 268 +#define __NR_fstatfs64 269 +#define __NR_tgkill 270 +#define __NR_utimes 271 +#define __NR_fadvise64_64 272 +#define __NR_vserver 273 +#define __NR_mbind 274 +#define __NR_get_mempolicy 275 +#define __NR_set_mempolicy 276 +#define __NR_mq_open 277 +#define __NR_mq_unlink (__NR_mq_open+1) +#define __NR_mq_timedsend (__NR_mq_open+2) +#define __NR_mq_timedreceive (__NR_mq_open+3) +#define __NR_mq_notify (__NR_mq_open+4) +#define __NR_mq_getsetattr (__NR_mq_open+5) +#define __NR_kexec_load 283 +#define __NR_waitid 284 +/* #define __NR_sys_setaltroot 285 */ +#define __NR_add_key 286 +#define __NR_request_key 287 +#define __NR_keyctl 288 +#define __NR_ioprio_set 289 +#define __NR_ioprio_get 290 +#define __NR_inotify_init 291 +#define __NR_inotify_add_watch 292 +#define __NR_inotify_rm_watch 293 +#define __NR_migrate_pages 294 +#define __NR_openat 295 +#define __NR_mkdirat 296 +#define __NR_mknodat 297 +#define __NR_fchownat 298 +#define __NR_futimesat 299 +#define __NR_fstatat64 300 +#define __NR_unlinkat 301 +#define __NR_renameat 302 +#define __NR_linkat 303 +#define __NR_symlinkat 304 +#define __NR_readlinkat 305 +#define __NR_fchmodat 306 +#define __NR_faccessat 307 +#define __NR_pselect6 308 +#define __NR_ppoll 309 +#define __NR_unshare 310 +#define __NR_set_robust_list 311 +#define __NR_get_robust_list 312 +#define __NR_splice 313 +#define __NR_sync_file_range 314 +#define __NR_tee 315 +#define __NR_vmsplice 316 +#define __NR_move_pages 317 +#define __NR_getcpu 318 +#define __NR_epoll_pwait 319 +#define 
__NR_utimensat 320 +#define __NR_signalfd 321 +#define __NR_timerfd_create 322 +#define __NR_eventfd 323 +#define __NR_fallocate 324 +#define __NR_timerfd_settime 325 +#define __NR_timerfd_gettime 326 +#define __NR_signalfd4 327 +#define __NR_eventfd2 328 +#define __NR_epoll_create1 329 +#define __NR_dup3 330 +#define __NR_pipe2 331 +#define __NR_inotify_init1 332 + +#ifdef __KERNEL__ + +#define __ARCH_WANT_IPC_PARSE_VERSION +#define __ARCH_WANT_OLD_READDIR +#define __ARCH_WANT_OLD_STAT +#define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SYS_ALARM +#define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_PAUSE +#define __ARCH_WANT_SYS_SGETMASK +#define __ARCH_WANT_SYS_SIGNAL +#define __ARCH_WANT_SYS_TIME +#define __ARCH_WANT_SYS_UTIME +#define __ARCH_WANT_SYS_WAITPID +#define __ARCH_WANT_SYS_SOCKETCALL +#define __ARCH_WANT_SYS_FADVISE64 +#define __ARCH_WANT_SYS_GETPGRP +#define __ARCH_WANT_SYS_LLSEEK +#define __ARCH_WANT_SYS_NICE +#define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLDUMOUNT +#define __ARCH_WANT_SYS_SIGPENDING +#define __ARCH_WANT_SYS_SIGPROCMASK +#define __ARCH_WANT_SYS_RT_SIGACTION +#define __ARCH_WANT_SYS_RT_SIGSUSPEND + +/* + * "Conditional" syscalls + * + * What we want is __attribute__((weak,alias("sys_ni_syscall"))), + * but it doesn't work on all toolchains, so we just do it by hand + */ +#ifndef cond_syscall +#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") +#endif + +#endif /* __KERNEL__ */ +#endif /* _ASM_X86_UNISTD_32_H */ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h new file mode 100644 index 000000000000..834b2c1d89fb --- /dev/null +++ b/arch/x86/include/asm/unistd_64.h @@ -0,0 +1,693 @@ +#ifndef _ASM_X86_UNISTD_64_H +#define _ASM_X86_UNISTD_64_H + +#ifndef __SYSCALL +#define __SYSCALL(a, b) +#endif + +/* + * This file contains the system call numbers. + * + * Note: holes are not allowed. 
+ */ + +/* at least 8 syscall per cacheline */ +#define __NR_read 0 +__SYSCALL(__NR_read, sys_read) +#define __NR_write 1 +__SYSCALL(__NR_write, sys_write) +#define __NR_open 2 +__SYSCALL(__NR_open, sys_open) +#define __NR_close 3 +__SYSCALL(__NR_close, sys_close) +#define __NR_stat 4 +__SYSCALL(__NR_stat, sys_newstat) +#define __NR_fstat 5 +__SYSCALL(__NR_fstat, sys_newfstat) +#define __NR_lstat 6 +__SYSCALL(__NR_lstat, sys_newlstat) +#define __NR_poll 7 +__SYSCALL(__NR_poll, sys_poll) + +#define __NR_lseek 8 +__SYSCALL(__NR_lseek, sys_lseek) +#define __NR_mmap 9 +__SYSCALL(__NR_mmap, sys_mmap) +#define __NR_mprotect 10 +__SYSCALL(__NR_mprotect, sys_mprotect) +#define __NR_munmap 11 +__SYSCALL(__NR_munmap, sys_munmap) +#define __NR_brk 12 +__SYSCALL(__NR_brk, sys_brk) +#define __NR_rt_sigaction 13 +__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction) +#define __NR_rt_sigprocmask 14 +__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask) +#define __NR_rt_sigreturn 15 +__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn) + +#define __NR_ioctl 16 +__SYSCALL(__NR_ioctl, sys_ioctl) +#define __NR_pread64 17 +__SYSCALL(__NR_pread64, sys_pread64) +#define __NR_pwrite64 18 +__SYSCALL(__NR_pwrite64, sys_pwrite64) +#define __NR_readv 19 +__SYSCALL(__NR_readv, sys_readv) +#define __NR_writev 20 +__SYSCALL(__NR_writev, sys_writev) +#define __NR_access 21 +__SYSCALL(__NR_access, sys_access) +#define __NR_pipe 22 +__SYSCALL(__NR_pipe, sys_pipe) +#define __NR_select 23 +__SYSCALL(__NR_select, sys_select) + +#define __NR_sched_yield 24 +__SYSCALL(__NR_sched_yield, sys_sched_yield) +#define __NR_mremap 25 +__SYSCALL(__NR_mremap, sys_mremap) +#define __NR_msync 26 +__SYSCALL(__NR_msync, sys_msync) +#define __NR_mincore 27 +__SYSCALL(__NR_mincore, sys_mincore) +#define __NR_madvise 28 +__SYSCALL(__NR_madvise, sys_madvise) +#define __NR_shmget 29 +__SYSCALL(__NR_shmget, sys_shmget) +#define __NR_shmat 30 +__SYSCALL(__NR_shmat, sys_shmat) +#define __NR_shmctl 31 +__SYSCALL(__NR_shmctl, sys_shmctl) + +#define __NR_dup 32 +__SYSCALL(__NR_dup, sys_dup) +#define __NR_dup2 33 +__SYSCALL(__NR_dup2, sys_dup2) +#define __NR_pause 34 +__SYSCALL(__NR_pause, sys_pause) +#define __NR_nanosleep 35 +__SYSCALL(__NR_nanosleep, sys_nanosleep) +#define __NR_getitimer 36 +__SYSCALL(__NR_getitimer, sys_getitimer) +#define __NR_alarm 37 +__SYSCALL(__NR_alarm, sys_alarm) +#define __NR_setitimer 38 +__SYSCALL(__NR_setitimer, sys_setitimer) +#define __NR_getpid 39 +__SYSCALL(__NR_getpid, sys_getpid) + +#define __NR_sendfile 40 +__SYSCALL(__NR_sendfile, sys_sendfile64) +#define __NR_socket 41 +__SYSCALL(__NR_socket, sys_socket) +#define __NR_connect 42 +__SYSCALL(__NR_connect, sys_connect) +#define __NR_accept 43 +__SYSCALL(__NR_accept, sys_accept) +#define __NR_sendto 44 +__SYSCALL(__NR_sendto, sys_sendto) +#define __NR_recvfrom 45 +__SYSCALL(__NR_recvfrom, sys_recvfrom) +#define __NR_sendmsg 46 +__SYSCALL(__NR_sendmsg, sys_sendmsg) +#define __NR_recvmsg 47 +__SYSCALL(__NR_recvmsg, sys_recvmsg) + +#define __NR_shutdown 48 +__SYSCALL(__NR_shutdown, sys_shutdown) +#define __NR_bind 49 +__SYSCALL(__NR_bind, sys_bind) +#define __NR_listen 50 +__SYSCALL(__NR_listen, sys_listen) +#define __NR_getsockname 51 +__SYSCALL(__NR_getsockname, sys_getsockname) +#define __NR_getpeername 52 +__SYSCALL(__NR_getpeername, sys_getpeername) +#define __NR_socketpair 53 +__SYSCALL(__NR_socketpair, sys_socketpair) +#define __NR_setsockopt 54 +__SYSCALL(__NR_setsockopt, sys_setsockopt) +#define __NR_getsockopt 55 +__SYSCALL(__NR_getsockopt, sys_getsockopt) + +#define 
__NR_clone 56 +__SYSCALL(__NR_clone, stub_clone) +#define __NR_fork 57 +__SYSCALL(__NR_fork, stub_fork) +#define __NR_vfork 58 +__SYSCALL(__NR_vfork, stub_vfork) +#define __NR_execve 59 +__SYSCALL(__NR_execve, stub_execve) +#define __NR_exit 60 +__SYSCALL(__NR_exit, sys_exit) +#define __NR_wait4 61 +__SYSCALL(__NR_wait4, sys_wait4) +#define __NR_kill 62 +__SYSCALL(__NR_kill, sys_kill) +#define __NR_uname 63 +__SYSCALL(__NR_uname, sys_uname) + +#define __NR_semget 64 +__SYSCALL(__NR_semget, sys_semget) +#define __NR_semop 65 +__SYSCALL(__NR_semop, sys_semop) +#define __NR_semctl 66 +__SYSCALL(__NR_semctl, sys_semctl) +#define __NR_shmdt 67 +__SYSCALL(__NR_shmdt, sys_shmdt) +#define __NR_msgget 68 +__SYSCALL(__NR_msgget, sys_msgget) +#define __NR_msgsnd 69 +__SYSCALL(__NR_msgsnd, sys_msgsnd) +#define __NR_msgrcv 70 +__SYSCALL(__NR_msgrcv, sys_msgrcv) +#define __NR_msgctl 71 +__SYSCALL(__NR_msgctl, sys_msgctl) + +#define __NR_fcntl 72 +__SYSCALL(__NR_fcntl, sys_fcntl) +#define __NR_flock 73 +__SYSCALL(__NR_flock, sys_flock) +#define __NR_fsync 74 +__SYSCALL(__NR_fsync, sys_fsync) +#define __NR_fdatasync 75 +__SYSCALL(__NR_fdatasync, sys_fdatasync) +#define __NR_truncate 76 +__SYSCALL(__NR_truncate, sys_truncate) +#define __NR_ftruncate 77 +__SYSCALL(__NR_ftruncate, sys_ftruncate) +#define __NR_getdents 78 +__SYSCALL(__NR_getdents, sys_getdents) +#define __NR_getcwd 79 +__SYSCALL(__NR_getcwd, sys_getcwd) + +#define __NR_chdir 80 +__SYSCALL(__NR_chdir, sys_chdir) +#define __NR_fchdir 81 +__SYSCALL(__NR_fchdir, sys_fchdir) +#define __NR_rename 82 +__SYSCALL(__NR_rename, sys_rename) +#define __NR_mkdir 83 +__SYSCALL(__NR_mkdir, sys_mkdir) +#define __NR_rmdir 84 +__SYSCALL(__NR_rmdir, sys_rmdir) +#define __NR_creat 85 +__SYSCALL(__NR_creat, sys_creat) +#define __NR_link 86 +__SYSCALL(__NR_link, sys_link) +#define __NR_unlink 87 +__SYSCALL(__NR_unlink, sys_unlink) + +#define __NR_symlink 88 +__SYSCALL(__NR_symlink, sys_symlink) +#define __NR_readlink 89 +__SYSCALL(__NR_readlink, sys_readlink) +#define __NR_chmod 90 +__SYSCALL(__NR_chmod, sys_chmod) +#define __NR_fchmod 91 +__SYSCALL(__NR_fchmod, sys_fchmod) +#define __NR_chown 92 +__SYSCALL(__NR_chown, sys_chown) +#define __NR_fchown 93 +__SYSCALL(__NR_fchown, sys_fchown) +#define __NR_lchown 94 +__SYSCALL(__NR_lchown, sys_lchown) +#define __NR_umask 95 +__SYSCALL(__NR_umask, sys_umask) + +#define __NR_gettimeofday 96 +__SYSCALL(__NR_gettimeofday, sys_gettimeofday) +#define __NR_getrlimit 97 +__SYSCALL(__NR_getrlimit, sys_getrlimit) +#define __NR_getrusage 98 +__SYSCALL(__NR_getrusage, sys_getrusage) +#define __NR_sysinfo 99 +__SYSCALL(__NR_sysinfo, sys_sysinfo) +#define __NR_times 100 +__SYSCALL(__NR_times, sys_times) +#define __NR_ptrace 101 +__SYSCALL(__NR_ptrace, sys_ptrace) +#define __NR_getuid 102 +__SYSCALL(__NR_getuid, sys_getuid) +#define __NR_syslog 103 +__SYSCALL(__NR_syslog, sys_syslog) + +/* at the very end the stuff that never runs during the benchmarks */ +#define __NR_getgid 104 +__SYSCALL(__NR_getgid, sys_getgid) +#define __NR_setuid 105 +__SYSCALL(__NR_setuid, sys_setuid) +#define __NR_setgid 106 +__SYSCALL(__NR_setgid, sys_setgid) +#define __NR_geteuid 107 +__SYSCALL(__NR_geteuid, sys_geteuid) +#define __NR_getegid 108 +__SYSCALL(__NR_getegid, sys_getegid) +#define __NR_setpgid 109 +__SYSCALL(__NR_setpgid, sys_setpgid) +#define __NR_getppid 110 +__SYSCALL(__NR_getppid, sys_getppid) +#define __NR_getpgrp 111 +__SYSCALL(__NR_getpgrp, sys_getpgrp) + +#define __NR_setsid 112 +__SYSCALL(__NR_setsid, sys_setsid) +#define __NR_setreuid 
113 +__SYSCALL(__NR_setreuid, sys_setreuid) +#define __NR_setregid 114 +__SYSCALL(__NR_setregid, sys_setregid) +#define __NR_getgroups 115 +__SYSCALL(__NR_getgroups, sys_getgroups) +#define __NR_setgroups 116 +__SYSCALL(__NR_setgroups, sys_setgroups) +#define __NR_setresuid 117 +__SYSCALL(__NR_setresuid, sys_setresuid) +#define __NR_getresuid 118 +__SYSCALL(__NR_getresuid, sys_getresuid) +#define __NR_setresgid 119 +__SYSCALL(__NR_setresgid, sys_setresgid) + +#define __NR_getresgid 120 +__SYSCALL(__NR_getresgid, sys_getresgid) +#define __NR_getpgid 121 +__SYSCALL(__NR_getpgid, sys_getpgid) +#define __NR_setfsuid 122 +__SYSCALL(__NR_setfsuid, sys_setfsuid) +#define __NR_setfsgid 123 +__SYSCALL(__NR_setfsgid, sys_setfsgid) +#define __NR_getsid 124 +__SYSCALL(__NR_getsid, sys_getsid) +#define __NR_capget 125 +__SYSCALL(__NR_capget, sys_capget) +#define __NR_capset 126 +__SYSCALL(__NR_capset, sys_capset) + +#define __NR_rt_sigpending 127 +__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending) +#define __NR_rt_sigtimedwait 128 +__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait) +#define __NR_rt_sigqueueinfo 129 +__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo) +#define __NR_rt_sigsuspend 130 +__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend) +#define __NR_sigaltstack 131 +__SYSCALL(__NR_sigaltstack, stub_sigaltstack) +#define __NR_utime 132 +__SYSCALL(__NR_utime, sys_utime) +#define __NR_mknod 133 +__SYSCALL(__NR_mknod, sys_mknod) + +/* Only needed for a.out */ +#define __NR_uselib 134 +__SYSCALL(__NR_uselib, sys_ni_syscall) +#define __NR_personality 135 +__SYSCALL(__NR_personality, sys_personality) + +#define __NR_ustat 136 +__SYSCALL(__NR_ustat, sys_ustat) +#define __NR_statfs 137 +__SYSCALL(__NR_statfs, sys_statfs) +#define __NR_fstatfs 138 +__SYSCALL(__NR_fstatfs, sys_fstatfs) +#define __NR_sysfs 139 +__SYSCALL(__NR_sysfs, sys_sysfs) + +#define __NR_getpriority 140 +__SYSCALL(__NR_getpriority, sys_getpriority) +#define __NR_setpriority 141 +__SYSCALL(__NR_setpriority, sys_setpriority) +#define __NR_sched_setparam 142 +__SYSCALL(__NR_sched_setparam, sys_sched_setparam) +#define __NR_sched_getparam 143 +__SYSCALL(__NR_sched_getparam, sys_sched_getparam) +#define __NR_sched_setscheduler 144 +__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler) +#define __NR_sched_getscheduler 145 +__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) +#define __NR_sched_get_priority_max 146 +__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) +#define __NR_sched_get_priority_min 147 +__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) +#define __NR_sched_rr_get_interval 148 +__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval) + +#define __NR_mlock 149 +__SYSCALL(__NR_mlock, sys_mlock) +#define __NR_munlock 150 +__SYSCALL(__NR_munlock, sys_munlock) +#define __NR_mlockall 151 +__SYSCALL(__NR_mlockall, sys_mlockall) +#define __NR_munlockall 152 +__SYSCALL(__NR_munlockall, sys_munlockall) + +#define __NR_vhangup 153 +__SYSCALL(__NR_vhangup, sys_vhangup) + +#define __NR_modify_ldt 154 +__SYSCALL(__NR_modify_ldt, sys_modify_ldt) + +#define __NR_pivot_root 155 +__SYSCALL(__NR_pivot_root, sys_pivot_root) + +#define __NR__sysctl 156 +__SYSCALL(__NR__sysctl, sys_sysctl) + +#define __NR_prctl 157 +__SYSCALL(__NR_prctl, sys_prctl) +#define __NR_arch_prctl 158 +__SYSCALL(__NR_arch_prctl, sys_arch_prctl) + +#define __NR_adjtimex 159 +__SYSCALL(__NR_adjtimex, sys_adjtimex) + +#define __NR_setrlimit 160 +__SYSCALL(__NR_setrlimit, sys_setrlimit) + +#define __NR_chroot 161 
+__SYSCALL(__NR_chroot, sys_chroot) + +#define __NR_sync 162 +__SYSCALL(__NR_sync, sys_sync) + +#define __NR_acct 163 +__SYSCALL(__NR_acct, sys_acct) + +#define __NR_settimeofday 164 +__SYSCALL(__NR_settimeofday, sys_settimeofday) + +#define __NR_mount 165 +__SYSCALL(__NR_mount, sys_mount) +#define __NR_umount2 166 +__SYSCALL(__NR_umount2, sys_umount) + +#define __NR_swapon 167 +__SYSCALL(__NR_swapon, sys_swapon) +#define __NR_swapoff 168 +__SYSCALL(__NR_swapoff, sys_swapoff) + +#define __NR_reboot 169 +__SYSCALL(__NR_reboot, sys_reboot) + +#define __NR_sethostname 170 +__SYSCALL(__NR_sethostname, sys_sethostname) +#define __NR_setdomainname 171 +__SYSCALL(__NR_setdomainname, sys_setdomainname) + +#define __NR_iopl 172 +__SYSCALL(__NR_iopl, stub_iopl) +#define __NR_ioperm 173 +__SYSCALL(__NR_ioperm, sys_ioperm) + +#define __NR_create_module 174 +__SYSCALL(__NR_create_module, sys_ni_syscall) +#define __NR_init_module 175 +__SYSCALL(__NR_init_module, sys_init_module) +#define __NR_delete_module 176 +__SYSCALL(__NR_delete_module, sys_delete_module) +#define __NR_get_kernel_syms 177 +__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall) +#define __NR_query_module 178 +__SYSCALL(__NR_query_module, sys_ni_syscall) + +#define __NR_quotactl 179 +__SYSCALL(__NR_quotactl, sys_quotactl) + +#define __NR_nfsservctl 180 +__SYSCALL(__NR_nfsservctl, sys_nfsservctl) + +/* reserved for LiS/STREAMS */ +#define __NR_getpmsg 181 +__SYSCALL(__NR_getpmsg, sys_ni_syscall) +#define __NR_putpmsg 182 +__SYSCALL(__NR_putpmsg, sys_ni_syscall) + +/* reserved for AFS */ +#define __NR_afs_syscall 183 +__SYSCALL(__NR_afs_syscall, sys_ni_syscall) + +/* reserved for tux */ +#define __NR_tuxcall 184 +__SYSCALL(__NR_tuxcall, sys_ni_syscall) + +#define __NR_security 185 +__SYSCALL(__NR_security, sys_ni_syscall) + +#define __NR_gettid 186 +__SYSCALL(__NR_gettid, sys_gettid) + +#define __NR_readahead 187 +__SYSCALL(__NR_readahead, sys_readahead) +#define __NR_setxattr 188 +__SYSCALL(__NR_setxattr, sys_setxattr) +#define __NR_lsetxattr 189 +__SYSCALL(__NR_lsetxattr, sys_lsetxattr) +#define __NR_fsetxattr 190 +__SYSCALL(__NR_fsetxattr, sys_fsetxattr) +#define __NR_getxattr 191 +__SYSCALL(__NR_getxattr, sys_getxattr) +#define __NR_lgetxattr 192 +__SYSCALL(__NR_lgetxattr, sys_lgetxattr) +#define __NR_fgetxattr 193 +__SYSCALL(__NR_fgetxattr, sys_fgetxattr) +#define __NR_listxattr 194 +__SYSCALL(__NR_listxattr, sys_listxattr) +#define __NR_llistxattr 195 +__SYSCALL(__NR_llistxattr, sys_llistxattr) +#define __NR_flistxattr 196 +__SYSCALL(__NR_flistxattr, sys_flistxattr) +#define __NR_removexattr 197 +__SYSCALL(__NR_removexattr, sys_removexattr) +#define __NR_lremovexattr 198 +__SYSCALL(__NR_lremovexattr, sys_lremovexattr) +#define __NR_fremovexattr 199 +__SYSCALL(__NR_fremovexattr, sys_fremovexattr) +#define __NR_tkill 200 +__SYSCALL(__NR_tkill, sys_tkill) +#define __NR_time 201 +__SYSCALL(__NR_time, sys_time) +#define __NR_futex 202 +__SYSCALL(__NR_futex, sys_futex) +#define __NR_sched_setaffinity 203 +__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity) +#define __NR_sched_getaffinity 204 +__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity) +#define __NR_set_thread_area 205 +__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */ +#define __NR_io_setup 206 +__SYSCALL(__NR_io_setup, sys_io_setup) +#define __NR_io_destroy 207 +__SYSCALL(__NR_io_destroy, sys_io_destroy) +#define __NR_io_getevents 208 +__SYSCALL(__NR_io_getevents, sys_io_getevents) +#define __NR_io_submit 209 +__SYSCALL(__NR_io_submit, sys_io_submit) 
+#define __NR_io_cancel 210 +__SYSCALL(__NR_io_cancel, sys_io_cancel) +#define __NR_get_thread_area 211 +__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */ +#define __NR_lookup_dcookie 212 +__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie) +#define __NR_epoll_create 213 +__SYSCALL(__NR_epoll_create, sys_epoll_create) +#define __NR_epoll_ctl_old 214 +__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall) +#define __NR_epoll_wait_old 215 +__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall) +#define __NR_remap_file_pages 216 +__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) +#define __NR_getdents64 217 +__SYSCALL(__NR_getdents64, sys_getdents64) +#define __NR_set_tid_address 218 +__SYSCALL(__NR_set_tid_address, sys_set_tid_address) +#define __NR_restart_syscall 219 +__SYSCALL(__NR_restart_syscall, sys_restart_syscall) +#define __NR_semtimedop 220 +__SYSCALL(__NR_semtimedop, sys_semtimedop) +#define __NR_fadvise64 221 +__SYSCALL(__NR_fadvise64, sys_fadvise64) +#define __NR_timer_create 222 +__SYSCALL(__NR_timer_create, sys_timer_create) +#define __NR_timer_settime 223 +__SYSCALL(__NR_timer_settime, sys_timer_settime) +#define __NR_timer_gettime 224 +__SYSCALL(__NR_timer_gettime, sys_timer_gettime) +#define __NR_timer_getoverrun 225 +__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) +#define __NR_timer_delete 226 +__SYSCALL(__NR_timer_delete, sys_timer_delete) +#define __NR_clock_settime 227 +__SYSCALL(__NR_clock_settime, sys_clock_settime) +#define __NR_clock_gettime 228 +__SYSCALL(__NR_clock_gettime, sys_clock_gettime) +#define __NR_clock_getres 229 +__SYSCALL(__NR_clock_getres, sys_clock_getres) +#define __NR_clock_nanosleep 230 +__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep) +#define __NR_exit_group 231 +__SYSCALL(__NR_exit_group, sys_exit_group) +#define __NR_epoll_wait 232 +__SYSCALL(__NR_epoll_wait, sys_epoll_wait) +#define __NR_epoll_ctl 233 +__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) +#define __NR_tgkill 234 +__SYSCALL(__NR_tgkill, sys_tgkill) +#define __NR_utimes 235 +__SYSCALL(__NR_utimes, sys_utimes) +#define __NR_vserver 236 +__SYSCALL(__NR_vserver, sys_ni_syscall) +#define __NR_mbind 237 +__SYSCALL(__NR_mbind, sys_mbind) +#define __NR_set_mempolicy 238 +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) +#define __NR_get_mempolicy 239 +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) +#define __NR_mq_open 240 +__SYSCALL(__NR_mq_open, sys_mq_open) +#define __NR_mq_unlink 241 +__SYSCALL(__NR_mq_unlink, sys_mq_unlink) +#define __NR_mq_timedsend 242 +__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) +#define __NR_mq_timedreceive 243 +__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) +#define __NR_mq_notify 244 +__SYSCALL(__NR_mq_notify, sys_mq_notify) +#define __NR_mq_getsetattr 245 +__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) +#define __NR_kexec_load 246 +__SYSCALL(__NR_kexec_load, sys_kexec_load) +#define __NR_waitid 247 +__SYSCALL(__NR_waitid, sys_waitid) +#define __NR_add_key 248 +__SYSCALL(__NR_add_key, sys_add_key) +#define __NR_request_key 249 +__SYSCALL(__NR_request_key, sys_request_key) +#define __NR_keyctl 250 +__SYSCALL(__NR_keyctl, sys_keyctl) +#define __NR_ioprio_set 251 +__SYSCALL(__NR_ioprio_set, sys_ioprio_set) +#define __NR_ioprio_get 252 +__SYSCALL(__NR_ioprio_get, sys_ioprio_get) +#define __NR_inotify_init 253 +__SYSCALL(__NR_inotify_init, sys_inotify_init) +#define __NR_inotify_add_watch 254 +__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) +#define __NR_inotify_rm_watch 255 +__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) 
+#define __NR_migrate_pages 256 +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) +#define __NR_openat 257 +__SYSCALL(__NR_openat, sys_openat) +#define __NR_mkdirat 258 +__SYSCALL(__NR_mkdirat, sys_mkdirat) +#define __NR_mknodat 259 +__SYSCALL(__NR_mknodat, sys_mknodat) +#define __NR_fchownat 260 +__SYSCALL(__NR_fchownat, sys_fchownat) +#define __NR_futimesat 261 +__SYSCALL(__NR_futimesat, sys_futimesat) +#define __NR_newfstatat 262 +__SYSCALL(__NR_newfstatat, sys_newfstatat) +#define __NR_unlinkat 263 +__SYSCALL(__NR_unlinkat, sys_unlinkat) +#define __NR_renameat 264 +__SYSCALL(__NR_renameat, sys_renameat) +#define __NR_linkat 265 +__SYSCALL(__NR_linkat, sys_linkat) +#define __NR_symlinkat 266 +__SYSCALL(__NR_symlinkat, sys_symlinkat) +#define __NR_readlinkat 267 +__SYSCALL(__NR_readlinkat, sys_readlinkat) +#define __NR_fchmodat 268 +__SYSCALL(__NR_fchmodat, sys_fchmodat) +#define __NR_faccessat 269 +__SYSCALL(__NR_faccessat, sys_faccessat) +#define __NR_pselect6 270 +__SYSCALL(__NR_pselect6, sys_pselect6) +#define __NR_ppoll 271 +__SYSCALL(__NR_ppoll, sys_ppoll) +#define __NR_unshare 272 +__SYSCALL(__NR_unshare, sys_unshare) +#define __NR_set_robust_list 273 +__SYSCALL(__NR_set_robust_list, sys_set_robust_list) +#define __NR_get_robust_list 274 +__SYSCALL(__NR_get_robust_list, sys_get_robust_list) +#define __NR_splice 275 +__SYSCALL(__NR_splice, sys_splice) +#define __NR_tee 276 +__SYSCALL(__NR_tee, sys_tee) +#define __NR_sync_file_range 277 +__SYSCALL(__NR_sync_file_range, sys_sync_file_range) +#define __NR_vmsplice 278 +__SYSCALL(__NR_vmsplice, sys_vmsplice) +#define __NR_move_pages 279 +__SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_utimensat 280 +__SYSCALL(__NR_utimensat, sys_utimensat) +#define __IGNORE_getcpu /* implemented as a vsyscall */ +#define __NR_epoll_pwait 281 +__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) +#define __NR_signalfd 282 +__SYSCALL(__NR_signalfd, sys_signalfd) +#define __NR_timerfd_create 283 +__SYSCALL(__NR_timerfd_create, sys_timerfd_create) +#define __NR_eventfd 284 +__SYSCALL(__NR_eventfd, sys_eventfd) +#define __NR_fallocate 285 +__SYSCALL(__NR_fallocate, sys_fallocate) +#define __NR_timerfd_settime 286 +__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) +#define __NR_timerfd_gettime 287 +__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) +#define __NR_paccept 288 +__SYSCALL(__NR_paccept, sys_paccept) +#define __NR_signalfd4 289 +__SYSCALL(__NR_signalfd4, sys_signalfd4) +#define __NR_eventfd2 290 +__SYSCALL(__NR_eventfd2, sys_eventfd2) +#define __NR_epoll_create1 291 +__SYSCALL(__NR_epoll_create1, sys_epoll_create1) +#define __NR_dup3 292 +__SYSCALL(__NR_dup3, sys_dup3) +#define __NR_pipe2 293 +__SYSCALL(__NR_pipe2, sys_pipe2) +#define __NR_inotify_init1 294 +__SYSCALL(__NR_inotify_init1, sys_inotify_init1) + + +#ifndef __NO_STUBS +#define __ARCH_WANT_OLD_READDIR +#define __ARCH_WANT_OLD_STAT +#define __ARCH_WANT_SYS_ALARM +#define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_PAUSE +#define __ARCH_WANT_SYS_SGETMASK +#define __ARCH_WANT_SYS_SIGNAL +#define __ARCH_WANT_SYS_UTIME +#define __ARCH_WANT_SYS_WAITPID +#define __ARCH_WANT_SYS_SOCKETCALL +#define __ARCH_WANT_SYS_FADVISE64 +#define __ARCH_WANT_SYS_GETPGRP +#define __ARCH_WANT_SYS_LLSEEK +#define __ARCH_WANT_SYS_NICE +#define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLDUMOUNT +#define __ARCH_WANT_SYS_SIGPENDING +#define __ARCH_WANT_SYS_SIGPROCMASK +#define __ARCH_WANT_SYS_RT_SIGACTION +#define __ARCH_WANT_SYS_RT_SIGSUSPEND +#define __ARCH_WANT_SYS_TIME +#define 
__ARCH_WANT_COMPAT_SYS_TIME +#endif /* __NO_STUBS */ + +#ifdef __KERNEL__ +/* + * "Conditional" syscalls + * + * What we want is __attribute__((weak,alias("sys_ni_syscall"))), + * but it doesn't work on all toolchains, so we just do it by hand + */ +#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_UNISTD_64_H */ diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h new file mode 100644 index 000000000000..8b064bd9c553 --- /dev/null +++ b/arch/x86/include/asm/unwind.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_UNWIND_H +#define _ASM_X86_UNWIND_H + +#define UNW_PC(frame) ((void)(frame), 0UL) +#define UNW_SP(frame) ((void)(frame), 0UL) +#define UNW_FP(frame) ((void)(frame), 0UL) + +static inline int arch_unw_user_mode(const void *info) +{ + return 0; +} + +#endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h new file mode 100644 index 000000000000..999873b22e7f --- /dev/null +++ b/arch/x86/include/asm/user.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "user_32.h" +#else +# include "user_64.h" +#endif diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h new file mode 100644 index 000000000000..14cbb73ebcba --- /dev/null +++ b/arch/x86/include/asm/user32.h @@ -0,0 +1,70 @@ +#ifndef _ASM_X86_USER32_H +#define _ASM_X86_USER32_H + +/* IA32 compatible user structures for ptrace. + * These should be used for 32bit coredumps too. */ + +struct user_i387_ia32_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ +}; + +/* FSAVE frame with extensions */ struct user32_fxsr_struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; /* not compatible to 64bit twd */ + unsigned short fop; + int fip; + int fcs; + int foo; + int fos; + int mxcsr; + int reserved; + int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + int xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ + int padding[56]; +}; + +struct user_regs_struct32 { + __u32 ebx, ecx, edx, esi, edi, ebp, eax; + unsigned short ds, __ds, es, __es; + unsigned short fs, __fs, gs, __gs; + __u32 orig_eax, eip; + unsigned short cs, __cs; + __u32 eflags, esp; + unsigned short ss, __ss; +}; + +struct user32 { + struct user_regs_struct32 regs; /* Where the registers are actually stored */ + int u_fpvalid; /* True if math co-processor being used. */ + /* for this mess. Not yet used. */ + struct user_i387_ia32_struct i387; /* Math Co-processor registers. */ +/* The rest of this junk is to help gdb figure out what goes where */ + __u32 u_tsize; /* Text segment size (pages). */ + __u32 u_dsize; /* Data segment size (pages). */ + __u32 u_ssize; /* Stack segment size (pages). */ + __u32 start_code; /* Starting virtual address of text. */ + __u32 start_stack; /* Starting virtual address of stack area. + This is actually the bottom of the stack, + the top of the stack is always found in the + esp register. */ + __u32 signal; /* Signal that caused the core dump. */ + int reserved; /* No longer used */ + __u32 u_ar0; /* Used by gdb to help find the values for */ + /* the registers. */ + __u32 u_fpstate; /* Math Co-processor pointer. 
*/ + __u32 magic; /* To uniquely identify a core file */ + char u_comm[32]; /* User command that was responsible */ + int u_debugreg[8]; +}; + + +#endif /* _ASM_X86_USER32_H */ diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h new file mode 100644 index 000000000000..bebfd8644016 --- /dev/null +++ b/arch/x86/include/asm/user_32.h @@ -0,0 +1,131 @@ +#ifndef _ASM_X86_USER_32_H +#define _ASM_X86_USER_32_H + +#include <asm/page.h> +/* Core file format: The core file is written in such a way that gdb + can understand it and provide useful information to the user (under + linux we use the 'trad-core' bfd). There are quite a number of + obstacles to being able to view the contents of the floating point + registers, and until these are solved you will not be able to view the + contents of them. Actually, you can read in the core file and look at + the contents of the user struct to find out what the floating point + registers contain. + The actual file contents are as follows: + UPAGE: 1 page consisting of a user struct that tells gdb what is present + in the file. Directly after this is a copy of the task_struct, which + is currently not used by gdb, but it may come in useful at some point. + All of the registers are stored as part of the upage. The upage should + always be only one page. + DATA: The data area is stored. We use current->end_text to + current->brk to pick up all of the user variables, plus any memory + that may have been malloced. No attempt is made to determine if a page + is demand-zero or if a page is totally unused, we just cover the entire + range. All of the addresses are rounded in such a way that an integral + number of pages is written. + STACK: We need the stack information in order to get a meaningful + backtrace. We need to write the data from (esp) to + current->start_stack, so we round each of these off in order to be able + to write an integer number of pages. + The minimum core file size is 3 pages, or 12288 bytes. +*/ + +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for + * interacting with the FXSR-format floating point environment. Floating + * point data can be accessed in the regular format in the usual manner, + * and both the standard and SIMD floating point data can be accessed via + * the new ptrace requests. In either case, changes to the FPU environment + * will be reflected in the task's state as expected. 
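From user space, the FXSR image described here is fetched with the PTRACE_GETFPXREGS request mentioned above. A minimal sketch (glibc exposes the same layout under the name struct user_fpxregs_struct; the tracee is assumed to be already attached and stopped):

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static void dump_mxcsr(pid_t pid)
{
	struct user_fpxregs_struct fpx;

	/* one call copies the whole FXSR frame out of the tracee */
	if (ptrace(PTRACE_GETFPXREGS, pid, 0, &fpx) == 0)
		printf("mxcsr: %#lx\n", (unsigned long)fpx.mxcsr);
}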
+ */ + +struct user_i387_struct { + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ +}; + +struct user_fxsr_struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; + unsigned short fop; + long fip; + long fcs; + long foo; + long fos; + long mxcsr; + long reserved; + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ + long padding[56]; +}; + +/* + * This is the old layout of "struct pt_regs", and + * is still the layout used by user mode (the new + * pt_regs doesn't have all registers as the kernel + * doesn't use the extra segment registers) + */ +struct user_regs_struct { + unsigned long bx; + unsigned long cx; + unsigned long dx; + unsigned long si; + unsigned long di; + unsigned long bp; + unsigned long ax; + unsigned long ds; + unsigned long es; + unsigned long fs; + unsigned long gs; + unsigned long orig_ax; + unsigned long ip; + unsigned long cs; + unsigned long flags; + unsigned long sp; + unsigned long ss; +}; + +/* When the kernel dumps core, it starts by dumping the user struct - + this will be used by gdb to figure out where the data and stack segments + are within the file, and what virtual addresses to use. */ +struct user{ +/* We start with the registers, to mimic the way that "memory" is returned + from the ptrace(3,...) function. */ + struct user_regs_struct regs; /* Where the registers are actually stored */ +/* ptrace does not yet supply these. Someday.... */ + int u_fpvalid; /* True if math co-processor being used. */ + /* for this mess. Not yet used. */ + struct user_i387_struct i387; /* Math Co-processor registers. */ +/* The rest of this junk is to help gdb figure out what goes where */ + unsigned long int u_tsize; /* Text segment size (pages). */ + unsigned long int u_dsize; /* Data segment size (pages). */ + unsigned long int u_ssize; /* Stack segment size (pages). */ + unsigned long start_code; /* Starting virtual address of text. */ + unsigned long start_stack; /* Starting virtual address of stack area. + This is actually the bottom of the stack, + the top of the stack is always found in the + esp register. */ + long int signal; /* Signal that caused the core dump. */ + int reserved; /* No longer used */ + unsigned long u_ar0; /* Used by gdb to help find the values for */ + /* the registers. */ + struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */ + unsigned long magic; /* To uniquely identify a core file */ + char u_comm[32]; /* User command that was responsible */ + int u_debugreg[8]; +}; +#define NBPG PAGE_SIZE +#define UPAGES 1 +#define HOST_TEXT_START_ADDR (u.start_code) +#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) + +#endif /* _ASM_X86_USER_32_H */ diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h new file mode 100644 index 000000000000..faf2cd3e0d76 --- /dev/null +++ b/arch/x86/include/asm/user_64.h @@ -0,0 +1,137 @@ +#ifndef _ASM_X86_USER_64_H +#define _ASM_X86_USER_64_H + +#include <asm/types.h> +#include <asm/page.h> +/* Core file format: The core file is written in such a way that gdb + can understand it and provide useful information to the user. + There are quite a number of obstacles to being able to view the + contents of the floating point registers, and until these are + solved you will not be able to view the contents of them. 
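The u_debugreg[] slots at the tail of struct user are how debuggers reach the hardware debug registers: they are addressed by their byte offset in this "user area" via PTRACE_PEEKUSER/PTRACE_POKEUSER. A hedged user-space sketch:

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/* read debug status register DR6 of a stopped tracee
   (error handling omitted for brevity) */
static long read_dr6(pid_t pid)
{
	return ptrace(PTRACE_PEEKUSER, pid,
		      offsetof(struct user, u_debugreg[6]), 0);
}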
+ Actually, you can read in the core file and look at the contents of + the user struct to find out what the floating point registers + contain. + + The actual file contents are as follows: + UPAGE: 1 page consisting of a user struct that tells gdb what is present + in the file. Directly after this is a copy of the task_struct, which + is currently not used by gdb, but it may come in useful at some point. + All of the registers are stored as part of the upage. The upage should + always be only one page. + DATA: The data area is stored. We use current->end_text to + current->brk to pick up all of the user variables, plus any memory + that may have been malloced. No attempt is made to determine if a page + is demand-zero or if a page is totally unused, we just cover the entire + range. All of the addresses are rounded in such a way that an integral + number of pages is written. + STACK: We need the stack information in order to get a meaningful + backtrace. We need to write the data from (esp) to + current->start_stack, so we round each of these off in order to be able + to write an integer number of pages. + The minimum core file size is 3 pages, or 12288 bytes. */ + +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for + * interacting with the FXSR-format floating point environment. Floating + * point data can be accessed in the regular format in the usual manner, + * and both the standard and SIMD floating point data can be accessed via + * the new ptrace requests. In either case, changes to the FPU environment + * will be reflected in the task's state as expected. + * + * x86-64 support by Andi Kleen. + */ + +/* This matches the 64bit FXSAVE format as defined by AMD. It is the same + as the 32bit format defined by Intel, except that the selector:offset pairs + for data and eip are replaced with flat 64bit pointers. */ +struct user_i387_struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; /* Note this is not the same as + the 32bit/x87/FSAVE twd */ + unsigned short fop; + __u64 rip; + __u64 rdp; + __u32 mxcsr; + __u32 mxcsr_mask; + __u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + __u32 padding[24]; +}; + +/* + * Segment register layout in coredumps. + */ +struct user_regs_struct { + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; + unsigned long bp; + unsigned long bx; + unsigned long r11; + unsigned long r10; + unsigned long r9; + unsigned long r8; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long si; + unsigned long di; + unsigned long orig_ax; + unsigned long ip; + unsigned long cs; + unsigned long flags; + unsigned long sp; + unsigned long ss; + unsigned long fs_base; + unsigned long gs_base; + unsigned long ds; + unsigned long es; + unsigned long fs; + unsigned long gs; +}; + +/* When the kernel dumps core, it starts by dumping the user struct - + this will be used by gdb to figure out where the data and stack segments + are within the file, and what virtual addresses to use. */ + +struct user { +/* We start with the registers, to mimic the way that "memory" is returned + from the ptrace(3,...) function. */ + struct user_regs_struct regs; /* Where the registers are actually stored */ +/* ptrace does not yet supply these. Someday.... */ + int u_fpvalid; /* True if math co-processor being used. 
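The 64-bit user_i387_struct above is a byte-for-byte image of the FXSAVE area, which the architecture fixes at 512 bytes; summing the fields (8 for the control words, 16 for rip/rdp, 8 for mxcsr and its mask, then 128 + 256 + 96 for the register and padding arrays) confirms it. A hypothetical compile-time guard for that invariant:

static inline void user_i387_layout_check(void)
{
	/* the 64-bit FXSAVE image is architecturally 512 bytes */
	BUILD_BUG_ON(sizeof(struct user_i387_struct) != 512);
}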
*/ + /* for this mess. Not yet used. */ + int pad0; + struct user_i387_struct i387; /* Math Co-processor registers. */ +/* The rest of this junk is to help gdb figure out what goes where */ + unsigned long int u_tsize; /* Text segment size (pages). */ + unsigned long int u_dsize; /* Data segment size (pages). */ + unsigned long int u_ssize; /* Stack segment size (pages). */ + unsigned long start_code; /* Starting virtual address of text. */ + unsigned long start_stack; /* Starting virtual address of stack area. + This is actually the bottom of the stack, + the top of the stack is always found in the + esp register. */ + long int signal; /* Signal that caused the core dump. */ + int reserved; /* No longer used */ + int pad1; + unsigned long u_ar0; /* Used by gdb to help find the values for */ + /* the registers. */ + struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */ + unsigned long magic; /* To uniquely identify a core file */ + char u_comm[32]; /* User command that was responsible */ + unsigned long u_debugreg[8]; + unsigned long error_code; /* CPU error code or 0 */ + unsigned long fault_address; /* CR3 or 0 */ +}; +#define NBPG PAGE_SIZE +#define UPAGES 1 +#define HOST_TEXT_START_ADDR (u.start_code) +#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) + +#endif /* _ASM_X86_USER_64_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h new file mode 100644 index 000000000000..d931d3b7e6f7 --- /dev/null +++ b/arch/x86/include/asm/uv/bios.h @@ -0,0 +1,94 @@ +#ifndef _ASM_X86_UV_BIOS_H +#define _ASM_X86_UV_BIOS_H + +/* + * UV BIOS layer definitions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include <linux/rtc.h> + +/* + * Values for the BIOS calls. It is passed as the first * argument in the + * BIOS call. Passing any other value in the first argument will result + * in a BIOS_STATUS_UNIMPLEMENTED return status. + */ +enum uv_bios_cmd { + UV_BIOS_COMMON, + UV_BIOS_GET_SN_INFO, + UV_BIOS_FREQ_BASE +}; + +/* + * Status values returned from a BIOS call. + */ +enum { + BIOS_STATUS_SUCCESS = 0, + BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, + BIOS_STATUS_EINVAL = -EINVAL, + BIOS_STATUS_UNAVAIL = -EBUSY +}; + +/* + * The UV system table describes specific firmware + * capabilities available to the Linux kernel at runtime. 
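A sketch of a kernel-side caller of this interface, assuming the firmware actually implements UV_BIOS_FREQ_BASE (older firmware would return BIOS_STATUS_UNIMPLEMENTED); the helper name is illustrative:

/* how fast does the UV real-time clock tick? (illustrative only) */
static unsigned long uv_rtc_ticks_per_sec(void)
{
	u64 ticks_per_sec = 0;

	if (uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
			      &ticks_per_sec) != BIOS_STATUS_SUCCESS)
		return 0;
	return (unsigned long)ticks_per_sec;
}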
+ */ +struct uv_systab { + char signature[4]; /* must be "UVST" */ + u32 revision; /* distinguish different firmware revs */ + u64 function; /* BIOS runtime callback function ptr */ +}; + +enum { + BIOS_FREQ_BASE_PLATFORM = 0, + BIOS_FREQ_BASE_INTERVAL_TIMER = 1, + BIOS_FREQ_BASE_REALTIME_CLOCK = 2 +}; + +union partition_info_u { + u64 val; + struct { + u64 hub_version : 8, + partition_id : 16, + coherence_id : 16, + region_size : 24; + }; +}; + +/* + * bios calls have 6 parameters + */ +extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); +extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); +extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); + +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_freq_base(u64, u64 *); + +extern void uv_bios_init(void); + +extern int uv_type; +extern long sn_partition_id; +extern long uv_coherency_id; +extern long uv_region_size; +#define partition_coherence_id() (uv_coherency_id) + +extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ + +#endif /* _ASM_X86_UV_BIOS_H */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h new file mode 100644 index 000000000000..e2363253bbbf --- /dev/null +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -0,0 +1,332 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV Broadcast Assist Unit definitions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#ifndef _ASM_X86_UV_UV_BAU_H +#define _ASM_X86_UV_UV_BAU_H + +#include <linux/bitmap.h> +#define BITSPERBYTE 8 + +/* + * Broadcast Assist Unit messaging structures + * + * Selective Broadcast activations are induced by software action + * specifying a particular 8-descriptor "set" via a 6-bit index written + * to an MMR. + * Thus there are 64 unique 512-byte sets of SB descriptors - one set for + * each 6-bit index value. These descriptor sets are mapped in sequence + * starting with set 0 located at the address specified in the + * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, + * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. + * + * We will use 31 sets, one for sending BAU messages from each of the 32 + * cpu's on the node. + * + * TLB shootdown will use the first of the 8 descriptors of each set. + * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). 
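Concretely: with 8 descriptors of 64 bytes per set, cpu N's first TLB-shootdown descriptor sits N * 8 entries past the base. A sketch of the arithmetic, where 'base' is assumed to be the kernel virtual address that was programmed into BAU_SB_DESCRIPTOR_BASE:

/* item 0 of the descriptor set owned by 'cpu' (sketch) */
static struct bau_desc *uv_first_descriptor(struct bau_desc *base, int cpu)
{
	return base + cpu * UV_ITEMS_PER_DESCRIPTOR;	/* 8 * 64B = one 512B set */
}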
+ */ + +#define UV_ITEMS_PER_DESCRIPTOR 8 +#define UV_CPUS_PER_ACT_STATUS 32 +#define UV_ACT_STATUS_MASK 0x3 +#define UV_ACT_STATUS_SIZE 2 +#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 +#define UV_DISTRIBUTION_SIZE 256 +#define UV_SW_ACK_NPENDING 8 +#define UV_NET_ENDPOINT_INTD 0x38 +#define UV_DESC_BASE_PNODE_SHIFT 49 +#define UV_PAYLOADQ_PNODE_SHIFT 49 +#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" +#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) + +/* + * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 + */ +#define DESC_STATUS_IDLE 0 +#define DESC_STATUS_ACTIVE 1 +#define DESC_STATUS_DESTINATION_TIMEOUT 2 +#define DESC_STATUS_SOURCE_TIMEOUT 3 + +/* + * source side thresholds at which message retries print a warning + */ +#define SOURCE_TIMEOUT_LIMIT 20 +#define DESTINATION_TIMEOUT_LIMIT 20 + +/* + * number of entries in the destination side payload queue + */ +#define DEST_Q_SIZE 17 +/* + * number of destination side software ack resources + */ +#define DEST_NUM_RESOURCES 8 +#define MAX_CPUS_PER_NODE 32 +/* + * completion statuses for sending a TLB flush message + */ +#define FLUSH_RETRY 1 +#define FLUSH_GIVEUP 2 +#define FLUSH_COMPLETE 3 + +/* + * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) + * If the 'multilevel' flag in the header portion of the descriptor + * has been set to 0, then endpoint multi-unicast mode is selected. + * The distribution specification (32 bytes) is interpreted as a 256-bit + * distribution vector. Adjacent bits correspond to consecutive even numbered + * nodeIDs. The result of adding the index of a given bit to the 15-bit + * 'base_dest_nodeid' field of the header corresponds to the + * destination nodeID associated with that specified bit. + */ +struct bau_target_nodemask { + unsigned long bits[BITS_TO_LONGS(256)]; +}; + +/* + * mask of cpu's on a node + * (during initialization we need to check that unsigned long has + * enough bits for max. cpu's per node) + */ +struct bau_local_cpumask { + unsigned long bits; +}; + +/* + * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor) + * only 12 bytes (96 bits) of the payload area are usable. + * An additional 3 bytes (bits 27:4) of the header address are carried + * to the next bytes of the destination payload queue. + * And an additional 2 bytes of the header Suppl_A field are also + * carried to the destination payload queue. + * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte) + * of the destination payload queue, which is written by the hardware + * with the s/w ack resource bit vector. + * [ effective message contents (16 bytes (128 bits) maximum), not counting + * the s/w ack bit vector ] + */ + +/* + * The payload is software-defined for INTD transactions + */ +struct bau_msg_payload { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ + /* 64 bits */ + unsigned short sending_cpu; /* filled in by sender */ + /* 16 bits */ + unsigned short acknowledge_count;/* filled in by destination */ + /* 16 bits */ + unsigned int reserved1:32; /* not usable */ +}; + + +/* + * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * see table 4.2.3.0.1 in broadcast_assist spec. 
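The distribution vector is built with the small inline helpers defined at the bottom of this header; a sketch that targets a single pnode (the function name is illustrative):

static void uv_target_one_pnode(struct bau_target_nodemask *dist, int pnode)
{
	bau_nodes_clear(dist, 256);	/* wipe the full 256-bit node vector */
	bau_node_set(pnode, dist);	/* aim at just this pnode */
}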
+ */ +struct bau_msg_header { + int dest_subnodeid:6; /* must be zero */ + /* bits 5:0 */ + int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */ + /* bits 20:6 */ + int command:8; /* message type */ + /* bits 28:21 */ + /* 0x38: SN3net EndPoint Message */ + int rsvd_1:3; /* must be zero */ + /* bits 31:29 */ + /* int will align on 32 bits */ + int rsvd_2:9; /* must be zero */ + /* bits 40:32 */ + /* Suppl_A is 56-41 */ + int payload_2a:8; /* becomes byte 16 of msg */ + /* bits 48:41 */ /* not currently using */ + int payload_2b:8; /* becomes byte 17 of msg */ + /* bits 56:49 */ /* not currently using */ + /* Address field (96:57) is never used as an + address (these are address bits 42:3) */ + int rsvd_3:1; /* must be zero */ + /* bit 57 */ + /* address bits 27:4 are payload */ + /* these 24 bits become bytes 12-14 of msg */ + int replied_to:1; /* sent as 0 by the source to byte 12 */ + /* bit 58 */ + + int payload_1a:5; /* not currently used */ + /* bits 63:59 */ + int payload_1b:8; /* not currently used */ + /* bits 71:64 */ + int payload_1c:8; /* not currently used */ + /* bits 79:72 */ + int payload_1d:2; /* not currently used */ + /* bits 81:80 */ + + int rsvd_4:7; /* must be zero */ + /* bits 88:82 */ + int sw_ack_flag:1; /* software acknowledge flag */ + /* bit 89 */ + /* INTD transactions at destination are to + wait for software acknowledge */ + int rsvd_5:6; /* must be zero */ + /* bits 95:90 */ + int rsvd_6:5; /* must be zero */ + /* bits 100:96 */ + int int_both:1; /* if 1, interrupt both sockets on the blade */ + /* bit 101*/ + int fairness:3; /* usually zero */ + /* bits 104:102 */ + int multilevel:1; /* multi-level multicast format */ + /* bit 105 */ + /* 0 for TLB: endpoint multi-unicast messages */ + int chaining:1; /* next descriptor is part of this activation*/ + /* bit 106 */ + int rsvd_7:21; /* must be zero */ + /* bits 127:107 */ +}; + +/* + * The activation descriptor: + * The format of the message to send, plus all accompanying control + * Should be 64 bytes + */ +struct bau_desc { + struct bau_target_nodemask distribution; + /* + * message template, consisting of header and payload: + */ + struct bau_msg_header header; + struct bau_msg_payload payload; +}; +/* + * -payload-- ---------header------ + * bytes 0-11 bits 41-56 bits 58-81 + * A B (2) C (3) + * + * A/B/C are moved to: + * A C B + * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) + * ------------payload queue----------- + */ + +/* + * The payload queue on the destination side is an array of these. + * With BAU_MISC_CONTROL set for software acknowledge mode, the messages + * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 + * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) + * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from + * sw_ack_vector and payload_2) + * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software + * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload + * operation." 
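Because the hardware parses this layout bit-for-bit, the "Should be 64 bytes" remark is a hard invariant: 32 bytes of distribution plus 16 of header plus 16 of payload. A hypothetical compile-time check of those sizes:

static inline void bau_desc_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct bau_target_nodemask) != 32);
	BUILD_BUG_ON(sizeof(struct bau_msg_header) != 16);
	BUILD_BUG_ON(sizeof(struct bau_desc) != 64);
}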
+ */ +struct bau_payload_queue_entry { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ + /* 64 bits, bytes 0-7 */ + + unsigned short sending_cpu; /* cpu that sent the message */ + /* 16 bits, bytes 8-9 */ + + unsigned short acknowledge_count; /* filled in by destination */ + /* 16 bits, bytes 10-11 */ + + unsigned short replied_to:1; /* sent as 0 by the source */ + /* 1 bit */ + unsigned short unused1:7; /* not currently using */ + /* 7 bits: byte 12) */ + + unsigned char unused2[2]; /* not currently using */ + /* bytes 13-14 */ + + unsigned char sw_ack_vector; /* filled in by the hardware */ + /* byte 15 (bits 127:120) */ + + unsigned char unused4[3]; /* not currently using bytes 17-19 */ + /* bytes 17-19 */ + + int number_of_cpus; /* filled in at destination */ + /* 32 bits, bytes 20-23 (aligned) */ + + unsigned char unused5[8]; /* not using */ + /* bytes 24-31 */ +}; + +/* + * one for every slot in the destination payload queue + */ +struct bau_msg_status { + struct bau_local_cpumask seen_by; /* map of cpu's */ +}; + +/* + * one for every slot in the destination software ack resources + */ +struct bau_sw_ack_status { + struct bau_payload_queue_entry *msg; /* associated message */ + int watcher; /* cpu monitoring, or -1 */ +}; + +/* + * one on every node and per-cpu; to locate the software tables + */ +struct bau_control { + struct bau_desc *descriptor_base; + struct bau_payload_queue_entry *bau_msg_head; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; + struct bau_msg_status *msg_statuses; + int *watching; /* pointer to array */ +}; + +/* + * This structure is allocated per_cpu for UV TLB shootdown statistics. + */ +struct ptc_stats { + unsigned long ptc_i; /* number of IPI-style flushes */ + unsigned long requestor; /* number of nodes this cpu sent to */ + unsigned long requestee; /* times cpu was remotely requested */ + unsigned long alltlb; /* times all tlb's on this cpu were flushed */ + unsigned long onetlb; /* times just one tlb on this cpu was flushed */ + unsigned long s_retry; /* retries on source side timeouts */ + unsigned long d_retry; /* retries on destination side timeouts */ + unsigned long sflush; /* cycles spent in uv_flush_tlb_others */ + unsigned long dflush; /* cycles spent on destination side */ + unsigned long retriesok; /* successes on retries */ + unsigned long nomsg; /* interrupts with no message */ + unsigned long multmsg; /* interrupts with multiple messages */ + unsigned long ntargeted;/* nodes targeted */ +}; + +static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp) +{ + return constant_test_bit(node, &dstp->bits[0]); +} +static inline void bau_node_set(int node, struct bau_target_nodemask *dstp) +{ + __set_bit(node, &dstp->bits[0]); +} +static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits) +{ + bitmap_zero(&dstp->bits[0], nbits); +} + +static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) +{ + bitmap_zero(&dstp->bits, nbits); +} + +#define cpubit_isset(cpu, bau_local_cpumask) \ + test_bit((cpu), (bau_local_cpumask).bits) + +extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long); +extern void uv_bau_message_intr1(void); +extern void uv_bau_timeout_intr1(void); + +#endif /* _ASM_X86_UV_UV_BAU_H */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h new file mode 100644 index 000000000000..c6ad93e315c8 --- /dev/null +++ b/arch/x86/include/asm/uv/uv_hub.h @@ 
-0,0 +1,354 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV architectural definitions + * + * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + */ + +#ifndef _ASM_X86_UV_UV_HUB_H +#define _ASM_X86_UV_UV_HUB_H + +#include <linux/numa.h> +#include <linux/percpu.h> +#include <asm/types.h> +#include <asm/percpu.h> + + +/* + * Addressing Terminology + * + * M - The low M bits of a physical address represent the offset + * into the blade local memory. RAM memory on a blade is physically + * contiguous (although various IO spaces may punch holes in + * it).. + * + * N - Number of bits in the node portion of a socket physical + * address. + * + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. + * + * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead + * of nasids. + * + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. + * + * + * NumaLink Global Physical Address Format: + * +--------------------------------+---------------------+ + * |00..000| GNODE | NodeOffset | + * +--------------------------------+---------------------+ + * |<-------53 - M bits --->|<--------M bits -----> + * + * M - number of node offset bits (35 .. 40) + * + * + * Memory/UV-HUB Processor Socket Address Format: + * +----------------+---------------+---------------------+ + * |00..000000000000| PNODE | NodeOffset | + * +----------------+---------------+---------------------+ + * <--- N bits --->|<--------M bits -----> + * + * M - number of node offset bits (35 .. 40) + * N - number of PNODE bits (0 .. 10) + * + * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64). + * The actual values are configuration dependent and are set at + * boot time. M & N values are set by the hardware/BIOS at boot. + * + * + * APICID format + * NOTE!!!!!! This is the current format of the APICID. However, code + * should assume that this will change in the future. Use functions + * in this file for all APICID bit manipulations and conversion. + * + * 1111110000000000 + * 5432109876543210 + * pppppppppplc0cch + * sssssssssss + * + * p = pnode bits + * l = socket number on board + * c = core + * h = hyperthread + * s = bits that are in the SOCKET_ID CSR + * + * Note: Processor only supports 12 bits in the APICID register. The ACPI + * tables hold all 16 bits. Software needs to be aware of this. + * + * Unless otherwise specified, all references to APICID refer to + * the FULL value contained in ACPI tables, not the subset in the + * processor APICID register. + */ + + +/* + * Maximum number of bricks in all partitions and in all coherency domains. + * This is the total number of bricks accessible in the numalink fabric. It + * includes all C & M bricks. Routers are NOT included. + * + * This value is also the value of the maximum number of non-router NASIDs + * in the numalink fabric. + * + * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused. + */ +#define UV_MAX_NUMALINK_BLADES 16384 + +/* + * Maximum number of C/Mbricks within a software SSI (hardware may support + * more). 
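Reading the two address diagrams together: a socket physical address is just the pnode shifted up by the node-offset width, OR'd with the blade-local offset, which is exactly what uv_pnode_offset_to_vaddr() below feeds to __va(). The arithmetic in isolation (a sketch, not a kernel API):

static inline unsigned long uv_pnode_offset_to_paddr(int pnode,
						     unsigned long offset)
{
	/* m_val = number of node-offset bits, set from BIOS at boot */
	return ((unsigned long)pnode << uv_hub_info->m_val) | offset;
}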
+ */ +#define UV_MAX_SSI_BLADES 256 + +/* + * The largest possible NASID of a C or M brick (+ 2) + */ +#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) + +/* + * The following defines attributes of the HUB chip. These attributes are + * frequently referenced and are kept in the per-cpu data areas of each cpu. + * They are kept together in a struct to minimize cache misses. + */ +struct uv_hub_info_s { + unsigned long global_mmr_base; + unsigned long gpa_mask; + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; + unsigned short coherency_domain_number; + unsigned short numa_blade_id; + unsigned char blade_processor_id; + unsigned char m_val; + unsigned char n_val; +}; +DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); +#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) + +/* + * Local & Global MMR space macros. + * Note: macros are intended to be used ONLY by inline functions + * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nasids, right shifted 1) + */ +#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) +#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) + +#define UV_LOCAL_MMR_BASE 0xf4000000UL +#define UV_GLOBAL_MMR32_BASE 0xf8000000UL +#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) +#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) +#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) + +#define UV_GLOBAL_MMR32_PNODE_SHIFT 15 +#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 + +#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) + +#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ + ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) + +#define UV_APIC_PNODE_SHIFT 6 + +/* + * Macros for converting between kernel virtual addresses, socket local physical + * addresses, and UV global physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. + */ + +/* socket phys RAM --> UV global physical address */ +static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) +{ + if (paddr < uv_hub_info->lowmem_remap_top) + paddr += uv_hub_info->lowmem_remap_base; + return paddr | uv_hub_info->gnode_upper; +} + + +/* socket virtual --> UV global physical address */ +static inline unsigned long uv_gpa(void *v) +{ + return __pa(v) | uv_hub_info->gnode_upper; +} + +/* socket virtual --> UV global physical address */ +static inline void *uv_vgpa(void *v) +{ + return (void *)uv_gpa(v); +} + +/* UV global physical address --> socket virtual */ +static inline void *uv_va(unsigned long gpa) +{ + return __va(gpa & uv_hub_info->gpa_mask); +} + +/* pnode, offset --> socket virtual */ +static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) +{ + return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset); +} + + +/* + * Extract a PNODE from an APICID (full apicid, not processor subset) + */ +static inline int uv_apicid_to_pnode(int apicid) +{ + return (apicid >> UV_APIC_PNODE_SHIFT); +} + +/* + * Access global MMRs using the low memory MMR32 space. This region supports + * faster MMR access but not all MMRs are accessible in this space. 
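Both MMR windows resolve to a plain __va() pointer, so a remote read-modify-write is an ordinary load and store through the accessors defined just below. A sketch using an arbitrary register offset and the UV_MMR_ENABLE bit from uv_mmrs.h further down; the helper name is illustrative:

/* set the enable bit of a remote hub register (illustrative) */
static void uv_mmr_set_enable(int pnode, unsigned long mmr_offset)
{
	unsigned long v = uv_read_global_mmr64(pnode, mmr_offset);

	uv_write_global_mmr64(pnode, mmr_offset, v | UV_MMR_ENABLE);
}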
+ */ +static inline unsigned long *uv_global_mmr32_address(int pnode, + unsigned long offset) +{ + return __va(UV_GLOBAL_MMR32_BASE | + UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); +} + +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, + unsigned long val) +{ + *uv_global_mmr32_address(pnode, offset) = val; +} + +static inline unsigned long uv_read_global_mmr32(int pnode, + unsigned long offset) +{ + return *uv_global_mmr32_address(pnode, offset); +} + +/* + * Access Global MMR space using the MMR space located at the top of physical + * memory. + */ +static inline unsigned long *uv_global_mmr64_address(int pnode, + unsigned long offset) +{ + return __va(UV_GLOBAL_MMR64_BASE | + UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); +} + +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, + unsigned long val) +{ + *uv_global_mmr64_address(pnode, offset) = val; +} + +static inline unsigned long uv_read_global_mmr64(int pnode, + unsigned long offset) +{ + return *uv_global_mmr64_address(pnode, offset); +} + +/* + * Access hub local MMRs. Faster than using global space but only local MMRs + * are accessible. + */ +static inline unsigned long *uv_local_mmr_address(unsigned long offset) +{ + return __va(UV_LOCAL_MMR_BASE | offset); +} + +static inline unsigned long uv_read_local_mmr(unsigned long offset) +{ + return *uv_local_mmr_address(offset); +} + +static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) +{ + *uv_local_mmr_address(offset) = val; +} + +/* + * Structures and definitions for converting between cpu, node, pnode, and blade + * numbers. + */ +struct uv_blade_info { + unsigned short nr_possible_cpus; + unsigned short nr_online_cpus; + unsigned short pnode; +}; +extern struct uv_blade_info *uv_blade_info; +extern short *uv_node_to_blade; +extern short *uv_cpu_to_blade; +extern short uv_possible_blades; + +/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */ +static inline int uv_blade_processor_id(void) +{ + return uv_hub_info->blade_processor_id; +} + +/* Blade number of current cpu. Numbered 0 .. 
<#blades -1> */ +static inline int uv_numa_blade_id(void) +{ + return uv_hub_info->numa_blade_id; +} + +/* Convert a cpu number to the UV blade number */ +static inline int uv_cpu_to_blade_id(int cpu) +{ + return uv_cpu_to_blade[cpu]; +} + +/* Convert linux node number to the UV blade number */ +static inline int uv_node_to_blade_id(int nid) +{ + return uv_node_to_blade[nid]; +} + +/* Convert a blade id to the PNODE of the blade */ +static inline int uv_blade_to_pnode(int bid) +{ + return uv_blade_info[bid].pnode; +} + +/* Determine the number of possible cpus on a blade */ +static inline int uv_blade_nr_possible_cpus(int bid) +{ + return uv_blade_info[bid].nr_possible_cpus; +} + +/* Determine the number of online cpus on a blade */ +static inline int uv_blade_nr_online_cpus(int bid) +{ + return uv_blade_info[bid].nr_online_cpus; +} + +/* Convert a cpu id to the PNODE of the blade containing the cpu */ +static inline int uv_cpu_to_pnode(int cpu) +{ + return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode; +} + +/* Convert a linux node number to the PNODE of the blade */ +static inline int uv_node_to_pnode(int nid) +{ + return uv_blade_info[uv_node_to_blade_id(nid)].pnode; +} + +/* Maximum possible number of blades */ +static inline int uv_num_possible_blades(void) +{ + return uv_possible_blades; +} + +#endif /* _ASM_X86_UV_UV_HUB_H */ + diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h new file mode 100644 index 000000000000..9613c8c0b647 --- /dev/null +++ b/arch/x86/include/asm/uv/uv_irq.h @@ -0,0 +1,36 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ definitions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#ifndef _ASM_X86_UV_UV_IRQ_H +#define _ASM_X86_UV_UV_IRQ_H + +/* If a generic version of this structure gets defined, eliminate this one. */ +struct uv_IO_APIC_route_entry { + __u64 vector : 8, + delivery_mode : 3, + dest_mode : 1, + delivery_status : 1, + polarity : 1, + __reserved_1 : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15, + dest : 32; +}; + +extern struct irq_chip uv_irq_chip; + +extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); +extern void arch_disable_uv_irq(int, unsigned long); + +extern int uv_setup_irq(char *, int, int, unsigned long); +extern void uv_teardown_irq(unsigned int, int, unsigned long); + +#endif /* _ASM_X86_UV_UV_IRQ_H */ diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h new file mode 100644 index 000000000000..dd627793a234 --- /dev/null +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -0,0 +1,1295 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV MMR definitions + * + * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 
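These conversions are what the UV interrupt path leans on: to interrupt a remote cpu, resolve its blade's pnode, assemble a UVH_IPI_INT value (register and shifts defined below in uv_mmrs.h), and write it through the global MMR window. A sketch modeled on that path; 'apicid' and 'vector' are assumed inputs and the function name is illustrative:

static void uv_ipi_sketch(int cpu, unsigned long apicid, int vector)
{
	int pnode = uv_cpu_to_pnode(cpu);
	unsigned long val;

	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
	      ((unsigned long)vector << UVH_IPI_INT_VECTOR_SHFT);
	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}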
+ */ + +#ifndef _ASM_X86_UV_UV_MMRS_H +#define _ASM_X86_UV_UV_MMRS_H + +#define UV_MMR_ENABLE (1UL << 63) + +/* ========================================================================= */ +/* UVH_BAU_DATA_CONFIG */ +/* ========================================================================= */ +#define UVH_BAU_DATA_CONFIG 0x61680UL +#define UVH_BAU_DATA_CONFIG_32 0x0438 + +#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 +#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 +#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 +#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 +#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_BAU_DATA_CONFIG_P_SHFT 13 +#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_BAU_DATA_CONFIG_T_SHFT 15 +#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_BAU_DATA_CONFIG_M_SHFT 16 +#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 +#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_bau_data_config_u { + unsigned long v; + struct uvh_bau_data_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_EVENT_OCCURRED0 */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED0 0x70000UL +#define UVH_EVENT_OCCURRED0_32 0x005e8 + +#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 +#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL +#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 +#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL +#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3 +#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL +#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4 +#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL +#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5 +#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL +#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6 +#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL +#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 +#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL +#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 +#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL +#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 +#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL +#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 +#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL +#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 +#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL +#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 +#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL +#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 +#define 
UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL +#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 +#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL +#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 +#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL +#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 +#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL +#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 +#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL +#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 +#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL +#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 +#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL +#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 +#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL +#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 +#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL +#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 +#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL +#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 +#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 +#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43 +#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL +#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 +#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 
0x0000100000000000UL +#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45 +#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 +#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 +#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 +#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL +#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 +#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL +#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51 +#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL +#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52 +#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL +#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53 +#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL +#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54 +#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL +#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55 +#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL +#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 +#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL +union uvh_event_occurred0_u { + unsigned long v; + struct uvh_event_occurred0_s { + unsigned long lb_hcerr : 1; /* RW, W1C */ + unsigned long gr0_hcerr : 1; /* RW, W1C */ + unsigned long gr1_hcerr : 1; /* RW, W1C */ + unsigned long lh_hcerr : 1; /* RW, W1C */ + unsigned long rh_hcerr : 1; /* RW, W1C */ + unsigned long xn_hcerr : 1; /* RW, W1C */ + unsigned long si_hcerr : 1; /* RW, W1C */ + unsigned long lb_aoerr0 : 1; /* RW, W1C */ + unsigned long gr0_aoerr0 : 1; /* RW, W1C */ + unsigned long gr1_aoerr0 : 1; /* RW, W1C */ + unsigned long lh_aoerr0 : 1; /* RW, W1C */ + unsigned long rh_aoerr0 : 1; /* RW, W1C */ + unsigned long xn_aoerr0 : 1; /* RW, W1C */ + unsigned long si_aoerr0 : 1; /* RW, W1C */ + unsigned long lb_aoerr1 : 1; /* RW, W1C */ + unsigned long gr0_aoerr1 : 1; /* RW, W1C */ + unsigned long gr1_aoerr1 : 1; /* RW, W1C */ + unsigned long lh_aoerr1 : 1; /* RW, W1C */ + unsigned long rh_aoerr1 : 1; /* RW, W1C */ + unsigned long xn_aoerr1 : 1; /* RW, W1C */ + unsigned long si_aoerr1 : 1; /* RW, W1C */ + unsigned long rh_vpi_int : 1; /* RW, W1C */ + unsigned long system_shutdown_int : 1; /* RW, W1C */ + unsigned long lb_irq_int_0 : 1; /* RW, W1C */ + unsigned long lb_irq_int_1 : 1; /* RW, W1C */ + unsigned long lb_irq_int_2 : 1; /* RW, W1C */ + unsigned long lb_irq_int_3 : 1; /* RW, W1C */ + unsigned long lb_irq_int_4 : 1; /* RW, W1C */ + unsigned long lb_irq_int_5 : 1; /* RW, W1C */ + unsigned long lb_irq_int_6 : 1; /* RW, W1C */ + unsigned long lb_irq_int_7 : 1; /* RW, W1C */ + unsigned long lb_irq_int_8 : 1; /* RW, W1C */ + unsigned long lb_irq_int_9 : 1; /* RW, W1C */ + unsigned long lb_irq_int_10 : 1; /* RW, W1C */ + unsigned long lb_irq_int_11 : 1; /* RW, W1C */ + unsigned long lb_irq_int_12 : 1; /* RW, W1C */ + unsigned long lb_irq_int_13 : 1; /* RW, W1C */ + unsigned long lb_irq_int_14 : 1; /* RW, W1C */ + unsigned long lb_irq_int_15 : 1; /* RW, W1C */ + unsigned long l1_nmi_int : 1; /* RW, W1C */ + unsigned long stop_clock : 1; /* RW, W1C */ + unsigned long asic_to_l1 : 1; /* RW, W1C */ + unsigned long l1_to_asic : 1; /* RW, W1C */ + unsigned long ltc_int : 1; /* RW, W1C */ + unsigned long la_seq_trigger : 1; /* RW, W1C */ + unsigned long 
ipi_int : 1; /* RW, W1C */ + unsigned long extio_int0 : 1; /* RW, W1C */ + unsigned long extio_int1 : 1; /* RW, W1C */ + unsigned long extio_int2 : 1; /* RW, W1C */ + unsigned long extio_int3 : 1; /* RW, W1C */ + unsigned long profile_int : 1; /* RW, W1C */ + unsigned long rtc0 : 1; /* RW, W1C */ + unsigned long rtc1 : 1; /* RW, W1C */ + unsigned long rtc2 : 1; /* RW, W1C */ + unsigned long rtc3 : 1; /* RW, W1C */ + unsigned long bau_data : 1; /* RW, W1C */ + unsigned long power_management_req : 1; /* RW, W1C */ + unsigned long rsvd_57_63 : 7; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_EVENT_OCCURRED0_ALIAS */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL +#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0 + +/* ========================================================================= */ +/* UVH_INT_CMPB */ +/* ========================================================================= */ +#define UVH_INT_CMPB 0x22080UL + +#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 +#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL + +union uvh_int_cmpb_u { + unsigned long v; + struct uvh_int_cmpb_s { + unsigned long real_time_cmpb : 56; /* RW */ + unsigned long rsvd_56_63 : 8; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_INT_CMPC */ +/* ========================================================================= */ +#define UVH_INT_CMPC 0x22100UL + +#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 +#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL + +union uvh_int_cmpc_u { + unsigned long v; + struct uvh_int_cmpc_s { + unsigned long real_time_cmpc : 56; /* RW */ + unsigned long rsvd_56_63 : 8; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_INT_CMPD */ +/* ========================================================================= */ +#define UVH_INT_CMPD 0x22180UL + +#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 +#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL + +union uvh_int_cmpd_u { + unsigned long v; + struct uvh_int_cmpd_s { + unsigned long real_time_cmpd : 56; /* RW */ + unsigned long rsvd_56_63 : 8; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_IPI_INT */ +/* ========================================================================= */ +#define UVH_IPI_INT 0x60500UL +#define UVH_IPI_INT_32 0x0348 + +#define UVH_IPI_INT_VECTOR_SHFT 0 +#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL +#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 +#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL +#define UVH_IPI_INT_DESTMODE_SHFT 11 +#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL +#define UVH_IPI_INT_APIC_ID_SHFT 16 +#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL +#define UVH_IPI_INT_SEND_SHFT 63 +#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL + +union uvh_ipi_int_u { + unsigned long v; + struct uvh_ipi_int_s { + unsigned long vector_ : 8; /* RW */ + unsigned long delivery_mode : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long rsvd_12_15 : 4; /* */ + unsigned long apic_id : 32; /* RW */ + unsigned long rsvd_48_62 : 15; /* */ + unsigned long send : 1; /* WP */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ +/* 
========================================================================= */ +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0 + +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + +union uvh_lb_bau_intd_payload_queue_first_u { + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3: 4; /* */ + unsigned long address : 39; /* RW */ + unsigned long rsvd_43_48: 6; /* */ + unsigned long node_id : 14; /* RW */ + unsigned long rsvd_63 : 1; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ +/* ========================================================================= */ +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8 + +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + +union uvh_lb_bau_intd_payload_queue_last_u { + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3: 4; /* */ + unsigned long address : 39; /* RW */ + unsigned long rsvd_43_63: 21; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ +/* ========================================================================= */ +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0 + +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + +union uvh_lb_bau_intd_payload_queue_tail_u { + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3: 4; /* */ + unsigned long address : 39; /* RW */ + unsigned long rsvd_43_63: 21; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ +/* ========================================================================= */ +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68 + +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define 
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL +union uvh_lb_bau_intd_software_acknowledge_u { + unsigned long v; + struct uvh_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0 : 1; /* RW, W1C */ + unsigned long pending_1 : 1; /* RW, W1C */ + unsigned long pending_2 : 1; /* RW, W1C */ + unsigned long pending_3 : 1; /* RW, W1C */ + unsigned long pending_4 : 1; /* RW, W1C */ + unsigned long pending_5 : 1; /* RW, W1C */ + unsigned long pending_6 : 1; /* RW, W1C */ + unsigned long pending_7 : 1; /* RW, W1C */ + unsigned long timeout_0 : 1; /* RW, W1C */ + unsigned long timeout_1 : 1; /* RW, W1C */ + unsigned long timeout_2 : 1; /* RW, W1C */ + unsigned long timeout_3 : 1; /* RW, W1C */ + unsigned long timeout_4 : 1; /* RW, W1C */ + unsigned long timeout_5 : 1; /* RW, W1C */ + unsigned long timeout_6 : 1; /* RW, W1C */ + unsigned long timeout_7 : 1; /* RW, W1C */ + unsigned long rsvd_16_63: 48; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ +/* ========================================================================= */ +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70 + +/* ========================================================================= */ +/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ +/* ========================================================================= */ +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8 + +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL + +union uvh_lb_bau_sb_activation_control_u { + unsigned long v; + struct uvh_lb_bau_sb_activation_control_s { + unsigned long index : 6; /* RW */ + unsigned long rsvd_6_61: 56; /* */ + unsigned long push : 1; /* WP */ + unsigned long init : 1; /* WP */ + } s; +}; + +/* 
========================================================================= */ +/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ +/* ========================================================================= */ +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0 + +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL + +union uvh_lb_bau_sb_activation_status_0_u { + unsigned long v; + struct uvh_lb_bau_sb_activation_status_0_s { + unsigned long status : 64; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ +/* ========================================================================= */ +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8 + +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL + +union uvh_lb_bau_sb_activation_status_1_u { + unsigned long v; + struct uvh_lb_bau_sb_activation_status_1_s { + unsigned long status : 64; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ +/* ========================================================================= */ +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0 + +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL + +union uvh_lb_bau_sb_descriptor_base_u { + unsigned long v; + struct uvh_lb_bau_sb_descriptor_base_s { + unsigned long rsvd_0_11 : 12; /* */ + unsigned long page_address : 31; /* RW */ + unsigned long rsvd_43_48 : 6; /* */ + unsigned long node_id : 14; /* RW */ + unsigned long rsvd_63 : 1; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */ +/* ========================================================================= */ +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL + +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 
0x0000000000000080UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 
0x0000000040000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL + +union uvh_lb_mcast_aoerr0_rpt_enable_u { + unsigned long v; + struct uvh_lb_mcast_aoerr0_rpt_enable_s { + unsigned long mcast_obese_msg : 1; /* RW */ + unsigned long mcast_data_sb_err : 1; /* RW */ + unsigned long mcast_nack_buff_parity : 1; /* RW */ + unsigned long mcast_timeout : 1; /* RW */ + unsigned long mcast_inactive_reply : 1; /* RW */ + unsigned long mcast_upgrade_error : 1; /* RW */ + unsigned long mcast_reg_count_underflow : 1; /* RW */ + unsigned long mcast_rep_obese_msg : 1; /* RW */ + unsigned long ucache_req_runt_msg : 1; /* RW */ + unsigned long ucache_req_obese_msg : 1; /* RW */ + unsigned long ucache_req_data_sb_err : 1; /* RW */ + unsigned long ucache_rep_runt_msg : 1; /* RW */ + unsigned long ucache_rep_obese_msg : 1; /* RW */ + unsigned long ucache_rep_data_sb_err : 1; /* RW */ + unsigned long ucache_rep_command_err : 1; /* RW */ + unsigned long ucache_pend_timeout : 1; /* RW */ + unsigned long macc_req_runt_msg : 1; /* RW */ + unsigned long macc_req_obese_msg : 1; /* RW */ + unsigned long macc_req_data_sb_err : 1; /* RW */ + unsigned long macc_rep_runt_msg : 1; /* RW */ + unsigned long macc_rep_obese_msg : 1; /* RW */ + unsigned long macc_rep_data_sb_err : 1; /* RW */ + unsigned long macc_amo_timeout : 1; /* RW */ + unsigned long macc_put_timeout : 1; /* RW */ + unsigned long macc_spurious_event : 1; /* RW */ + unsigned long ioh_destination_table_parity : 1; /* RW */ + unsigned long get_had_error_reply : 1; /* RW */ + unsigned long get_timeout : 1; /* RW */ + unsigned long 
lock_manager_had_error_reply : 1; /* RW */ + unsigned long put_had_error_reply : 1; /* RW */ + unsigned long put_timeout : 1; /* RW */ + unsigned long sb_activation_overrun : 1; /* RW */ + unsigned long completed_gb_activation_had_error_reply : 1; /* RW */ + unsigned long completed_gb_activation_timeout : 1; /* RW */ + unsigned long descriptor_buffer_0_parity : 1; /* RW */ + unsigned long descriptor_buffer_1_parity : 1; /* RW */ + unsigned long socket_destination_table_parity : 1; /* RW */ + unsigned long bau_reply_payload_corruption : 1; /* RW */ + unsigned long io_port_destination_table_parity : 1; /* RW */ + unsigned long intd_soft_ack_timeout : 1; /* RW */ + unsigned long int_rep_obese_msg : 1; /* RW */ + unsigned long int_rep_command_err : 1; /* RW */ + unsigned long int_timeout : 1; /* RW */ + unsigned long rsvd_43_63 : 21; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LOCAL_INT0_CONFIG */ +/* ========================================================================= */ +#define UVH_LOCAL_INT0_CONFIG 0x61000UL + +#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0 +#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8 +#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12 +#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13 +#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15 +#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16 +#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_local_int0_config_u { + unsigned long v; + struct uvh_local_int0_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LOCAL_INT0_ENABLE */ +/* ========================================================================= */ +#define UVH_LOCAL_INT0_ENABLE 0x65000UL + +#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0 +#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL +#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1 +#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL +#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2 +#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL +#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3 +#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL +#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4 +#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL +#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5 +#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL +#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6 +#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7 +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL +#define 
UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8 +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9 +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10 +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11 +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12 +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13 +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14 +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15 +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16 +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17 +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18 +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19 +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20 +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL +#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21 +#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL +#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37 +#define 
UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39 +#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL +#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40 +#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL +#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41 +#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42 +#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43 +#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL +#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44 +#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL + +union uvh_local_int0_enable_u { + unsigned long v; + struct uvh_local_int0_enable_s { + unsigned long lb_hcerr : 1; /* RW */ + unsigned long gr0_hcerr : 1; /* RW */ + unsigned long gr1_hcerr : 1; /* RW */ + unsigned long lh_hcerr : 1; /* RW */ + unsigned long rh_hcerr : 1; /* RW */ + unsigned long xn_hcerr : 1; /* RW */ + unsigned long si_hcerr : 1; /* RW */ + unsigned long lb_aoerr0 : 1; /* RW */ + unsigned long gr0_aoerr0 : 1; /* RW */ + unsigned long gr1_aoerr0 : 1; /* RW */ + unsigned long lh_aoerr0 : 1; /* RW */ + unsigned long rh_aoerr0 : 1; /* RW */ + unsigned long xn_aoerr0 : 1; /* RW */ + unsigned long si_aoerr0 : 1; /* RW */ + unsigned long lb_aoerr1 : 1; /* RW */ + unsigned long gr0_aoerr1 : 1; /* RW */ + unsigned long gr1_aoerr1 : 1; /* RW */ + unsigned long lh_aoerr1 : 1; /* RW */ + unsigned long rh_aoerr1 : 1; /* RW */ + unsigned long xn_aoerr1 : 1; /* RW */ + unsigned long si_aoerr1 : 1; /* RW */ + unsigned long rh_vpi_int : 1; /* RW */ + unsigned long system_shutdown_int : 1; /* RW */ + unsigned long lb_irq_int_0 : 1; /* RW */ + unsigned long lb_irq_int_1 : 1; /* RW */ + unsigned long lb_irq_int_2 : 1; /* RW */ + unsigned long lb_irq_int_3 : 1; /* RW */ + unsigned long lb_irq_int_4 : 1; /* RW */ + unsigned long lb_irq_int_5 : 1; /* RW */ + unsigned long lb_irq_int_6 : 1; /* RW */ + unsigned long lb_irq_int_7 : 1; /* RW */ + unsigned long lb_irq_int_8 : 1; /* RW */ + unsigned long lb_irq_int_9 : 1; /* RW */ + unsigned long lb_irq_int_10 : 1; /* RW */ + unsigned long lb_irq_int_11 : 1; /* RW */ + unsigned long lb_irq_int_12 : 1; /* RW */ + unsigned long lb_irq_int_13 : 1; /* RW */ + unsigned long lb_irq_int_14 : 1; /* RW */ + unsigned long lb_irq_int_15 : 1; /* RW */ + unsigned long l1_nmi_int : 1; /* RW */ + unsigned long stop_clock : 1; /* RW */ + unsigned long asic_to_l1 : 1; /* RW */ + unsigned long l1_to_asic : 1; /* RW */ + unsigned long ltc_int : 1; /* RW */ + unsigned long la_seq_trigger : 1; /* RW */ + unsigned long rsvd_45_63 : 19; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_NODE_ID */ +/* ========================================================================= */ +#define UVH_NODE_ID 0x0UL + +#define UVH_NODE_ID_FORCE1_SHFT 0 +#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UVH_NODE_ID_MANUFACTURER_SHFT 1 +#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UVH_NODE_ID_PART_NUMBER_SHFT 12 +#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UVH_NODE_ID_REVISION_SHFT 28 +#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UVH_NODE_ID_NODE_ID_SHFT 32 +#define 
UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48 +#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL +#define UVH_NODE_ID_NI_PORT_SHFT 56 +#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL + +union uvh_node_id_u { + unsigned long v; + struct uvh_node_id_s { + unsigned long force1 : 1; /* RO */ + unsigned long manufacturer : 11; /* RO */ + unsigned long part_number : 16; /* RO */ + unsigned long revision : 4; /* RO */ + unsigned long node_id : 15; /* RW */ + unsigned long rsvd_47 : 1; /* */ + unsigned long nodes_per_bit : 7; /* RW */ + unsigned long rsvd_55 : 1; /* */ + unsigned long ni_port : 4; /* RO */ + unsigned long rsvd_60_63 : 4; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_NODE_PRESENT_TABLE */ +/* ========================================================================= */ +#define UVH_NODE_PRESENT_TABLE 0x1400UL +#define UVH_NODE_PRESENT_TABLE_DEPTH 16 + +#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 +#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL + +union uvh_node_present_table_u { + unsigned long v; + struct uvh_node_present_table_s { + unsigned long nodes : 64; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_0_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_1_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_2_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define 
UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL + +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_cfg_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_cfg_overlay_config_mmr_s { + unsigned long rsvd_0_25: 26; /* */ + unsigned long base : 20; /* RW */ + unsigned long rsvd_46_62: 17; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL + +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_gru_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27: 28; /* */ + unsigned long base : 18; /* RW */ + unsigned long rsvd_46_47: 2; /* */ + unsigned long gr4 : 1; /* RW */ + unsigned long rsvd_49_51: 3; /* */ + unsigned long n_gru : 4; /* RW */ + unsigned long rsvd_56_62: 7; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_mmioh_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_29: 30; /* */ + unsigned long base : 16; /* RW */ + unsigned long m_io : 6; /* RW */ + unsigned long n_io : 4; /* RW */ + unsigned long rsvd_56_62: 7; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL + +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define 
UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_mmr_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25: 26; /* */ + unsigned long base : 20; /* RW */ + unsigned long dual_hub : 1; /* RW */ + unsigned long rsvd_47_62: 16; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC */ +/* ========================================================================= */ +#define UVH_RTC 0x340000UL + +#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 +#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL + +union uvh_rtc_u { + unsigned long v; + struct uvh_rtc_s { + unsigned long real_time_clock : 56; /* RW */ + unsigned long rsvd_56_63 : 8; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC1_INT_CONFIG */ +/* ========================================================================= */ +#define UVH_RTC1_INT_CONFIG 0x615c0UL + +#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC1_INT_CONFIG_P_SHFT 13 +#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC1_INT_CONFIG_T_SHFT 15 +#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC1_INT_CONFIG_M_SHFT 16 +#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_rtc1_int_config_u { + unsigned long v; + struct uvh_rtc1_int_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC2_INT_CONFIG */ +/* ========================================================================= */ +#define UVH_RTC2_INT_CONFIG 0x61600UL + +#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC2_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC2_INT_CONFIG_P_SHFT 13 +#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC2_INT_CONFIG_T_SHFT 15 +#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC2_INT_CONFIG_M_SHFT 16 +#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_rtc2_int_config_u { + unsigned long v; + struct uvh_rtc2_int_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned 
long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC3_INT_CONFIG */ +/* ========================================================================= */ +#define UVH_RTC3_INT_CONFIG 0x61640UL + +#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC3_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC3_INT_CONFIG_P_SHFT 13 +#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC3_INT_CONFIG_T_SHFT 15 +#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC3_INT_CONFIG_M_SHFT 16 +#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_rtc3_int_config_u { + unsigned long v; + struct uvh_rtc3_int_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC_INC_RATIO */ +/* ========================================================================= */ +#define UVH_RTC_INC_RATIO 0x350000UL + +#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0 +#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL +#define UVH_RTC_INC_RATIO_RATIO_SHFT 20 +#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL + +union uvh_rtc_inc_ratio_u { + unsigned long v; + struct uvh_rtc_inc_ratio_s { + unsigned long fraction : 20; /* RW */ + unsigned long ratio : 3; /* RW */ + unsigned long rsvd_23_63: 41; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ADDR_MAP_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL + +#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0 +#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8 +#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL + +union uvh_si_addr_map_config_u { + unsigned long v; + struct uvh_si_addr_map_config_s { + unsigned long m_skt : 6; /* RW */ + unsigned long rsvd_6_7: 2; /* */ + unsigned long n_skt : 4; /* RW */ + unsigned long rsvd_12_63: 52; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ALIAS0_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL + +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias0_overlay_config_u { + unsigned long v; + struct uvh_si_alias0_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ALIAS1_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL + +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias1_overlay_config_u { + unsigned long v; + struct uvh_si_alias1_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ALIAS2_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL + +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias2_overlay_config_u { + unsigned long v; + struct uvh_si_alias2_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + + +#endif /* _ASM_X86_UV_UV_MMRS_H */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h new file mode 100644 index 000000000000..9064052b73de --- /dev/null +++ b/arch/x86/include/asm/vdso.h @@ -0,0 +1,47 @@ +#ifndef _ASM_X86_VDSO_H +#define _ASM_X86_VDSO_H + +#ifdef CONFIG_X86_64 +extern const char VDSO64_PRELINK[]; + +/* + * Given a pointer to the vDSO image, find the pointer to VDSO64_name + * as that symbol is defined in the vDSO sources or linker script. + */ +#define VDSO64_SYMBOL(base, name) \ +({ \ + extern const char VDSO64_##name[]; \ + (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \ +}) +#endif + +#if defined CONFIG_X86_32 || defined CONFIG_COMPAT +extern const char VDSO32_PRELINK[]; + +/* + * Given a pointer to the vDSO image, find the pointer to VDSO32_name + * as that symbol is defined in the vDSO sources or linker script. 
+ */ +#define VDSO32_SYMBOL(base, name) \ +({ \ + extern const char VDSO32_##name[]; \ + (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \ +}) +#endif + +/* + * These symbols are defined with the addresses in the vsyscall page. + * See vsyscall-sigreturn.S. + */ +extern void __user __kernel_sigreturn; +extern void __user __kernel_rt_sigreturn; + +/* + * These symbols are defined by vdso32.S to mark the bounds + * of the ELF DSO images included therein. + */ +extern const char vdso32_int80_start, vdso32_int80_end; +extern const char vdso32_syscall_start, vdso32_syscall_end; +extern const char vdso32_sysenter_start, vdso32_sysenter_end; + +#endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h new file mode 100644 index 000000000000..c4b9dc2f67c5 --- /dev/null +++ b/arch/x86/include/asm/vga.h @@ -0,0 +1,20 @@ +/* + * Access to VGA videoram + * + * (c) 1998 Martin Mares <mj@ucw.cz> + */ + +#ifndef _ASM_X86_VGA_H +#define _ASM_X86_VGA_H + +/* + * On the PC, we can just recalculate addresses and then + * access the videoram directly without any black magic. + */ + +#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) + +#define vga_readb(x) (*(x)) +#define vga_writeb(x, y) (*(y) = (x)) + +#endif /* _ASM_X86_VGA_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h new file mode 100644 index 000000000000..dc27a69e5d2a --- /dev/null +++ b/arch/x86/include/asm/vgtod.h @@ -0,0 +1,29 @@ +#ifndef _ASM_X86_VGTOD_H +#define _ASM_X86_VGTOD_H + +#include <asm/vsyscall.h> +#include <linux/clocksource.h> + +struct vsyscall_gtod_data { + seqlock_t lock; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + + int sysctl_enabled; + struct timezone sys_tz; + struct { /* extract of a clocksource struct */ + cycle_t (*vread)(void); + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; + struct timespec wall_to_monotonic; +}; +extern struct vsyscall_gtod_data __vsyscall_gtod_data +__section_vsyscall_gtod_data; +extern struct vsyscall_gtod_data vsyscall_gtod_data; + +#endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vic.h b/arch/x86/include/asm/vic.h new file mode 100644 index 000000000000..53100f353612 --- /dev/null +++ b/arch/x86/include/asm/vic.h @@ -0,0 +1,61 @@ +/* Copyright (C) 1999,2001 + * + * Author: J.E.J.Bottomley@HansenPartnership.com + * + * Standard include definitions for the NCR Voyager Interrupt Controller */ + +/* The eight CPI vectors. To activate a CPI, you write a bit mask + * corresponding to the processor set to be interrupted into the + * relevant register. 
That set of CPUs will then be interrupted with + * the CPI */ +static const int VIC_CPI_Registers[] = + {0xFC00, 0xFC01, 0xFC08, 0xFC09, + 0xFC10, 0xFC11, 0xFC18, 0xFC19 }; + +#define VIC_PROC_WHO_AM_I 0xfc29 +# define QUAD_IDENTIFIER 0xC0 +# define EIGHT_SLOT_IDENTIFIER 0xE0 +#define QIC_EXTENDED_PROCESSOR_SELECT 0xFC72 +#define VIC_CPI_BASE_REGISTER 0xFC41 +#define VIC_PROCESSOR_ID 0xFC21 +# define VIC_CPU_MASQUERADE_ENABLE 0x8 + +#define VIC_CLAIM_REGISTER_0 0xFC38 +#define VIC_CLAIM_REGISTER_1 0xFC39 +#define VIC_REDIRECT_REGISTER_0 0xFC60 +#define VIC_REDIRECT_REGISTER_1 0xFC61 +#define VIC_PRIORITY_REGISTER 0xFC20 + +#define VIC_PRIMARY_MC_BASE 0xFC48 +#define VIC_SECONDARY_MC_BASE 0xFC49 + +#define QIC_PROCESSOR_ID 0xFC71 +# define QIC_CPUID_ENABLE 0x08 + +#define QIC_VIC_CPI_BASE_REGISTER 0xFC79 +#define QIC_CPI_BASE_REGISTER 0xFC7A + +#define QIC_MASK_REGISTER0 0xFC80 +/* NOTE: these are masked high, enabled low */ +# define QIC_PERF_TIMER 0x01 +# define QIC_LPE 0x02 +# define QIC_SYS_INT 0x04 +# define QIC_CMN_INT 0x08 +/* at the moment, just enable CMN_INT, disable SYS_INT */ +# define QIC_DEFAULT_MASK0 (~(QIC_CMN_INT /* | VIC_SYS_INT */)) +#define QIC_MASK_REGISTER1 0xFC81 +# define QIC_BOOT_CPI_MASK 0xFE +/* Enable CPI's 1-6 inclusive */ +# define QIC_CPI_ENABLE 0x81 + +#define QIC_INTERRUPT_CLEAR0 0xFC8A +#define QIC_INTERRUPT_CLEAR1 0xFC8B + +/* this is where we place the CPI vectors */ +#define VIC_DEFAULT_CPI_BASE 0xC0 +/* this is where we place the QIC CPI vectors */ +#define QIC_DEFAULT_CPI_BASE 0xD0 + +#define VIC_BOOT_INTERRUPT_MASK 0xfe + +extern void smp_vic_timer_interrupt(void); diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h new file mode 100644 index 000000000000..166adf61e770 --- /dev/null +++ b/arch/x86/include/asm/visws/cobalt.h @@ -0,0 +1,125 @@ +#ifndef _ASM_X86_VISWS_COBALT_H +#define _ASM_X86_VISWS_COBALT_H + +#include <asm/fixmap.h> + +/* + * Cobalt SGI Visual Workstation system ASIC + */ + +#define CO_CPU_NUM_PHYS 0x1e00 +#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2) + +#define CO_CPU_MAX 4 + +#define CO_CPU_PHYS 0xc2000000 +#define CO_APIC_PHYS 0xc4000000 + +/* see set_fixmap() and asm/fixmap.h */ +#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU)) +#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC)) + +/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */ +#define CO_CPU_REV 0x08 +#define CO_CPU_CTRL 0x10 +#define CO_CPU_STAT 0x20 +#define CO_CPU_TIMEVAL 0x30 + +/* CO_CPU_CTRL bits */ +#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */ +#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */ + +/* CO_CPU_STATUS bits */ +#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */ + +/* CO_CPU_TIMEVAL value */ +#define CO_TIME_HZ 100000000 /* Cobalt core rate */ + +/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */ +#define CO_APIC_HI(n) (((n) * 0x10) + 4) +#define CO_APIC_LO(n) ((n) * 0x10) +#define CO_APIC_ID 0x0ffc + +/* CO_APIC_ID bits */ +#define CO_APIC_ENABLE 0x00000100 + +/* CO_APIC_LO bits */ +#define CO_APIC_MASK 0x00010000 /* 0 = enabled */ +#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */ + +/* + * Where things are physically wired to Cobalt + * #defines with no board _<type>_<rev>_ are common to all (thus far) + */ +#define CO_APIC_IDE0 4 +#define CO_APIC_IDE1 2 /* Only on 320 */ + +#define CO_APIC_8259 12 /* serial, floppy, par-l-l */ + +/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */ +#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */ +#define 
CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */ + +#define CO_APIC_PIIX4_USB 7 /* this one is weird */ + +/* Lithium PCI Bridge B -- "the one with PIIX4" */ +#define CO_APIC_PCIB_BASE0 8 /* and 9-12 *//* slot 0, line 0 */ +#define CO_APIC_PCIB_BASE123 13 /* 14.15 */ /* slot 0, line 1 */ + +#define CO_APIC_VIDOUT0 16 +#define CO_APIC_VIDOUT1 17 +#define CO_APIC_VIDIN0 18 +#define CO_APIC_VIDIN1 19 + +#define CO_APIC_LI_AUDIO 22 + +#define CO_APIC_AS 24 +#define CO_APIC_RE 25 + +#define CO_APIC_CPU 28 /* Timer and Cache interrupt */ +#define CO_APIC_NMI 29 +#define CO_APIC_LAST CO_APIC_NMI + +/* + * This is how irqs are assigned on the Visual Workstation. + * Legacy devices get irq's 1-15 (system clock is 0 and is CO_APIC_CPU). + * All other devices (including PCI) go to Cobalt and are irq's 16 on up. + */ +#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */ +#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0) +#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */ +#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */ +#define CO_IRQ_IDE0 14 /* knowledge of... */ +#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */ +#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259) + +#ifdef CONFIG_X86_VISWS_APIC +static inline void co_cpu_write(unsigned long reg, unsigned long v) +{ + *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v; +} + +static inline unsigned long co_cpu_read(unsigned long reg) +{ + return *((volatile unsigned long *)(CO_CPU_VADDR+reg)); +} + +static inline void co_apic_write(unsigned long reg, unsigned long v) +{ + *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v; +} + +static inline unsigned long co_apic_read(unsigned long reg) +{ + return *((volatile unsigned long *)(CO_APIC_VADDR+reg)); +} +#endif + +extern char visws_board_type; + +#define VISWS_320 0 +#define VISWS_540 1 + +extern char visws_board_rev; + +#endif /* _ASM_X86_VISWS_COBALT_H */ diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h new file mode 100644 index 000000000000..a10d89bc1270 --- /dev/null +++ b/arch/x86/include/asm/visws/lithium.h @@ -0,0 +1,53 @@ +#ifndef _ASM_X86_VISWS_LITHIUM_H +#define _ASM_X86_VISWS_LITHIUM_H + +#include <asm/fixmap.h> + +/* + * Lithium is the SGI Visual Workstation I/O ASIC + */ + +#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */ +#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */ + +/* see set_fixmap() and asm/fixmap.h */ +#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA)) +#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB)) + +/* Not a standard PCI? (not in linux/pci.h) */ +#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */ +#define LI_PCI_INTEN 0x46 + +/* LI_PCI_INTENT bits */ +#define LI_INTA_0 0x0001 +#define LI_INTA_1 0x0002 +#define LI_INTA_2 0x0004 +#define LI_INTA_3 0x0008 +#define LI_INTA_4 0x0010 +#define LI_INTB 0x0020 +#define LI_INTC 0x0040 +#define LI_INTD 0x0080 + +/* More special purpose macros... 
*/ +static inline void li_pcia_write16(unsigned long reg, unsigned short v) +{ + *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v; +} + +static inline unsigned short li_pcia_read16(unsigned long reg) +{ + return *((volatile unsigned short *)(LI_PCIA_VADDR+reg)); +} + +static inline void li_pcib_write16(unsigned long reg, unsigned short v) +{ + *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v; +} + +static inline unsigned short li_pcib_read16(unsigned long reg) +{ + return *((volatile unsigned short *)(LI_PCIB_VADDR+reg)); +} + +#endif /* _ASM_X86_VISWS_LITHIUM_H */ + diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h new file mode 100644 index 000000000000..d0af4d338e7f --- /dev/null +++ b/arch/x86/include/asm/visws/piix4.h @@ -0,0 +1,107 @@ +#ifndef _ASM_X86_VISWS_PIIX4_H +#define _ASM_X86_VISWS_PIIX4_H + +/* + * PIIX4 as used on SGI Visual Workstations + */ + +#define PIIX_PM_START 0x0F80 + +#define SIO_GPIO_START 0x0FC0 + +#define SIO_PM_START 0x0FC8 + +#define PMBASE PIIX_PM_START +#define GPIREG0 (PMBASE+0x30) +#define GPIREG(x) (GPIREG0+((x)/8)) +#define GPIBIT(x) (1 << ((x)%8)) + +#define PIIX_GPI_BD_ID1 18 +#define PIIX_GPI_BD_ID2 19 +#define PIIX_GPI_BD_ID3 20 +#define PIIX_GPI_BD_ID4 21 +#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1) +#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \ + GPIBIT(PIIX_GPI_BD_ID2) | \ + GPIBIT(PIIX_GPI_BD_ID3) | \ + GPIBIT(PIIX_GPI_BD_ID4) ) + +#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8) + +#define SIO_INDEX 0x2e +#define SIO_DATA 0x2f + +#define SIO_DEV_SEL 0x7 +#define SIO_DEV_ENB 0x30 +#define SIO_DEV_MSB 0x60 +#define SIO_DEV_LSB 0x61 + +#define SIO_GP_DEV 0x7 + +#define SIO_GP_BASE SIO_GPIO_START +#define SIO_GP_MSB (SIO_GP_BASE>>8) +#define SIO_GP_LSB (SIO_GP_BASE&0xff) + +#define SIO_GP_DATA1 (SIO_GP_BASE+0) + +#define SIO_PM_DEV 0x8 + +#define SIO_PM_BASE SIO_PM_START +#define SIO_PM_MSB (SIO_PM_BASE>>8) +#define SIO_PM_LSB (SIO_PM_BASE&0xff) +#define SIO_PM_INDEX (SIO_PM_BASE+0) +#define SIO_PM_DATA (SIO_PM_BASE+1) + +#define SIO_PM_FER2 0x1 + +#define SIO_PM_GP_EN 0x80 + + + +/* + * This is the dev/reg where generating a config cycle will + * result in a PCI special cycle. + */ +#define SPECIAL_DEV 0xff +#define SPECIAL_REG 0x00 + +/* + * PIIX4 needs to see a special cycle with the following data + * to be convinced the processor has gone into the stop grant + * state. PIIX4 insists on seeing this before it will power + * down a system. + */ +#define PIIX_SPECIAL_STOP 0x00120002 + +#define PIIX4_RESET_PORT 0xcf9 +#define PIIX4_RESET_VAL 0x6 + +#define PMSTS_PORT 0xf80 // 2 bytes PM Status +#define PMEN_PORT 0xf82 // 2 bytes PM Enable +#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control + +#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state + +/* + * PMSTS and PMEN I/O bit definitions. + * (Bits are the same in both registers) + */ +#define PM_STS_RSM (1<<15) // Resume Status +#define PM_STS_PWRBTNOR (1<<11) // Power Button Override +#define PM_STS_RTC (1<<10) // RTC status +#define PM_STS_PWRBTN (1<<8) // Power Button Pressed? +#define PM_STS_GBL (1<<5) // Global Status +#define PM_STS_BM (1<<4) // Bus Master Status +#define PM_STS_TMROF (1<<0) // Timer Overflow Status. 
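The PMSTS/PMEN/PMCNTRL trio above follows the usual PIIX4 ACPI-style convention: PMSTS holds event status, PMEN gates which of those events are allowed to fire, and setting PM_SUSPEND_ENABLE in PMCNTRL kicks off the suspend sequence described earlier. A minimal sketch of how a caller might drive these ports — the write-one-to-clear behaviour of the status bits and the helper name are assumptions for illustration, not taken from this header:

	#include <asm/io.h>

	/*
	 * Hypothetical helper: arm the power button as a wake event and
	 * enter suspend. Assumes PMSTS bits are write-1-to-clear, as on
	 * a standard PIIX4.
	 */
	static void visws_pm_suspend_sketch(void)
	{
		unsigned short sts = inw(PMSTS_PORT);

		outw(sts, PMSTS_PORT);		/* clear stale status bits */
		outw(PM_STS_PWRBTN, PMEN_PORT);	/* wake on power button */
		outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); /* start suspend */
	}
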
+ +/* + * Stop clock GPI register + */ +#define PIIX_GPIREG0 (0xf80 + 0x30) + +/* + * Stop clock GPI bit in GPIREG0 + */ +#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in + +#endif /* _ASM_X86_VISWS_PIIX4_H */ diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h new file mode 100644 index 000000000000..5fbf63e1003c --- /dev/null +++ b/arch/x86/include/asm/visws/sgivw.h @@ -0,0 +1,5 @@ +/* + * Frame buffer position and size: + */ +extern unsigned long sgivwfb_mem_phys; +extern unsigned long sgivwfb_mem_size; diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h new file mode 100644 index 000000000000..f9303602fbc0 --- /dev/null +++ b/arch/x86/include/asm/vm86.h @@ -0,0 +1,208 @@ +#ifndef _ASM_X86_VM86_H +#define _ASM_X86_VM86_H + +/* + * I'm guessing at the VIF/VIP flag usage, but hope that this is how + * the Pentium uses them. Linux will return from vm86 mode when both + * VIF and VIP is set. + * + * On a Pentium, we could probably optimize the virtual flags directly + * in the eflags register instead of doing it "by hand" in vflags... + * + * Linus + */ + +#include <asm/processor-flags.h> + +#define BIOSSEG 0x0f000 + +#define CPU_086 0 +#define CPU_186 1 +#define CPU_286 2 +#define CPU_386 3 +#define CPU_486 4 +#define CPU_586 5 + +/* + * Return values for the 'vm86()' system call + */ +#define VM86_TYPE(retval) ((retval) & 0xff) +#define VM86_ARG(retval) ((retval) >> 8) + +#define VM86_SIGNAL 0 /* return due to signal */ +#define VM86_UNKNOWN 1 /* unhandled GP fault + - IO-instruction or similar */ +#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */ +#define VM86_STI 3 /* sti/popf/iret instruction enabled + virtual interrupts */ + +/* + * Additional return values when invoking new vm86() + */ +#define VM86_PICRETURN 4 /* return due to pending PIC request */ +#define VM86_TRAP 6 /* return due to DOS-debugger request */ + +/* + * function codes when invoking new vm86() + */ +#define VM86_PLUS_INSTALL_CHECK 0 +#define VM86_ENTER 1 +#define VM86_ENTER_NO_BYPASS 2 +#define VM86_REQUEST_IRQ 3 +#define VM86_FREE_IRQ 4 +#define VM86_GET_IRQ_BITS 5 +#define VM86_GET_AND_RESET_IRQ 6 + +/* + * This is the stack-layout seen by the user space program when we have + * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout + * is 'kernel_vm86_regs' (see below). + */ + +struct vm86_regs { +/* + * normal regs, with special meaning for the segment descriptors.. 
+ */ + long ebx; + long ecx; + long edx; + long esi; + long edi; + long ebp; + long eax; + long __null_ds; + long __null_es; + long __null_fs; + long __null_gs; + long orig_eax; + long eip; + unsigned short cs, __csh; + long eflags; + long esp; + unsigned short ss, __ssh; +/* + * these are specific to v86 mode: + */ + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; +}; + +struct revectored_struct { + unsigned long __map[8]; /* 256 bits */ +}; + +struct vm86_struct { + struct vm86_regs regs; + unsigned long flags; + unsigned long screen_bitmap; + unsigned long cpu_type; + struct revectored_struct int_revectored; + struct revectored_struct int21_revectored; +}; + +/* + * flags masks + */ +#define VM86_SCREEN_BITMAP 0x0001 + +struct vm86plus_info_struct { + unsigned long force_return_for_pic:1; + unsigned long vm86dbg_active:1; /* for debugger */ + unsigned long vm86dbg_TFpendig:1; /* for debugger */ + unsigned long unused:28; + unsigned long is_vm86pus:1; /* for vm86 internal use */ + unsigned char vm86dbg_intxxtab[32]; /* for debugger */ +}; +struct vm86plus_struct { + struct vm86_regs regs; + unsigned long flags; + unsigned long screen_bitmap; + unsigned long cpu_type; + struct revectored_struct int_revectored; + struct revectored_struct int21_revectored; + struct vm86plus_info_struct vm86plus; +}; + +#ifdef __KERNEL__ + +#include <asm/ptrace.h> + +/* + * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86 + * mode - the main change is that the old segment descriptors aren't + * useful any more and are forced to be zero by the kernel (and the + * hardware when a trap occurs), and the real segment descriptors are + * at the end of the structure. Look at ptrace.h to see the "normal" + * setup. For user space layout see 'struct vm86_regs' above. + */ + +struct kernel_vm86_regs { +/* + * normal regs, with special meaning for the segment descriptors.. + */ + struct pt_regs pt; +/* + * these are specific to v86 mode: + */ + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; +}; + +struct kernel_vm86_struct { + struct kernel_vm86_regs regs; +/* + * the below part remains on the kernel stack while we are in VM86 mode. + * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we + * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above + * 'struct kernel_vm86_regs' with the then actual values. + * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' + * in kernelspace, hence we need not reget the data from userspace. + */ +#define VM86_TSS_ESP0 flags + unsigned long flags; + unsigned long screen_bitmap; + unsigned long cpu_type; + struct revectored_struct int_revectored; + struct revectored_struct int21_revectored; + struct vm86plus_info_struct vm86plus; + struct pt_regs *regs32; /* here we save the pointer to the old regs */ +/* + * The below is not part of the structure, but the stack layout continues + * this way. In front of 'return-eip' may be some data, depending on + * compilation, so we don't rely on this and save the pointer to 'oldregs' + * in 'regs32' above. 
+ * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: + + long return-eip; from call to vm86() + struct pt_regs oldregs; user space registers as saved by syscall + */ +}; + +#ifdef CONFIG_VM86 + +void handle_vm86_fault(struct kernel_vm86_regs *, long); +int handle_vm86_trap(struct kernel_vm86_regs *, long, int); +struct pt_regs *save_v86_state(struct kernel_vm86_regs *); + +struct task_struct; +void release_vm86_irqs(struct task_struct *); + +#else + +#define handle_vm86_fault(a, b) +#define release_vm86_irqs(a) + +static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) +{ + return 0; +} + +#endif /* CONFIG_VM86 */ + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_VM86_H */ diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h new file mode 100644 index 000000000000..b7c0dea119fe --- /dev/null +++ b/arch/x86/include/asm/vmi.h @@ -0,0 +1,263 @@ +/* + * VMI interface definition + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Maintained by: Zachary Amsden zach@vmware.com + * + */ +#include <linux/types.h> + +/* + *--------------------------------------------------------------------- + * + * VMI Option ROM API + * + *--------------------------------------------------------------------- + */ +#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */ + +#define PCI_VENDOR_ID_VMWARE 0x15AD +#define PCI_DEVICE_ID_VMWARE_VMI 0x0801 + +/* + * We use two version numbers for compatibility, with the major + * number signifying interface breakages, and the minor number + * interface extensions. 
+ */ +#define VMI_API_REV_MAJOR 3 +#define VMI_API_REV_MINOR 0 + +#define VMI_CALL_CPUID 0 +#define VMI_CALL_WRMSR 1 +#define VMI_CALL_RDMSR 2 +#define VMI_CALL_SetGDT 3 +#define VMI_CALL_SetLDT 4 +#define VMI_CALL_SetIDT 5 +#define VMI_CALL_SetTR 6 +#define VMI_CALL_GetGDT 7 +#define VMI_CALL_GetLDT 8 +#define VMI_CALL_GetIDT 9 +#define VMI_CALL_GetTR 10 +#define VMI_CALL_WriteGDTEntry 11 +#define VMI_CALL_WriteLDTEntry 12 +#define VMI_CALL_WriteIDTEntry 13 +#define VMI_CALL_UpdateKernelStack 14 +#define VMI_CALL_SetCR0 15 +#define VMI_CALL_SetCR2 16 +#define VMI_CALL_SetCR3 17 +#define VMI_CALL_SetCR4 18 +#define VMI_CALL_GetCR0 19 +#define VMI_CALL_GetCR2 20 +#define VMI_CALL_GetCR3 21 +#define VMI_CALL_GetCR4 22 +#define VMI_CALL_WBINVD 23 +#define VMI_CALL_SetDR 24 +#define VMI_CALL_GetDR 25 +#define VMI_CALL_RDPMC 26 +#define VMI_CALL_RDTSC 27 +#define VMI_CALL_CLTS 28 +#define VMI_CALL_EnableInterrupts 29 +#define VMI_CALL_DisableInterrupts 30 +#define VMI_CALL_GetInterruptMask 31 +#define VMI_CALL_SetInterruptMask 32 +#define VMI_CALL_IRET 33 +#define VMI_CALL_SYSEXIT 34 +#define VMI_CALL_Halt 35 +#define VMI_CALL_Reboot 36 +#define VMI_CALL_Shutdown 37 +#define VMI_CALL_SetPxE 38 +#define VMI_CALL_SetPxELong 39 +#define VMI_CALL_UpdatePxE 40 +#define VMI_CALL_UpdatePxELong 41 +#define VMI_CALL_MachineToPhysical 42 +#define VMI_CALL_PhysicalToMachine 43 +#define VMI_CALL_AllocatePage 44 +#define VMI_CALL_ReleasePage 45 +#define VMI_CALL_InvalPage 46 +#define VMI_CALL_FlushTLB 47 +#define VMI_CALL_SetLinearMapping 48 + +#define VMI_CALL_SetIOPLMask 61 +#define VMI_CALL_SetInitialAPState 62 +#define VMI_CALL_APICWrite 63 +#define VMI_CALL_APICRead 64 +#define VMI_CALL_IODelay 65 +#define VMI_CALL_SetLazyMode 73 + +/* + *--------------------------------------------------------------------- + * + * MMU operation flags + * + *--------------------------------------------------------------------- + */ + +/* Flags used by VMI_{Allocate|Release}Page call */ +#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */ +#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */ +#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */ + + +/* Flags shared by Allocate|Release Page and PTE updates */ +#define VMI_PAGE_PT 0x01 +#define VMI_PAGE_PD 0x02 +#define VMI_PAGE_PDP 0x04 +#define VMI_PAGE_PML4 0x08 + +#define VMI_PAGE_NORMAL 0x00 /* for debugging */ + +/* Flags used by PTE updates */ +#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */ +#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */ +#define VMI_PAGE_VA_MASK 0xfffff000 + +#ifdef CONFIG_X86_PAE +#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED) +#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED) +#else +#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED) +#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED) +#endif + +/* Flags used by VMI_FlushTLB call */ +#define VMI_FLUSH_TLB 0x01 +#define VMI_FLUSH_GLOBAL 0x02 + +/* + *--------------------------------------------------------------------- + * + * VMI relocation definitions for ROM call get_reloc + * + *--------------------------------------------------------------------- + */ + +/* VMI Relocation types */ +#define VMI_RELOCATION_NONE 0 +#define VMI_RELOCATION_CALL_REL 1 +#define VMI_RELOCATION_JUMP_REL 2 +#define VMI_RELOCATION_NOP 3 + +#ifndef __ASSEMBLY__ +struct vmi_relocation_info { + unsigned char *eip; + unsigned char type; + unsigned char reserved[3]; +}; +#endif + + +/* + 
*--------------------------------------------------------------------- + * + * Generic ROM structures and definitions + * + *--------------------------------------------------------------------- + */ + +#ifndef __ASSEMBLY__ + +struct vrom_header { + u16 rom_signature; /* option ROM signature */ + u8 rom_length; /* ROM length in 512 byte chunks */ + u8 rom_entry[4]; /* 16-bit code entry point */ + u8 rom_pad0; /* 4-byte align pad */ + u32 vrom_signature; /* VROM identification signature */ + u8 api_version_min;/* Minor version of API */ + u8 api_version_maj;/* Major version of API */ + u8 jump_slots; /* Number of jump slots */ + u8 reserved1; /* Reserved for expansion */ + u32 virtual_top; /* Hypervisor virtual address start */ + u16 reserved2; /* Reserved for expansion */ + u16 license_offs; /* Offset to License string */ + u16 pci_header_offs;/* Offset to PCI OPROM header */ + u16 pnp_header_offs;/* Offset to PnP OPROM header */ + u32 rom_pad3; /* PnP reserverd / VMI reserved */ + u8 reserved[96]; /* Reserved for headers */ + char vmi_init[8]; /* VMI_Init jump point */ + char get_reloc[8]; /* VMI_GetRelocationInfo jump point */ +} __attribute__((packed)); + +struct pnp_header { + char sig[4]; + char rev; + char size; + short next; + short res; + long devID; + unsigned short manufacturer_offset; + unsigned short product_offset; +} __attribute__((packed)); + +struct pci_header { + char sig[4]; + short vendorID; + short deviceID; + short vpdData; + short size; + char rev; + char class; + char subclass; + char interface; + short chunks; + char rom_version_min; + char rom_version_maj; + char codetype; + char lastRom; + short reserved; +} __attribute__((packed)); + +/* Function prototypes for bootstrapping */ +extern void vmi_init(void); +extern void vmi_bringup(void); +extern void vmi_apply_boot_page_allocations(void); + +/* State needed to start an application processor in an SMP system. */ +struct vmi_ap_state { + u32 cr0; + u32 cr2; + u32 cr3; + u32 cr4; + + u64 efer; + + u32 eip; + u32 eflags; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esp; + u32 ebp; + u32 esi; + u32 edi; + u16 cs; + u16 ss; + u16 ds; + u16 es; + u16 fs; + u16 gs; + u16 ldtr; + + u16 gdtr_limit; + u32 gdtr_base; + u32 idtr_base; + u16 idtr_limit; +}; + +#endif diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h new file mode 100644 index 000000000000..c6e0bee93e3c --- /dev/null +++ b/arch/x86/include/asm/vmi_time.h @@ -0,0 +1,98 @@ +/* + * VMI Time wrappers + * + * Copyright (C) 2006, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
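Detecting the option ROM reduces to sanity-checking a candidate vrom_header against the signature and major API revision defined above. A hedged sketch (0xaa55 is the standard PC option-ROM signature, an assumption here rather than something this header defines; the function name is ours):

	/* Does this look like a VMI option ROM we can talk to? */
	static int vmi_rom_matches(const struct vrom_header *rom)
	{
		return rom->rom_signature == 0xaa55 &&
		       rom->vrom_signature == VMI_SIGNATURE &&
		       rom->api_version_maj == VMI_API_REV_MAJOR;
	}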
+ * + * Send feedback to dhecht@vmware.com + * + */ + +#ifndef _ASM_X86_VMI_TIME_H +#define _ASM_X86_VMI_TIME_H + +/* + * Raw VMI call indices for timer functions + */ +#define VMI_CALL_GetCycleFrequency 66 +#define VMI_CALL_GetCycleCounter 67 +#define VMI_CALL_SetAlarm 68 +#define VMI_CALL_CancelAlarm 69 +#define VMI_CALL_GetWallclockTime 70 +#define VMI_CALL_WallclockUpdated 71 + +/* Cached VMI timer operations */ +extern struct vmi_timer_ops { + u64 (*get_cycle_frequency)(void); + u64 (*get_cycle_counter)(int); + u64 (*get_wallclock)(void); + int (*wallclock_updated)(void); + void (*set_alarm)(u32 flags, u64 expiry, u64 period); + void (*cancel_alarm)(u32 flags); +} vmi_timer_ops; + +/* Prototypes */ +extern void __init vmi_time_init(void); +extern unsigned long vmi_get_wallclock(void); +extern int vmi_set_wallclock(unsigned long now); +extern unsigned long long vmi_sched_clock(void); +extern unsigned long vmi_tsc_khz(void); + +#ifdef CONFIG_X86_LOCAL_APIC +extern void __devinit vmi_time_bsp_init(void); +extern void __devinit vmi_time_ap_init(void); +#endif + +/* + * When run under a hypervisor, a vcpu is always in one of three states: + * running, halted, or ready. The vcpu is in the 'running' state if it + * is executing. When the vcpu executes the halt interface, the vcpu + * enters the 'halted' state and remains halted until there is some work + * pending for the vcpu (e.g. an alarm expires, host I/O completes on + * behalf of virtual I/O). At this point, the vcpu enters the 'ready' + * state (waiting for the hypervisor to reschedule it). Finally, at any + * time when the vcpu is in neither the 'running' state nor the 'halted' + * state, it is in the 'ready' state. + * + * Real time advances while the vcpu is 'running', 'ready', or + * 'halted'. Stolen time is the time in which the vcpu is in the + * 'ready' state. Available time is the remaining time -- the vcpu is + * either 'running' or 'halted'. + * + * All three views of time are accessible through the VMI cycle + * counters. + */ + +/* The cycle counters. */ +#define VMI_CYCLES_REAL 0 +#define VMI_CYCLES_AVAILABLE 1 +#define VMI_CYCLES_STOLEN 2 + +/* The alarm interface 'flags' bits */ +#define VMI_ALARM_COUNTERS 2 + +#define VMI_ALARM_COUNTER_MASK 0x000000ff + +#define VMI_ALARM_WIRED_IRQ0 0x00000000 +#define VMI_ALARM_WIRED_LVTT 0x00010000 + +#define VMI_ALARM_IS_ONESHOT 0x00000000 +#define VMI_ALARM_IS_PERIODIC 0x00000100 + +#define CONFIG_VMI_ALARM_HZ 100 + +#endif /* _ASM_X86_VMI_TIME_H */ diff --git a/arch/x86/include/asm/voyager.h b/arch/x86/include/asm/voyager.h new file mode 100644 index 000000000000..9c811d2e6f91 --- /dev/null +++ b/arch/x86/include/asm/voyager.h @@ -0,0 +1,528 @@ +/* Copyright (C) 1999,2001 + * + * Author: J.E.J.Bottomley@HansenPartnership.com + * + * Standard include definitions for the NCR Voyager system */ + +#undef VOYAGER_DEBUG +#undef VOYAGER_CAT_DEBUG + +#ifdef VOYAGER_DEBUG +#define VDEBUG(x) printk x +#else +#define VDEBUG(x) +#endif + +/* There are three levels of voyager machine: 3, 4 and 5. The rule is + * if it's less than 3435 it's a Level 3, except for a 3360, which is + * a Level 4.
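Since real time accumulates in all three states while available time excludes 'ready', real == available + stolen at any sampling instant. A short sketch of reading the three views through the cached operations above (kernel context assumed; the helper name is illustrative):

	/* Sample the three per-vcpu views of time, all in cycles. */
	static void vmi_sample_time(u64 *real, u64 *avail, u64 *stolen)
	{
		*real   = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
		*avail  = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
		*stolen = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
	}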
A 3435 or above is a Level 5 */ +#define VOYAGER_LEVEL5_AND_ABOVE 0x3435 +#define VOYAGER_LEVEL4 0x3360 + +/* The L4 DINO ASIC */ +#define VOYAGER_DINO 0x43 + +/* voyager ports in standard I/O space */ +#define VOYAGER_MC_SETUP 0x96 + + +#define VOYAGER_CAT_CONFIG_PORT 0x97 +# define VOYAGER_CAT_DESELECT 0xff +#define VOYAGER_SSPB_RELOCATION_PORT 0x98 + +/* Valid CAT controller commands */ +/* start instruction register cycle */ +#define VOYAGER_CAT_IRCYC 0x01 +/* start data register cycle */ +#define VOYAGER_CAT_DRCYC 0x02 +/* move to execute state */ +#define VOYAGER_CAT_RUN 0x0F +/* end operation */ +#define VOYAGER_CAT_END 0x80 +/* hold in idle state */ +#define VOYAGER_CAT_HOLD 0x90 +/* single step an "intest" vector */ +#define VOYAGER_CAT_STEP 0xE0 +/* return cat controller to CLEMSON mode */ +#define VOYAGER_CAT_CLEMSON 0xFF + +/* the default cat command header */ +#define VOYAGER_CAT_HEADER 0x7F + +/* the range of possible CAT module ids in the system */ +#define VOYAGER_MIN_MODULE 0x10 +#define VOYAGER_MAX_MODULE 0x1f + +/* The voyager registers per asic */ +#define VOYAGER_ASIC_ID_REG 0x00 +#define VOYAGER_ASIC_TYPE_REG 0x01 +/* the sub address registers can be made auto incrementing on reads */ +#define VOYAGER_AUTO_INC_REG 0x02 +# define VOYAGER_AUTO_INC 0x04 +# define VOYAGER_NO_AUTO_INC 0xfb +#define VOYAGER_SUBADDRDATA 0x03 +#define VOYAGER_SCANPATH 0x05 +# define VOYAGER_CONNECT_ASIC 0x01 +# define VOYAGER_DISCONNECT_ASIC 0xfe +#define VOYAGER_SUBADDRLO 0x06 +#define VOYAGER_SUBADDRHI 0x07 +#define VOYAGER_SUBMODSELECT 0x08 +#define VOYAGER_SUBMODPRESENT 0x09 + +#define VOYAGER_SUBADDR_LO 0xff +#define VOYAGER_SUBADDR_HI 0xffff + +/* the maximum size of a scan path -- used to form instructions */ +#define VOYAGER_MAX_SCAN_PATH 0x100 +/* the biggest possible register size (in bytes) */ +#define VOYAGER_MAX_REG_SIZE 4 + +/* Total number of possible modules (including submodules) */ +#define VOYAGER_MAX_MODULES 16 +/* Largest number of asics per module */ +#define VOYAGER_MAX_ASICS_PER_MODULE 7 + +/* the CAT asic of each module is always the first one */ +#define VOYAGER_CAT_ID 0 +#define VOYAGER_PSI 0x1a + +/* voyager instruction operations and registers */ +#define VOYAGER_READ_CONFIG 0x1 +#define VOYAGER_WRITE_CONFIG 0x2 +#define VOYAGER_BYPASS 0xff + +typedef struct voyager_asic { + __u8 asic_addr; /* ASIC address; Level 4 */ + __u8 asic_type; /* ASIC type */ + __u8 asic_id; /* ASIC id */ + __u8 jtag_id[4]; /* JTAG id */ + __u8 asic_location; /* Location within scan path; start w/ 0 */ + __u8 bit_location; /* Location within bit stream; start w/ 0 */ + __u8 ireg_length; /* Instruction register length */ + __u16 subaddr; /* Amount of sub address space */ + struct voyager_asic *next; /* Next asic in linked list */ +} voyager_asic_t; + +typedef struct voyager_module { + __u8 module_addr; /* Module address */ + __u8 scan_path_connected; /* Scan path connected */ + __u16 ee_size; /* Size of the EEPROM */ + __u16 num_asics; /* Number of Asics */ + __u16 inst_bits; /* Instruction bits in the scan path */ + __u16 largest_reg; /* Largest register in the scan path */ + __u16 smallest_reg; /* Smallest register in the scan path */ + voyager_asic_t *asic; /* First ASIC in scan path (CAT_I) */ + struct voyager_module *submodule; /* Submodule pointer */ + struct voyager_module *next; /* Next module in linked list */ +} voyager_module_t; + +typedef struct voyager_eeprom_hdr { + __u8 module_id[4]; + __u8 version_id; + __u8 config_id; + __u16 boundry_id; /* boundary scan id */ + __u16 
ee_size; /* size of EEPROM */ + __u8 assembly[11]; /* assembly # */ + __u8 assembly_rev; /* assembly rev */ + __u8 tracer[4]; /* tracer number */ + __u16 assembly_cksum; /* asm checksum */ + __u16 power_consump; /* pwr requirements */ + __u16 num_asics; /* number of asics */ + __u16 bist_time; /* min. bist time */ + __u16 err_log_offset; /* error log offset */ + __u16 scan_path_offset;/* scan path offset */ + __u16 cct_offset; + __u16 log_length; /* length of err log */ + __u16 xsum_end; /* offset to end of + checksum */ + __u8 reserved[4]; + __u8 sflag; /* starting sentinel */ + __u8 part_number[13]; /* prom part number */ + __u8 version[10]; /* version number */ + __u8 signature[8]; + __u16 eeprom_chksum; + __u32 data_stamp_offset; + __u8 eflag; /* ending sentinel */ +} __attribute__((packed)) voyager_eprom_hdr_t; + + + +#define VOYAGER_EPROM_SIZE_OFFSET \ + ((__u16)(&(((voyager_eprom_hdr_t *)0)->ee_size))) +#define VOYAGER_XSUM_END_OFFSET 0x2a + +/* the following three definitions are for internal table layouts + * in the module EPROMs. We really only care about the IDs and + * offsets */ +typedef struct voyager_sp_table { + __u8 asic_id; + __u8 bypass_flag; + __u16 asic_data_offset; + __u16 config_data_offset; +} __attribute__((packed)) voyager_sp_table_t; + +typedef struct voyager_jtag_table { + __u8 icode[4]; + __u8 runbist[4]; + __u8 intest[4]; + __u8 samp_preld[4]; + __u8 ireg_len; +} __attribute__((packed)) voyager_jtt_t; + +typedef struct voyager_asic_data_table { + __u8 jtag_id[4]; + __u16 length_bsr; + __u16 length_bist_reg; + __u32 bist_clk; + __u16 subaddr_bits; + __u16 seed_bits; + __u16 sig_bits; + __u16 jtag_offset; +} __attribute__((packed)) voyager_at_t; + +/* Voyager Interrupt Controller (VIC) registers */ + +/* Base to add to Cross Processor Interrupts (CPIs) when triggering + * the CPU IRQ line */ +/* register defines for the WCBICs (one per processor) */ +#define VOYAGER_WCBIC0 0x41 /* bus A node P1 processor 0 */ +#define VOYAGER_WCBIC1 0x49 /* bus A node P1 processor 1 */ +#define VOYAGER_WCBIC2 0x51 /* bus A node P2 processor 0 */ +#define VOYAGER_WCBIC3 0x59 /* bus A node P2 processor 1 */ +#define VOYAGER_WCBIC4 0x61 /* bus B node P1 processor 0 */ +#define VOYAGER_WCBIC5 0x69 /* bus B node P1 processor 1 */ +#define VOYAGER_WCBIC6 0x71 /* bus B node P2 processor 0 */ +#define VOYAGER_WCBIC7 0x79 /* bus B node P2 processor 1 */ + + +/* top of memory registers */ +#define VOYAGER_WCBIC_TOM_L 0x4 +#define VOYAGER_WCBIC_TOM_H 0x5 + +/* register defines for Voyager Memory Control (VMC) + * these are present on L4 machines only */ +#define VOYAGER_VMC1 0x81 +#define VOYAGER_VMC2 0x91 +#define VOYAGER_VMC3 0xa1 +#define VOYAGER_VMC4 0xb1 + +/* VMC Ports */ +#define VOYAGER_VMC_MEMORY_SETUP 0x9 +# define VMC_Interleaving 0x01 +# define VMC_4Way 0x02 +# define VMC_EvenCacheLines 0x04 +# define VMC_HighLine 0x08 +# define VMC_Start0_Enable 0x20 +# define VMC_Start1_Enable 0x40 +# define VMC_Vremap 0x80 +#define VOYAGER_VMC_BANK_DENSITY 0xa +# define VMC_BANK_EMPTY 0 +# define VMC_BANK_4MB 1 +# define VMC_BANK_16MB 2 +# define VMC_BANK_64MB 3 +# define VMC_BANK0_MASK 0x03 +# define VMC_BANK1_MASK 0x0C +# define VMC_BANK2_MASK 0x30 +# define VMC_BANK3_MASK 0xC0 + +/* Magellan Memory Controller (MMC) defines - present on L5 */ +#define VOYAGER_MMC_ASIC_ID 1 +/* the two memory modules corresponding to memory cards in the system */ +#define VOYAGER_MMC_MEMORY0_MODULE 0x14 +#define VOYAGER_MMC_MEMORY1_MODULE 0x15 +/* the Magellan Memory Address (MMA) defines */ +#define 
VOYAGER_MMA_ASIC_ID 2 + +/* Submodule number for the Quad Baseboard */ +#define VOYAGER_QUAD_BASEBOARD 1 + +/* ASIC defines for the Quad Baseboard */ +#define VOYAGER_QUAD_QDATA0 1 +#define VOYAGER_QUAD_QDATA1 2 +#define VOYAGER_QUAD_QABC 3 + +/* Useful areas in extended CMOS */ +#define VOYAGER_PROCESSOR_PRESENT_MASK 0x88a +#define VOYAGER_MEMORY_CLICKMAP 0xa23 +#define VOYAGER_DUMP_LOCATION 0xb1a + +/* SUS In Control bit - used to tell SUS that we don't need to be + * babysat anymore */ +#define VOYAGER_SUS_IN_CONTROL_PORT 0x3ff +# define VOYAGER_IN_CONTROL_FLAG 0x80 + +/* Voyager PSI defines */ +#define VOYAGER_PSI_STATUS_REG 0x08 +# define PSI_DC_FAIL 0x01 +# define PSI_MON 0x02 +# define PSI_FAULT 0x04 +# define PSI_ALARM 0x08 +# define PSI_CURRENT 0x10 +# define PSI_DVM 0x20 +# define PSI_PSCFAULT 0x40 +# define PSI_STAT_CHG 0x80 + +#define VOYAGER_PSI_SUPPLY_REG 0x8000 + /* read */ +# define PSI_FAIL_DC 0x01 +# define PSI_FAIL_AC 0x02 +# define PSI_MON_INT 0x04 +# define PSI_SWITCH_OFF 0x08 +# define PSI_HX_OFF 0x10 +# define PSI_SECURITY 0x20 +# define PSI_CMOS_BATT_LOW 0x40 +# define PSI_CMOS_BATT_FAIL 0x80 + /* write */ +# define PSI_CLR_SWITCH_OFF 0x13 +# define PSI_CLR_HX_OFF 0x14 +# define PSI_CLR_CMOS_BATT_FAIL 0x17 + +#define VOYAGER_PSI_MASK 0x8001 +# define PSI_MASK_MASK 0x10 + +#define VOYAGER_PSI_AC_FAIL_REG 0x8004 +#define AC_FAIL_STAT_CHANGE 0x80 + +#define VOYAGER_PSI_GENERAL_REG 0x8007 + /* read */ +# define PSI_SWITCH_ON 0x01 +# define PSI_SWITCH_ENABLED 0x02 +# define PSI_ALARM_ENABLED 0x08 +# define PSI_SECURE_ENABLED 0x10 +# define PSI_COLD_RESET 0x20 +# define PSI_COLD_START 0x80 + /* write */ +# define PSI_POWER_DOWN 0x10 +# define PSI_SWITCH_DISABLE 0x01 +# define PSI_SWITCH_ENABLE 0x11 +# define PSI_CLEAR 0x12 +# define PSI_ALARM_DISABLE 0x03 +# define PSI_ALARM_ENABLE 0x13 +# define PSI_CLEAR_COLD_RESET 0x05 +# define PSI_SET_COLD_RESET 0x15 +# define PSI_CLEAR_COLD_START 0x07 +# define PSI_SET_COLD_START 0x17 + + + +struct voyager_bios_info { + __u8 len; + __u8 major; + __u8 minor; + __u8 debug; + __u8 num_classes; + __u8 class_1; + __u8 class_2; +}; + +/* The following structures and definitions are for the Kernel/SUS + * interface these are needed to find out how SUS initialised any Quad + * boards in the system */ + +#define NUMBER_OF_MC_BUSSES 2 +#define SLOTS_PER_MC_BUS 8 +#define MAX_CPUS 16 /* 16 way CPU system */ +#define MAX_PROCESSOR_BOARDS 4 /* 4 processor slot system */ +#define MAX_CACHE_LEVELS 4 /* # of cache levels supported */ +#define MAX_SHARED_CPUS 4 /* # of CPUs that can share a LARC */ +#define NUMBER_OF_POS_REGS 8 + +typedef struct { + __u8 MC_Slot; + __u8 POS_Values[NUMBER_OF_POS_REGS]; +} __attribute__((packed)) MC_SlotInformation_t; + +struct QuadDescription { + __u8 Type; /* for type 0 (DYADIC or MONADIC) all fields + * will be zero except for slot */ + __u8 StructureVersion; + __u32 CPI_BaseAddress; + __u32 LARC_BankSize; + __u32 LocalMemoryStateBits; + __u8 Slot; /* Processor slots 1 - 4 */ +} __attribute__((packed)); + +struct ProcBoardInfo { + __u8 Type; + __u8 StructureVersion; + __u8 NumberOfBoards; + struct QuadDescription QuadData[MAX_PROCESSOR_BOARDS]; +} __attribute__((packed)); + +struct CacheDescription { + __u8 Level; + __u32 TotalSize; + __u16 LineSize; + __u8 Associativity; + __u8 CacheType; + __u8 WriteType; + __u8 Number_CPUs_SharedBy; + __u8 Shared_CPUs_Hardware_IDs[MAX_SHARED_CPUS]; + +} __attribute__((packed)); + +struct CPU_Description { + __u8 CPU_HardwareId; + char *FRU_String; + __u8 NumberOfCacheLevels; + 
struct CacheDescription CacheLevelData[MAX_CACHE_LEVELS]; +} __attribute__((packed)); + +struct CPU_Info { + __u8 Type; + __u8 StructureVersion; + __u8 NumberOf_CPUs; + struct CPU_Description CPU_Data[MAX_CPUS]; +} __attribute__((packed)); + + +/* + * This structure will be used by SUS and the OS. + * The assumption about this structure is that no blank space is + * packed in it by our friend the compiler. + */ +typedef struct { + __u8 Mailbox_SUS; /* Written to by SUS to give + commands/response to the OS */ + __u8 Mailbox_OS; /* Written to by the OS to give + commands/response to SUS */ + __u8 SUS_MailboxVersion; /* Tells the OS which iteration of the + interface SUS supports */ + __u8 OS_MailboxVersion; /* Tells SUS which iteration of the + interface the OS supports */ + __u32 OS_Flags; /* Flags set by the OS as info for + SUS */ + __u32 SUS_Flags; /* Flags set by SUS as info + for the OS */ + __u32 WatchDogPeriod; /* Watchdog period (in seconds) which + the DP uses to see if the OS + is dead */ + __u32 WatchDogCount; /* Updated by the OS on every tic. */ + __u32 MemoryFor_SUS_ErrorLog; /* Flat 32 bit address which tells SUS + where to stuff the SUS error log + on a dump */ + MC_SlotInformation_t MC_SlotInfo[NUMBER_OF_MC_BUSSES*SLOTS_PER_MC_BUS]; + /* Storage for MCA POS data */ + /* All new SECOND_PASS_INTERFACE fields added from this point */ + struct ProcBoardInfo *BoardData; + struct CPU_Info *CPU_Data; + /* All new fields must be added from this point */ +} Voyager_KernelSUS_Mbox_t; + +/* structure for finding the right memory address to send a QIC CPI to */ +struct voyager_qic_cpi { + /* Each cache line (32 bytes) can trigger a cpi. The cpi + * read/write may occur anywhere in the cache line---pick the + * middle to be safe */ + struct { + __u32 pad1[3]; + __u32 cpi; + __u32 pad2[4]; + } qic_cpi[8]; +}; + +struct voyager_status { + __u32 power_fail:1; + __u32 switch_off:1; + __u32 request_from_kernel:1; +}; + +struct voyager_psi_regs { + __u8 cat_id; + __u8 cat_dev; + __u8 cat_control; + __u8 subaddr; + __u8 dummy4; + __u8 checkbit; + __u8 subaddr_low; + __u8 subaddr_high; + __u8 intstatus; + __u8 stat1; + __u8 stat3; + __u8 fault; + __u8 tms; + __u8 gen; + __u8 sysconf; + __u8 dummy15; +}; + +struct voyager_psi_subregs { + __u8 supply; + __u8 mask; + __u8 present; + __u8 DCfail; + __u8 ACfail; + __u8 fail; + __u8 UPSfail; + __u8 genstatus; +}; + +struct voyager_psi { + struct voyager_psi_regs regs; + struct voyager_psi_subregs subregs; +}; + +struct voyager_SUS { +#define VOYAGER_DUMP_BUTTON_NMI 0x1 +#define VOYAGER_SUS_VALID 0x2 +#define VOYAGER_SYSINT_COMPLETE 0x3 + __u8 SUS_mbox; +#define VOYAGER_NO_COMMAND 0x0 +#define VOYAGER_IGNORE_DUMP 0x1 +#define VOYAGER_DO_DUMP 0x2 +#define VOYAGER_SYSINT_HANDSHAKE 0x3 +#define VOYAGER_DO_MEM_DUMP 0x4 +#define VOYAGER_SYSINT_WAS_RECOVERED 0x5 + __u8 kernel_mbox; +#define VOYAGER_MAILBOX_VERSION 0x10 + __u8 SUS_version; + __u8 kernel_version; +#define VOYAGER_OS_HAS_SYSINT 0x1 +#define VOYAGER_OS_IN_PROGRESS 0x2 +#define VOYAGER_UPDATING_WDPERIOD 0x4 + __u32 kernel_flags; +#define VOYAGER_SUS_BOOTING 0x1 +#define VOYAGER_SUS_IN_PROGRESS 0x2 + __u32 SUS_flags; + __u32 watchdog_period; + __u32 watchdog_count; + __u32 SUS_errorlog; + /* lots of system configuration stuff under here */ +}; + +/* Variables exported by voyager_smp */ +extern __u32 voyager_extended_vic_processors; +extern __u32 voyager_allowed_boot_processors; +extern __u32 voyager_quad_processors; +extern struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS]; +extern struct 
voyager_SUS *voyager_SUS; + +/* variables exported always */ +extern struct task_struct *voyager_thread; +extern int voyager_level; +extern struct voyager_status voyager_status; + +/* functions exported by the voyager and voyager_smp modules */ +extern int voyager_cat_readb(__u8 module, __u8 asic, int reg); +extern void voyager_cat_init(void); +extern void voyager_detect(struct voyager_bios_info *); +extern void voyager_trap_init(void); +extern void voyager_setup_irqs(void); +extern int voyager_memory_detect(int region, __u32 *addr, __u32 *length); +extern void voyager_smp_intr_init(void); +extern __u8 voyager_extended_cmos_read(__u16 cmos_address); +extern void voyager_smp_dump(void); +extern void voyager_timer_interrupt(void); +extern void smp_local_timer_interrupt(void); +extern void voyager_power_off(void); +extern void smp_voyager_power_off(void *dummy); +extern void voyager_restart(void); +extern void voyager_cat_power_off(void); +extern void voyager_cat_do_common_interrupt(void); +extern void voyager_handle_nmi(void); +/* Commands for the following are */ +#define VOYAGER_PSI_READ 0 +#define VOYAGER_PSI_WRITE 1 +#define VOYAGER_PSI_SUBREAD 2 +#define VOYAGER_PSI_SUBWRITE 3 +extern void voyager_cat_psi(__u8, __u16, __u8 *); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h new file mode 100644 index 000000000000..d0983d255fbd --- /dev/null +++ b/arch/x86/include/asm/vsyscall.h @@ -0,0 +1,44 @@ +#ifndef _ASM_X86_VSYSCALL_H +#define _ASM_X86_VSYSCALL_H + +enum vsyscall_num { + __NR_vgettimeofday, + __NR_vtime, + __NR_vgetcpu, +}; + +#define VSYSCALL_START (-10UL << 20) +#define VSYSCALL_SIZE 1024 +#define VSYSCALL_END (-2UL << 20) +#define VSYSCALL_MAPPED_PAGES 1 +#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr)) + +#ifdef __KERNEL__ +#include <linux/seqlock.h> + +#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) +#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) + +/* Definitions for CONFIG_GENERIC_TIME definitions */ +#define __section_vsyscall_gtod_data __attribute__ \ + ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) +#define __section_vsyscall_clock __attribute__ \ + ((unused, __section__ (".vsyscall_clock"),aligned(16))) +#define __vsyscall_fn \ + __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace + +#define VGETCPU_RDTSCP 1 +#define VGETCPU_LSL 2 + +extern int __vgetcpu_mode; +extern volatile unsigned long __jiffies; + +/* kernel space (writeable) */ +extern int vgetcpu_mode; +extern struct timezone sys_tz; + +extern void map_vsyscall(void); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h new file mode 100644 index 000000000000..f2cba4e79a23 --- /dev/null +++ b/arch/x86/include/asm/xcr.h @@ -0,0 +1,49 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2008 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. 
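Each virtual syscall is pinned at an ABI-frozen address computed by VSYSCALL_ADDR() from the slot number. A user-space sketch of calling the vgettimeofday slot directly, assuming the constants above are visible to the caller (real programs should go through libc or the vDSO instead):

	#include <sys/time.h>

	typedef int (*vgtod_fn)(struct timeval *, struct timezone *);

	/* Call virtual syscall slot 0 (vgettimeofday) at its fixed address. */
	static int vsyscall_gettimeofday(struct timeval *tv, struct timezone *tz)
	{
		vgtod_fn fn = (vgtod_fn)VSYSCALL_ADDR(__NR_vgettimeofday);
		return fn(tv, tz);
	}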
+ * + * ----------------------------------------------------------------------- */ + +/* + * asm-x86/xcr.h + * + * Definitions for the eXtended Control Register instructions + */ + +#ifndef _ASM_X86_XCR_H +#define _ASM_X86_XCR_H + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +#ifdef __KERNEL__ +# ifndef __ASSEMBLY__ + +#include <linux/types.h> + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ + : : "a" (eax), "d" (edx), "c" (index)); +} + +# endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_XCR_H */ diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h new file mode 100644 index 000000000000..19144184983a --- /dev/null +++ b/arch/x86/include/asm/xen/events.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_XEN_EVENTS_H +#define _ASM_X86_XEN_EVENTS_H + +enum ipi_vector { + XEN_RESCHEDULE_VECTOR, + XEN_CALL_FUNCTION_VECTOR, + XEN_CALL_FUNCTION_SINGLE_VECTOR, + XEN_SPIN_UNLOCK_VECTOR, + + XEN_NR_IPIS, +}; + +static inline int xen_irqs_disabled(struct pt_regs *regs) +{ + return raw_irqs_disabled_flags(regs->flags); +} + +static inline void xen_do_IRQ(int irq, struct pt_regs *regs) +{ + regs->orig_ax = ~irq; + do_IRQ(regs); +} + +#endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h new file mode 100644 index 000000000000..fdbbb45767a6 --- /dev/null +++ b/arch/x86/include/asm/xen/grant_table.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_XEN_GRANT_TABLE_H +#define _ASM_X86_XEN_GRANT_TABLE_H + +#define xen_alloc_vm_area(size) alloc_vm_area(size) +#define xen_free_vm_area(area) free_vm_area(area) + +#endif /* _ASM_X86_XEN_GRANT_TABLE_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h new file mode 100644 index 000000000000..3f6000d95fe2 --- /dev/null +++ b/arch/x86/include/asm/xen/hypercall.h @@ -0,0 +1,527 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
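XCR_XFEATURE_ENABLED_MASK above selects XCR0, the register that gates which extended states XSAVE/XRSTOR may manage. A one-line sketch built on xgetbv(), assuming kernel context (per the architecture manuals, bit 0 of XCR0 is x87 and bit 1 is SSE; that bit layout is an assumption drawn from the SDM, not from this header):

	/* True if XCR0 marks SSE state as XSAVE-managed. */
	static inline int xsave_sse_enabled(void)
	{
		return (xgetbv(XCR_XFEATURE_ENABLED_MASK) >> 1) & 1;
	}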
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _ASM_X86_XEN_HYPERCALL_H +#define _ASM_X86_XEN_HYPERCALL_H + +#include <linux/errno.h> +#include <linux/string.h> + +#include <xen/interface/xen.h> +#include <xen/interface/sched.h> +#include <xen/interface/physdev.h> + +/* + * The hypercall asms have to meet several constraints: + * - Work on 32- and 64-bit. + * The two architectures put their arguments in different sets of + * registers. + * + * - Work around asm syntax quirks + * It isn't possible to specify one of the rNN registers in a + * constraint, so we use explicit register variables to get the + * args into the right place. + * + * - Mark all registers as potentially clobbered + * Even unused parameters can be clobbered by the hypervisor, so we + * need to make sure gcc knows it. + * + * - Avoid compiler bugs. + * This is the tricky part. Because x86_32 has such a constrained + * register set, gcc versions below 4.3 have trouble generating + * code when all the arg registers and memory are trashed by the + * asm. There are syntactically simpler ways of achieving the + * semantics below, but they cause the compiler to crash. + * + * The only combination I found which works is: + * - assign the __argX variables first + * - list all actually used parameters as "+r" (__argX) + * - clobber the rest + * + * The result certainly isn't pretty, and it really shows up cpp's + * weakness as a macro language. Sorry. (But let's just give thanks + * there aren't more than 5 arguments...) + */ + +extern struct { char _entry[32]; } hypercall_page[]; + +#define __HYPERCALL "call hypercall_page+%c[offset]" +#define __HYPERCALL_ENTRY(x) \ + [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) + +#ifdef CONFIG_X86_32 +#define __HYPERCALL_RETREG "eax" +#define __HYPERCALL_ARG1REG "ebx" +#define __HYPERCALL_ARG2REG "ecx" +#define __HYPERCALL_ARG3REG "edx" +#define __HYPERCALL_ARG4REG "esi" +#define __HYPERCALL_ARG5REG "edi" +#else +#define __HYPERCALL_RETREG "rax" +#define __HYPERCALL_ARG1REG "rdi" +#define __HYPERCALL_ARG2REG "rsi" +#define __HYPERCALL_ARG3REG "rdx" +#define __HYPERCALL_ARG4REG "r10" +#define __HYPERCALL_ARG5REG "r8" +#endif + +#define __HYPERCALL_DECLS \ + register unsigned long __res asm(__HYPERCALL_RETREG); \ + register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \ + register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ + register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ + register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; + +#define __HYPERCALL_0PARAM "=r" (__res) +#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) +#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) +#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) +#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4) +#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5) + +#define __HYPERCALL_0ARG() +#define __HYPERCALL_1ARG(a1) \ + __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1); +#define __HYPERCALL_2ARG(a1,a2) \ + __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2); +#define __HYPERCALL_3ARG(a1,a2,a3) \ + __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3); +#define __HYPERCALL_4ARG(a1,a2,a3,a4) \ + 
__HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4); +#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \ + __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5); + +#define __HYPERCALL_CLOBBER5 "memory" +#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG +#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG +#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG +#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG +#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG + +#define _hypercall0(type, name) \ +({ \ + __HYPERCALL_DECLS; \ + __HYPERCALL_0ARG(); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_0PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER0); \ + (type)__res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + __HYPERCALL_DECLS; \ + __HYPERCALL_1ARG(a1); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_1PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER1); \ + (type)__res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + __HYPERCALL_DECLS; \ + __HYPERCALL_2ARG(a1, a2); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_2PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER2); \ + (type)__res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + __HYPERCALL_DECLS; \ + __HYPERCALL_3ARG(a1, a2, a3); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_3PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER3); \ + (type)__res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + __HYPERCALL_DECLS; \ + __HYPERCALL_4ARG(a1, a2, a3, a4); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_4PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER4); \ + (type)__res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + __HYPERCALL_DECLS; \ + __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_5PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER5); \ + (type)__res; \ +}) + +static inline int +HYPERVISOR_set_trap_table(struct trap_info *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int +HYPERVISOR_mmu_update(struct mmu_update *req, int count, + int *success_count, domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int +HYPERVISOR_mmuext_op(struct mmuext_op *op, int count, + int *success_count, domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int +HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int +HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +#ifdef CONFIG_X86_32 +static inline int +HYPERVISOR_set_callbacks(unsigned long event_selector, + unsigned long event_address, + unsigned long failsafe_selector, + unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} +#else /* CONFIG_X86_64 */ +static inline int +HYPERVISOR_set_callbacks(unsigned long event_address, + unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, + syscall_address); +} +#endif /* CONFIG_X86_{32,64} */ + +static inline int +HYPERVISOR_callback_op(int cmd, void *arg) +{ + return _hypercall2(int, callback_op, cmd, 
arg); +} + +static inline int +HYPERVISOR_fpu_taskswitch(int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +static inline int +HYPERVISOR_sched_op(int cmd, void *arg) +{ + return _hypercall2(int, sched_op_new, cmd, arg); +} + +static inline long +HYPERVISOR_set_timer_op(u64 timeout) +{ + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); +} + +static inline int +HYPERVISOR_set_debugreg(int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long +HYPERVISOR_get_debugreg(int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int +HYPERVISOR_update_descriptor(u64 ma, u64 desc) +{ + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); +} + +static inline int +HYPERVISOR_memory_op(unsigned int cmd, void *arg) +{ + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int +HYPERVISOR_multicall(void *call_list, int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int +HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, + unsigned long flags) +{ + if (sizeof(new_val) == sizeof(long)) + return _hypercall3(int, update_va_mapping, va, + new_val.pte, flags); + else + return _hypercall4(int, update_va_mapping, va, + new_val.pte, new_val.pte >> 32, flags); +} + +static inline int +HYPERVISOR_event_channel_op(int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + if (unlikely(rc == -ENOSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } + return rc; +} + +static inline int +HYPERVISOR_xen_version(int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int +HYPERVISOR_console_io(int cmd, int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int +HYPERVISOR_physdev_op(int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + if (unlikely(rc == -ENOSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } + return rc; +} + +static inline int +HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); +} + +static inline int +HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val, + unsigned long flags, domid_t domid) +{ + if (sizeof(new_val) == sizeof(long)) + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); + else + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte, new_val.pte >> 32, + flags, domid); +} + +static inline int +HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int +HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +#ifdef CONFIG_X86_64 +static inline int +HYPERVISOR_set_segment_base(int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} +#endif + +static inline int +HYPERVISOR_suspend(unsigned long srec) +{ + return _hypercall3(int, sched_op, SCHEDOP_shutdown, + 
SHUTDOWN_suspend, srec); +} + +static inline int +HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} + +static inline void +MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) +{ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = set; +} + +static inline void +MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; + if (sizeof(new_val) == sizeof(long)) { + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; + } else { + mcl->args[1] = new_val.pte; + mcl->args[2] = new_val.pte >> 32; + mcl->args[3] = flags; + } +} + +static inline void +MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd, + void *uop, unsigned int count) +{ + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = cmd; + mcl->args[1] = (unsigned long)uop; + mcl->args[2] = count; +} + +static inline void +MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va, + pte_t new_val, unsigned long flags, + domid_t domid) +{ + mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl->args[0] = va; + if (sizeof(new_val) == sizeof(long)) { + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; + mcl->args[3] = domid; + } else { + mcl->args[1] = new_val.pte; + mcl->args[2] = new_val.pte >> 32; + mcl->args[3] = flags; + mcl->args[4] = domid; + } +} + +static inline void +MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, + struct desc_struct desc) +{ + mcl->op = __HYPERVISOR_update_descriptor; + if (sizeof(maddr) == sizeof(long)) { + mcl->args[0] = maddr; + mcl->args[1] = *(unsigned long *)&desc; + } else { + mcl->args[0] = maddr; + mcl->args[1] = maddr >> 32; + mcl->args[2] = desc.a; + mcl->args[3] = desc.b; + } +} + +static inline void +MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg) +{ + mcl->op = __HYPERVISOR_memory_op; + mcl->args[0] = cmd; + mcl->args[1] = (unsigned long)arg; +} + +static inline void +MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, + int count, int *success_count, domid_t domid) +{ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)req; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; +} + +static inline void +MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, + int *success_count, domid_t domid) +{ + mcl->op = __HYPERVISOR_mmuext_op; + mcl->args[0] = (unsigned long)op; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; +} + +static inline void +MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries) +{ + mcl->op = __HYPERVISOR_set_gdt; + mcl->args[0] = (unsigned long)frames; + mcl->args[1] = entries; +} + +static inline void +MULTI_stack_switch(struct multicall_entry *mcl, + unsigned long ss, unsigned long esp) +{ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = ss; + mcl->args[1] = esp; +} + +#endif /* _ASM_X86_XEN_HYPERCALL_H */ diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h new file mode 100644 index 000000000000..a38d25ac87d2 --- /dev/null +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. 
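The MULTI_* helpers above only fill in one struct multicall_entry each; batching happens when an array of entries is handed to HYPERVISOR_multicall() in a single hypervisor transition. A minimal sketch of combining two operations (the function name and pairing are illustrative; the kernel proper drives this through its multicall batching machinery):

	/* Queue an FPU task switch and a stack switch, then submit both. */
	static int submit_two_ops(unsigned long ss, unsigned long esp)
	{
		struct multicall_entry mc[2];

		MULTI_fpu_taskswitch(&mc[0], 1);
		MULTI_stack_switch(&mc[1], ss, esp);
		return HYPERVISOR_multicall(mc, 2);
	}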
+ * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _ASM_X86_XEN_HYPERVISOR_H +#define _ASM_X86_XEN_HYPERVISOR_H + +#include <linux/types.h> +#include <linux/kernel.h> + +#include <xen/interface/xen.h> +#include <xen/interface/version.h> + +#include <asm/ptrace.h> +#include <asm/page.h> +#include <asm/desc.h> +#if defined(__i386__) +# ifdef CONFIG_X86_PAE +# include <asm-generic/pgtable-nopud.h> +# else +# include <asm-generic/pgtable-nopmd.h> +# endif +#endif +#include <asm/xen/hypercall.h> + +/* arch/i386/kernel/setup.c */ +extern struct shared_info *HYPERVISOR_shared_info; +extern struct start_info *xen_start_info; + +/* arch/i386/mach-xen/evtchn.c */ +/* Force a proper event-channel callback from Xen. */ +extern void force_evtchn_callback(void); + +/* Turn jiffies into Xen system time. */ +u64 jiffies_to_st(unsigned long jiffies); + + +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 + +enum xen_domain_type { + XEN_NATIVE, + XEN_PV_DOMAIN, + XEN_HVM_DOMAIN, +}; + +extern enum xen_domain_type xen_domain_type; + +#define xen_domain() (xen_domain_type != XEN_NATIVE) +#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) +#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) + +#endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h new file mode 100644 index 000000000000..e8506c1f0c55 --- /dev/null +++ b/arch/x86/include/asm/xen/interface.h @@ -0,0 +1,175 @@ +/****************************************************************************** + * arch-x86_32.h + * + * Guest OS interface to x86 Xen. 
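The xen_domain()/xen_pv_domain()/xen_initial_domain() predicates let shared code branch on how (and whether) it is virtualized. A sketch of the usual init-time guard, assuming kernel context (the function and message are illustrative):

	#include <linux/errno.h>

	static int __init example_xen_init(void)
	{
		if (!xen_pv_domain())
			return -ENODEV;	/* not paravirtualized: nothing to do */
		if (xen_initial_domain())
			printk(KERN_INFO "running as dom0\n");
		return 0;
	}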
+ * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef _ASM_X86_XEN_INTERFACE_H +#define _ASM_X86_XEN_INTERFACE_H + +#ifdef __XEN__ +#define __DEFINE_GUEST_HANDLE(name, type) \ + typedef struct { type *p; } __guest_handle_ ## name +#else +#define __DEFINE_GUEST_HANDLE(name, type) \ + typedef type * __guest_handle_ ## name +#endif + +#define DEFINE_GUEST_HANDLE_STRUCT(name) \ + __DEFINE_GUEST_HANDLE(name, struct name) +#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) +#define GUEST_HANDLE(name) __guest_handle_ ## name + +#ifdef __XEN__ +#if defined(__i386__) +#define set_xen_guest_handle(hnd, val) \ + do { \ + if (sizeof(hnd) == 8) \ + *(uint64_t *)&(hnd) = 0; \ + (hnd).p = val; \ + } while (0) +#elif defined(__x86_64__) +#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) +#endif +#else +#if defined(__i386__) +#define set_xen_guest_handle(hnd, val) \ + do { \ + if (sizeof(hnd) == 8) \ + *(uint64_t *)&(hnd) = 0; \ + (hnd) = val; \ + } while (0) +#elif defined(__x86_64__) +#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0) +#endif +#endif + +#ifndef __ASSEMBLY__ +/* Guest handles for primitive C types. */ +__DEFINE_GUEST_HANDLE(uchar, unsigned char); +__DEFINE_GUEST_HANDLE(uint, unsigned int); +__DEFINE_GUEST_HANDLE(ulong, unsigned long); +DEFINE_GUEST_HANDLE(char); +DEFINE_GUEST_HANDLE(int); +DEFINE_GUEST_HANDLE(long); +DEFINE_GUEST_HANDLE(void); +#endif + +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#endif + +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +#endif + +/* Maximum number of virtual CPUs in multi-processor guests. */ +#define MAX_VIRT_CPUS 32 + +/* + * SEGMENT DESCRIPTOR TABLES + */ +/* + * A number of GDT entries are reserved by Xen. These are not situated at the + * start of the GDT because some stupid OSes export hard-coded selector values + * in their ABI. These hard-coded values are always near the start of the GDT, + * so Xen places itself out of the way, at the far end of the GDT. + */ +#define FIRST_RESERVED_GDT_PAGE 14 +#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) + +/* + * Send an array of these to HYPERVISOR_set_trap_table() + * The privilege level specifies which modes may enter a trap via a software + * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate + * privilege levels as follows: + * Level == 0: No one may enter + * Level == 1: Kernel may enter + * Level == 2: Kernel may enter + * Level == 3: Everyone may enter + */ +#define TI_GET_DPL(_ti) ((_ti)->flags & 3) +#define TI_GET_IF(_ti) ((_ti)->flags & 4) +#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl)) +#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) + +#ifndef __ASSEMBLY__ +struct trap_info { + uint8_t vector; /* exception vector */ + uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ + uint16_t cs; /* code selector */ + unsigned long address; /* code offset */ +}; +DEFINE_GUEST_HANDLE_STRUCT(trap_info); + +struct arch_shared_info { + unsigned long max_pfn; /* max pfn that appears in table */ + /* Frame containing list of mfns containing list of mfns containing p2m. 
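A guest hands its virtual IDT to Xen as an array of these trap_info records, using the TI_SET_* helpers to mark which rings may raise each vector. A hedged sketch of describing an int $0x80 entry callable from user space (FLAT_KERNEL_CS comes from the 32-bit interface header further down; the handler symbol is hypothetical, and the zeroed terminator entry follows Xen's list convention):

	extern void example_syscall_entry(void);	/* hypothetical handler */

	static void example_register_syscall_vector(void)
	{
		struct trap_info t[2] = {};	/* t[1] stays zero: terminator */

		t[0].vector = 0x80;
		t[0].cs = FLAT_KERNEL_CS;
		t[0].address = (unsigned long)example_syscall_entry;
		TI_SET_DPL(&t[0], 3);	/* ring 3 may enter */
		TI_SET_IF(&t[0], 1);	/* leave event delivery enabled */
		HYPERVISOR_set_trap_table(t);
	}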
*/ + unsigned long pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; +}; +#endif /* !__ASSEMBLY__ */ + +#ifdef CONFIG_X86_32 +#include "interface_32.h" +#else +#include "interface_64.h" +#endif + +#ifndef __ASSEMBLY__ +/* + * The following is all CPU context. Note that the fpu_ctxt block is filled + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + */ +struct vcpu_guest_context { + /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ + struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ +#define VGCF_I387_VALID (1<<0) +#define VGCF_HVM_GUEST (1<<1) +#define VGCF_IN_KERNEL (1<<2) + unsigned long flags; /* VGCF_* flags */ + struct cpu_user_regs user_regs; /* User-level CPU registers */ + struct trap_info trap_ctxt[256]; /* Virtual IDT */ + unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ + unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ + unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ + /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */ + unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ + unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ +#ifdef __i386__ + unsigned long event_callback_cs; /* CS:EIP of event callback */ + unsigned long event_callback_eip; + unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ + unsigned long failsafe_callback_eip; +#else + unsigned long event_callback_eip; + unsigned long failsafe_callback_eip; + unsigned long syscall_callback_eip; +#endif + unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ +#ifdef __x86_64__ + /* Segment base addresses. */ + uint64_t fs_base; + uint64_t gs_base_kernel; + uint64_t gs_base_user; +#endif +}; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); +#endif /* !__ASSEMBLY__ */ + +/* + * Prefix forces emulation of some non-trapping instructions. + * Currently only CPUID. + */ +#ifdef __ASSEMBLY__ +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ; +#define XEN_CPUID XEN_EMULATE_PREFIX cpuid +#else +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; " +#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid" +#endif + +#endif /* _ASM_X86_XEN_INTERFACE_H */ diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h new file mode 100644 index 000000000000..42a7e004ae5c --- /dev/null +++ b/arch/x86/include/asm/xen/interface_32.h @@ -0,0 +1,97 @@ +/****************************************************************************** + * arch-x86_32.h + * + * Guest OS interface to x86 32-bit Xen. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef _ASM_X86_XEN_INTERFACE_32_H +#define _ASM_X86_XEN_INTERFACE_32_H + + +/* + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. + */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define FLAT_RING1_SS 0xe021 /* GDT index 260 */ +#define FLAT_RING3_CS 0xe02b /* GDT index 261 */ +#define FLAT_RING3_DS 0xe033 /* GDT index 262 */ +#define FLAT_RING3_SS 0xe033 /* GDT index 262 */ + +#define FLAT_KERNEL_CS FLAT_RING1_CS +#define FLAT_KERNEL_DS FLAT_RING1_DS +#define FLAT_KERNEL_SS FLAT_RING1_SS +#define FLAT_USER_CS FLAT_RING3_CS +#define FLAT_USER_DS FLAT_RING3_DS +#define FLAT_USER_SS FLAT_RING3_SS + +/* And the trap vector is... 
*/ +#define TRAP_INSTR "int $0x82" + +/* + * Virtual addresses beyond this are not modifiable by guest OSes. The + * machine->physical mapping table starts at this address, read-only. + */ +#define __HYPERVISOR_VIRT_START 0xF5800000 + +#ifndef __ASSEMBLY__ + +struct cpu_user_regs { + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + uint32_t esi; + uint32_t edi; + uint32_t ebp; + uint32_t eax; + uint16_t error_code; /* private */ + uint16_t entry_vector; /* private */ + uint32_t eip; + uint16_t cs; + uint8_t saved_upcall_mask; + uint8_t _pad0; + uint32_t eflags; /* eflags.IF == !saved_upcall_mask */ + uint32_t esp; + uint16_t ss, _pad1; + uint16_t es, _pad2; + uint16_t ds, _pad3; + uint16_t fs, _pad4; + uint16_t gs, _pad5; +}; +DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs); + +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */ +}; + +struct xen_callback { + unsigned long cs; + unsigned long eip; +}; +typedef struct xen_callback xen_callback_t; + +#define XEN_CALLBACK(__cs, __eip) \ + ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) }) +#endif /* !__ASSEMBLY__ */ + + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + * + * Note that Xen is using the fact that the pagetable base is always + * page-aligned, and putting the 12 MSB of the address into the 12 LSB + * of cr3. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + +#endif /* _ASM_X86_XEN_INTERFACE_32_H */ diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h new file mode 100644 index 000000000000..100d2662b97c --- /dev/null +++ b/arch/x86/include/asm/xen/interface_64.h @@ -0,0 +1,159 @@ +#ifndef _ASM_X86_XEN_INTERFACE_64_H +#define _ASM_X86_XEN_INTERFACE_64_H + +/* + * 64-bit segment selectors + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. 
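The 32-bit xen_pfn_to_cr3()/xen_cr3_to_pfn() pair defined just above rotates the MFN left by 12 bits, so page-directory frames above 4GB still fit in 32 bits of %cr3. A worked round trip (value purely illustrative):

        unsigned mfn = 0x123456;                /* frame at machine address 0x123456000, above 4GB */
        unsigned cr3 = xen_pfn_to_cr3(mfn);     /* 0x23456001: MFN bits [31:20] land in cr3 bits [11:0] */
        BUG_ON(xen_cr3_to_pfn(cr3) != mfn);     /* the reverse rotation restores 0x123456 */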
+ */ + +#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */ +#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */ +#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_DS64 0x0000 /* NULL selector */ +#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */ + +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32 +#define FLAT_KERNEL_DS FLAT_KERNEL_DS64 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32 +#define FLAT_KERNEL_CS FLAT_KERNEL_CS64 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32 +#define FLAT_KERNEL_SS FLAT_KERNEL_SS64 + +#define FLAT_USER_DS64 FLAT_RING3_DS64 +#define FLAT_USER_DS32 FLAT_RING3_DS32 +#define FLAT_USER_DS FLAT_USER_DS64 +#define FLAT_USER_CS64 FLAT_RING3_CS64 +#define FLAT_USER_CS32 FLAT_RING3_CS32 +#define FLAT_USER_CS FLAT_USER_CS64 +#define FLAT_USER_SS64 FLAT_RING3_SS64 +#define FLAT_USER_SS32 FLAT_RING3_SS32 +#define FLAT_USER_SS FLAT_USER_SS64 + +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000 +#define __HYPERVISOR_VIRT_END 0xFFFF880000000000 +#define __MACH2PHYS_VIRT_START 0xFFFF800000000000 +#define __MACH2PHYS_VIRT_END 0xFFFF804000000000 + +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) +#endif + +#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) +#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) +#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +#endif + +/* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) + * @which == SEGBASE_* ; @base == 64-bit base address + * Returns 0 on success. + */ +#define SEGBASE_FS 0 +#define SEGBASE_GS_USER 1 +#define SEGBASE_GS_KERNEL 2 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */ + +/* + * int HYPERVISOR_iret(void) + * All arguments are on the kernel stack, in the following format. + * Never returns if successful. Current kernel context is lost. + * The saved CS is mapped as follows: + * RING0 -> RING3 kernel mode. + * RING1 -> RING3 kernel mode. + * RING2 -> RING3 kernel mode. + * RING3 -> RING3 user mode. + * However RING0 indicates that the guest kernel should return to itself + * directly with + * orb $3,1*8(%rsp) + * iretq + * If flags contains VGCF_in_syscall: + * Restore RAX, RIP, RFLAGS, RSP. + * Discard R11, RCX, CS, SS. + * Otherwise: + * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP. + * All other registers are saved on hypercall entry and restored to user. + */ +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */ +#define _VGCF_in_syscall 8 +#define VGCF_in_syscall (1<<_VGCF_in_syscall) +#define VGCF_IN_SYSCALL VGCF_in_syscall + +#ifndef __ASSEMBLY__ + +struct iret_context { + /* Top of stack (%rsp at point of hypercall). */ + uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + /* Bottom of iret stack frame. */ +}; + +#if defined(__GNUC__) && !defined(__STRICT_ANSI__) +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */ +#define __DECL_REG(name) union { \ + uint64_t r ## name, e ## name; \ + uint32_t _e ## name; \ +} +#else +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax).
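Under GCC, each __DECL_REG(name) below therefore exposes the same 8 bytes of storage under three names. A small illustration of the aliasing:

        struct cpu_user_regs regs;

        regs.rax = 0x1122334455667788UL;
        BUG_ON(regs.eax != regs.rax);   /* 'eax' is a full 64-bit alias of 'rax' */
        pr_info("%#x\n", regs._eax);    /* the uint32_t view: prints 0x55667788 */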
*/ +#define __DECL_REG(name) uint64_t r ## name +#endif + +struct cpu_user_regs { + uint64_t r15; + uint64_t r14; + uint64_t r13; + uint64_t r12; + __DECL_REG(bp); + __DECL_REG(bx); + uint64_t r11; + uint64_t r10; + uint64_t r9; + uint64_t r8; + __DECL_REG(ax); + __DECL_REG(cx); + __DECL_REG(dx); + __DECL_REG(si); + __DECL_REG(di); + uint32_t error_code; /* private */ + uint32_t entry_vector; /* private */ + __DECL_REG(ip); + uint16_t cs, _pad0[1]; + uint8_t saved_upcall_mask; + uint8_t _pad1[3]; + __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */ + __DECL_REG(sp); + uint16_t ss, _pad2[3]; + uint16_t es, _pad3[3]; + uint16_t ds, _pad4[3]; + uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */ + uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */ +}; +DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs); + +#undef __DECL_REG + +#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12) +#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12) + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ +}; + +typedef unsigned long xen_callback_t; + +#define XEN_CALLBACK(__cs, __rip) \ + ((unsigned long)(__rip)) + +#endif /* !__ASSEMBLY__ */ + + +#endif /* _ASM_X86_XEN_INTERFACE_64_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h new file mode 100644 index 000000000000..bc628998a1b9 --- /dev/null +++ b/arch/x86/include/asm/xen/page.h @@ -0,0 +1,165 @@ +#ifndef _ASM_X86_XEN_PAGE_H +#define _ASM_X86_XEN_PAGE_H + +#include <linux/pfn.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> + +#include <xen/features.h> + +/* Xen machine address */ +typedef struct xmaddr { + phys_addr_t maddr; +} xmaddr_t; + +/* Xen pseudo-physical address */ +typedef struct xpaddr { + phys_addr_t paddr; +} xpaddr_t; + +#define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) +#define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Maximum amount of memory we can handle in a domain in pages */ +#define MAX_DOMAIN_PAGES \ + ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) + + +extern unsigned long get_phys_to_machine(unsigned long pfn); +extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + + return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 1; + + return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + +#if 0 + if (unlikely((mfn >> machine_to_phys_order) != 0)) + return max_mapnr; +#endif + + pfn = 0; + /* + * The array access can fail (e.g., device space beyond end of RAM). + * In such cases it doesn't matter what we return (we return garbage), + * but we must handle the fault without crashing! 
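__get_user() is used below precisely because it routes the load through the exception-table fixup machinery, so a missing m2p page yields an error return instead of an oops. The same pattern in isolation, as a sketch (the helper name is hypothetical):

        static unsigned long m2p_read(unsigned long mfn)
        {
                unsigned long pfn = 0;  /* value kept if the load faults */

                /* returns -EFAULT rather than oopsing when the page is absent */
                if (__get_user(pfn, &machine_to_phys_mapping[mfn]))
                        pfn = 0;
                return pfn;
        }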
+ */ + __get_user(pfn, &machine_to_phys_mapping[mfn]); + + return pfn; +} + +static inline xmaddr_t phys_to_machine(xpaddr_t phys) +{ + unsigned offset = phys.paddr & ~PAGE_MASK; + return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); +} + +static inline xpaddr_t machine_to_phys(xmaddr_t machine) +{ + unsigned offset = machine.maddr & ~PAGE_MASK; + return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. + */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + extern unsigned long max_mapnr; + unsigned long pfn = mfn_to_pfn(mfn); + if ((pfn < max_mapnr) + && !xen_feature(XENFEAT_auto_translated_physmap) + && (get_phys_to_machine(pfn) != mfn)) + return max_mapnr; /* force !pfn_valid() */ + /* XXX fixme; not true with sparsemem */ + return pfn; +} + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) +#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v)))) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +static inline unsigned long pte_mfn(pte_t pte) +{ + return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + pte_t pte; + + pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) | + (pgprot_val(pgprot) & __supported_pte_mask); + + return pte; +} + +static inline pteval_t pte_val_ma(pte_t pte) +{ + return pte.pte; +} + +static inline pte_t __pte_ma(pteval_t x) +{ + return (pte_t) { .pte = x }; +} + +#define pmd_val_ma(v) ((v).pmd) +#ifdef __PAGETABLE_PUD_FOLDED +#define pud_val_ma(v) ((v).pgd.pgd) +#else +#define pud_val_ma(v) ((v).pud) +#endif +#define __pmd_ma(x) ((pmd_t) { (x) } ) + +#define pgd_val_ma(x) ((x).pgd) + + +xmaddr_t arbitrary_virt_to_machine(void *address); +void make_lowmem_page_readonly(void *vaddr); +void make_lowmem_page_readwrite(void *vaddr); + +#endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h new file mode 100644 index 000000000000..11b3bb86e17b --- /dev/null +++ b/arch/x86/include/asm/xor.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "xor_32.h" +#else +# include "xor_64.h" +#endif diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h new file mode 100644 index 000000000000..133b40a0f495 --- /dev/null +++ b/arch/x86/include/asm/xor_32.h @@ -0,0 +1,888 @@ +#ifndef _ASM_X86_XOR_32_H +#define _ASM_X86_XOR_32_H + +/* + * Optimized RAID-5 checksumming
functions for MMX and SSE. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * High-speed RAID5 checksumming functions utilizing MMX instructions. + * Copyright (C) 1998 Ingo Molnar. + */ + +#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n" +#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n" +#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n" +#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" +#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" +#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" + +#include <asm/i387.h> + +static void +xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned long lines = bytes >> 7; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + XO1(i, 0) \ + ST(i, 0) \ + XO1(i+1, 1) \ + ST(i+1, 1) \ + XO1(i + 2, 2) \ + ST(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i + 3, 3) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2) + : + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned long lines = bytes >> 7; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + ST(i, 0) \ + XO2(i + 1, 1) \ + ST(i + 1, 1) \ + XO2(i + 2, 2) \ + ST(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i + 3, 3) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3) + : + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned long lines = bytes >> 7; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + ST(i, 0) \ + XO3(i + 1, 1) \ + ST(i + 1, 1) \ + XO3(i + 2, 2) \ + ST(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i + 3, 3) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) + : + : "memory"); + + kernel_fpu_end(); +} + + +static void +xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned long lines = bytes >> 7; + + kernel_fpu_begin(); + + /* Make sure GCC forgets anything it knows about p4 or p5, + such that it won't pass 
to the asm volatile below a + register that is shared with any other variable. That's + because we modify p4 and p5 there, but we can't mark them + as read/write, otherwise we'd overflow the 10-asm-operands + limit of GCC < 3.1. */ + asm("" : "+r" (p4), "+r" (p5)); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + ST(i, 0) \ + XO4(i + 1, 1) \ + ST(i + 1, 1) \ + XO4(i + 2, 2) \ + ST(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i + 3, 3) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " addl $128, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3) + : "r" (p4), "r" (p5) + : "memory"); + + /* p4 and p5 were modified, and now the variables are dead. + Clobber them just to be sure nobody does something stupid + like assuming they have some legal value. */ + asm("" : "=r" (p4), "=r" (p5)); + + kernel_fpu_end(); +} + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef BLOCK + +static void +xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned long lines = bytes >> 6; + + kernel_fpu_begin(); + + asm volatile( + " .align 32 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq 40(%1), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2) + : + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned long lines = bytes >> 6; + + kernel_fpu_begin(); + + asm volatile( + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 
;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3) + : + : "memory" ); + + kernel_fpu_end(); +} + +static void +xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned long lines = bytes >> 6; + + kernel_fpu_begin(); + + asm volatile( + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor (%4), %%mm0 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " pxor 16(%4), %%mm2 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 24(%4), %%mm3 ;\n" + " movq %%mm3, 24(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq 48(%1), %%mm6 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) + : + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned long lines = bytes >> 6; + + kernel_fpu_begin(); + + /* Make sure GCC forgets anything it knows about p4 or p5, + such that it won't pass to the asm volatile below a + register that is shared with any other variable. That's + because we modify p4 and p5 there, but we can't mark them + as read/write, otherwise we'd overflow the 10-asm-operands + limit of GCC < 3.1. 
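The empty asm that follows is the standard trick for this: with "+r" operands and no instruction text it emits no code, but forces GCC to assume the named variables were both read and rewritten, discarding anything it knew about them. The same idea in isolation (the macro name is hypothetical):

        /* no instructions emitted; GCC must treat 'var' as clobbered */
        #define forget(var)     asm("" : "+r" (var))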
*/ + asm("" : "+r" (p4), "+r" (p5)); + + asm volatile( + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor (%4), %%mm0 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor (%5), %%mm0 ;\n" + " pxor 8(%5), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 16(%4), %%mm2 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%5), %%mm2 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%4), %%mm3 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%5), %%mm3 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 32(%5), %%mm4 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " pxor 40(%5), %%mm5 ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%5), %%mm6 ;\n" + " pxor 56(%5), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " addl $64, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3) + : "r" (p4), "r" (p5) + : "memory"); + + /* p4 and p5 were modified, and now the variables are dead. + Clobber them just to be sure nobody does something stupid + like assuming they have some legal value. 
*/ + asm("" : "=r" (p4), "=r" (p5)); + + kernel_fpu_end(); +} + +static struct xor_block_template xor_block_pII_mmx = { + .name = "pII_mmx", + .do_2 = xor_pII_mmx_2, + .do_3 = xor_pII_mmx_3, + .do_4 = xor_pII_mmx_4, + .do_5 = xor_pII_mmx_5, +}; + +static struct xor_block_template xor_block_p5_mmx = { + .name = "p5_mmx", + .do_2 = xor_p5_mmx_2, + .do_3 = xor_p5_mmx_3, + .do_4 = xor_p5_mmx_4, + .do_5 = xor_p5_mmx_5, +}; + +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +#define XMMS_SAVE \ +do { \ + preempt_disable(); \ + cr0 = read_cr0(); \ + clts(); \ + asm volatile( \ + "movups %%xmm0,(%0) ;\n\t" \ + "movups %%xmm1,0x10(%0) ;\n\t" \ + "movups %%xmm2,0x20(%0) ;\n\t" \ + "movups %%xmm3,0x30(%0) ;\n\t" \ + : \ + : "r" (xmm_save) \ + : "memory"); \ +} while (0) + +#define XMMS_RESTORE \ +do { \ + asm volatile( \ + "sfence ;\n\t" \ + "movups (%0),%%xmm0 ;\n\t" \ + "movups 0x10(%0),%%xmm1 ;\n\t" \ + "movups 0x20(%0),%%xmm2 ;\n\t" \ + "movups 0x30(%0),%%xmm3 ;\n\t" \ + : \ + : "r" (xmm_save) \ + : "memory"); \ + write_cr0(cr0); \ + preempt_enable(); \ +} while (0) + +#define ALIGN16 __attribute__((aligned(16))) + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" +#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" + + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned long lines = bytes >> 8; + char xmm_save[16*4] ALIGN16; + int cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + PF1(i) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2) + : + : "memory"); + + XMMS_RESTORE; +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned long lines = bytes >> 8; + char xmm_save[16*4] ALIGN16; + int cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i,0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i,0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i,0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i,0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : 
"+r" (lines), + "+r" (p1), "+r"(p2), "+r"(p3) + : + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned long lines = bytes >> 8; + char xmm_save[16*4] ALIGN16; + int cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i,0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i,0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i,0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i,0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i,0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) + : + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned long lines = bytes >> 8; + char xmm_save[16*4] ALIGN16; + int cr0; + + XMMS_SAVE; + + /* Make sure GCC forgets anything it knows about p4 or p5, + such that it won't pass to the asm volatile below a + register that is shared with any other variable. That's + because we modify p4 and p5 there, but we can't mark them + as read/write, otherwise we'd overflow the 10-asm-operands + limit of GCC < 3.1. */ + asm("" : "+r" (p4), "+r" (p5)); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i,0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i,0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + XO2(i,0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + PF4(i) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i,0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i,0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i,0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " addl $256, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : "+r" (lines), + "+r" (p1), "+r" (p2), "+r" (p3) + : "r" (p4), "r" (p5) + : "memory"); + + /* p4 and p5 were modified, and now the variables are dead. + Clobber them just to be sure nobody does something stupid + like assuming they have some legal value. */ + asm("" : "=r" (p4), "=r" (p5)); + + XMMS_RESTORE; +} + +static struct xor_block_template xor_block_pIII_sse = { + .name = "pIII_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, +}; + +/* Also try the generic routines. 
*/ +#include <asm-generic/xor.h> + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ + xor_speed(&xor_block_32regs); \ + xor_speed(&xor_block_32regs_p); \ + if (cpu_has_xmm) \ + xor_speed(&xor_block_pIII_sse); \ + if (cpu_has_mmx) { \ + xor_speed(&xor_block_pII_mmx); \ + xor_speed(&xor_block_p5_mmx); \ + } \ +} while (0) + +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. */ +#define XOR_SELECT_TEMPLATE(FASTEST) \ + (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) + +#endif /* _ASM_X86_XOR_32_H */ diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h new file mode 100644 index 000000000000..1549b5e261f6 --- /dev/null +++ b/arch/x86/include/asm/xor_64.h @@ -0,0 +1,361 @@ +#ifndef _ASM_X86_XOR_64_H +#define _ASM_X86_XOR_64_H + +/* + * Optimized RAID-5 checksumming functions for MMX and SSE. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +/* + * Based on + * High-speed RAID5 checksumming functions utilizing SSE instructions. + * Copyright (C) 1998 Ingo Molnar. + */ + +/* + * x86-64 changes / gcc fixes from Andi Kleen. + * Copyright 2002 Andi Kleen, SuSE Labs. + * + * This hasn't been optimized for the hammer yet, but there are likely + * no advantages to be gotten from x86-64 here anyways. + */ + +typedef struct { + unsigned long a, b; +} __attribute__((aligned(16))) xmm_store_t; + +/* Doesn't use gcc to save the XMM registers, because there is no easy way to + tell it to do a clts before the register saving. 
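CR0.TS is set on every context switch so the first FPU/SSE instruction traps and FPU state can be saved lazily; hence the explicit clts in XMMS_SAVE below, without which the first movups would itself fault. The calling convention the two macros assume, as a minimal sketch (the macros reference locals named cr0 and xmm_save):

        static void xor_two(unsigned long bytes, unsigned long *a, unsigned long *b)
        {
                xmm_store_t xmm_save[4];        /* names expected by the macros */
                unsigned long cr0;

                XMMS_SAVE;              /* preempt off, clts, stash %xmm0-%xmm3 */
                /* ... SSE work on a and b ... */
                XMMS_RESTORE;           /* sfence, reload %xmm0-%xmm3, restore CR0 */
        }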
*/ +#define XMMS_SAVE \ +do { \ + preempt_disable(); \ + asm volatile( \ + "movq %%cr0,%0 ;\n\t" \ + "clts ;\n\t" \ + "movups %%xmm0,(%1) ;\n\t" \ + "movups %%xmm1,0x10(%1) ;\n\t" \ + "movups %%xmm2,0x20(%1) ;\n\t" \ + "movups %%xmm3,0x30(%1) ;\n\t" \ + : "=&r" (cr0) \ + : "r" (xmm_save) \ + : "memory"); \ +} while (0) + +#define XMMS_RESTORE \ +do { \ + asm volatile( \ + "sfence ;\n\t" \ + "movups (%1),%%xmm0 ;\n\t" \ + "movups 0x10(%1),%%xmm1 ;\n\t" \ + "movups 0x20(%1),%%xmm2 ;\n\t" \ + "movups 0x30(%1),%%xmm3 ;\n\t" \ + "movq %0,%%cr0 ;\n\t" \ + : \ + : "r" (cr0), "r" (xmm_save) \ + : "memory"); \ + preempt_enable(); \ +} while (0) + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" + + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned int lines = bytes >> 8; + unsigned long cr0; + xmm_store_t xmm_save[4]; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + PF1(i) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " decl %[cnt] ; jnz 1b" + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] "r" (256UL) + : "memory"); + XMMS_RESTORE; +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + 
LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] "r" (256UL) + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + PF4(i) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p5] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), + [p5] "+r" (p5) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static struct xor_block_template xor_block_sse = { + .name = "generic_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_sse); \ +} while (0) + +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. */ +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) + +#endif /* _ASM_X86_XOR_64_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h new file mode 100644 index 000000000000..08e9a1ac07a9 --- /dev/null +++ b/arch/x86/include/asm/xsave.h @@ -0,0 +1,118 @@ +#ifndef __ASM_X86_XSAVE_H +#define __ASM_X86_XSAVE_H + +#include <linux/types.h> +#include <asm/processor.h> +#include <asm/i387.h> + +#define XSTATE_FP 0x1 +#define XSTATE_SSE 0x2 + +#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) + +#define FXSAVE_SIZE 512 + +/* + * These are the features that the OS can handle currently. 
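The XSAVE family takes its 64-bit feature bitmap in %edx:%eax, which is why the helpers below split their u64 mask argument in two. Worked example:

        u64 mask = XSTATE_FP | XSTATE_SSE;      /* 0x3: x87 and SSE state only */
        u32 lmask = mask;                       /* -> %eax = 0x00000003 */
        u32 hmask = mask >> 32;                 /* -> %edx = 0x00000000 */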
+ */ +#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE) + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +extern unsigned int xstate_size; +extern u64 pcntxt_mask; +extern struct xsave_struct *init_xstate_buf; + +extern void xsave_cntxt_init(void); +extern void xsave_init(void); +extern int init_fpu(struct task_struct *child); +extern int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *sw); + +static inline int xrstor_checking(struct xsave_struct *fx) +{ + int err; + + asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0) + : "memory"); + + return err; +} + +static inline int xsave_user(struct xsave_struct __user *buf) +{ + int err; + __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + _ASM_ALIGN "\n" + _ASM_PTR "1b,3b\n" + ".previous" + : [err] "=r" (err) + : "D" (buf), "a" (-1), "d" (-1), "0" (0) + : "memory"); + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + /* No need to clear here because the caller clears USED_MATH */ + return err; +} + +static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) +{ + int err; + struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + + __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + _ASM_ALIGN "\n" + _ASM_PTR "1b,3b\n" + ".previous" + : [err] "=r" (err) + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) + : "memory"); /* memory required? */ + return err; +} + +static inline void xrstor_state(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); +} + +static inline void xsave(struct task_struct *tsk) +{ + /* This, however, we can work around by forcing the compiler to select + an addressing mode that doesn't require extended registers. 
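Because the instruction is hand-assembled, its ModRM byte (0x27) hard-codes the memory operand as (%rdi) (or (%edi) on 32-bit); the "D" constraint is what guarantees GCC actually materializes the pointer there rather than in, say, %r8, which the fixed bytes could not address. The core of the trick in isolation, a sketch restating xsave() below:

        static inline void xsave_to(void *buf)
        {
                /* 0x0f,0xae,/4 is xsave; ModRM 0x27 selects mem operand (%rdi) */
                asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27"
                             : : "D" (buf), "a" (-1), "d" (-1) : "memory");
        }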
*/ + __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" + : : "D" (&(tsk->thread.xstate->xsave)), + "a" (-1), "d"(-1) : "memory"); +} +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index da140611bb57..d7e5a58ee22f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FTRACE -# Do not profile debug utilities +# Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg +CFLAGS_REMOVE_paravirt-spinlocks.o = -pg endif # @@ -22,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps_$(BITS).o irq_$(BITS).o +obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o @@ -37,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o -obj-y += i387.o +obj-y += i387.o xsave.o obj-y += ptrace.o obj-y += ds.o obj-$(CONFIG_X86_32) += tls.o @@ -50,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o @@ -60,14 +60,15 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o +obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_ES7000) += es7000_32.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o obj-y += vsmp_64.o obj-$(CONFIG_KPROBES) += kprobes.o @@ -88,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o @@ -98,10 +99,18 @@ scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +microcode-y := microcode_core.o +microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o +microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o +obj-$(CONFIG_MICROCODE) += microcode.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o + obj-y += bios_uv.o uv_irq.o uv_sysfs.o + obj-y += genx2apic_cluster.o + obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f489d7a9be92..8c1f76abae9e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 #include 
<asm/proto.h> -#include <asm/genapic.h> #else /* X86 */ @@ -154,10 +153,21 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) } #ifdef CONFIG_PCI_MMCONFIG + +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; +static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) +{ + if (!strcmp(mcfg->header.oem_id, "SGI")) + acpi_mcfg_64bit_base_addr = TRUE; + + return 0; +} + int __init acpi_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; @@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header) } memcpy(pci_mmcfg_config, &mcfg[1], config_size); + + acpi_mcfg_oem_check(mcfg); + for (i = 0; i < pci_mmcfg_config_num; ++i) { - if (pci_mmcfg_config[i].address > 0xFFFFFFFF) { + if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && + !acpi_mcfg_64bit_base_addr) { printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); kfree(pci_mmcfg_config); @@ -239,10 +253,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) return; } -#ifdef CONFIG_X86_32 if (boot_cpu_physical_apicid != -1U) ver = apic_version[boot_cpu_physical_apicid]; -#endif generic_processor_info(id, ver); } @@ -761,11 +773,9 @@ static void __init acpi_register_lapic_address(unsigned long address) set_fixmap_nocache(FIX_APIC_BASE, address); if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); -#ifdef CONFIG_X86_32 + boot_cpu_physical_apicid = read_apic_id(); apic_version[boot_cpu_physical_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); -#endif } } @@ -1021,7 +1031,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); #ifdef CONFIG_X86_ES7000 /* @@ -1127,8 +1137,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + pr_debug("Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); #ifdef CONFIG_X86_32 return (gsi < IRQ_COMPRESSION_START ? 
gsi : gsi_to_irq[gsi]); #else @@ -1247,7 +1257,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); @@ -1267,7 +1277,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -1337,7 +1347,9 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; +#ifdef CONFIG_X86_32 setup_apic_routing(); +#endif } } if (error == -EINVAL) { @@ -1407,8 +1419,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); - acpi_skip_timer_override = 1; + /* + * The ati_ixp4x0_rev() early PCI quirk should have set + * the acpi_skip_timer_override flag already: + */ + if (!acpi_skip_timer_override) { + WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); + pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + d->ident); + acpi_skip_timer_override = 1; + } return 0; } @@ -1579,6 +1599,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + {} +}; + +/* second table for DMI checks that should run after early-quirks */ +static struct dmi_system_id __initdata acpi_dmi_table_late[] = { /* * HP laptops which use a DSDT reporting as HP/SB400/10000, * which includes some code which overrides all temperature @@ -1591,6 +1616,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { */ { .callback = dmi_ignore_irq0_timer_override, + .ident = "HP nx6115 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, .ident = "HP NX6125 laptop", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), @@ -1605,6 +1638,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), }, }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP 6715b laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, {} }; @@ -1691,6 +1732,9 @@ int __init early_acpi_boot_init(void) int __init acpi_boot_init(void) { + /* those are executed after early-quirks are executed */ + dmi_check_system(acpi_dmi_table_late); + /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 868de3d5c39d..806b4e9051b4 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -9,6 +9,8 @@ #include <linux/bootmem.h> #include <linux/dmi.h> #include <linux/cpumask.h> +#include <asm/segment.h> +#include <asm/desc.h> #include "realmode/wakeup.h" #include "sleep.h" @@ -19,19 +21,10 @@ unsigned long acpi_realmode_flags; /* address in low memory of the wakeup routine. 
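Real-mode entry points are segment:offset pairs with linear = segment * 16 + offset, which is where the '>> 4' in the trampoline_segment assignment below comes from. Illustrative arithmetic (the address is hypothetical):

        unsigned long wakeup = 0x8c000;         /* low-memory copy of the wakeup code */
        u16 segment = wakeup >> 4;              /* entered as 8c00:0000 */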
*/ static unsigned long acpi_realmode; -#ifdef CONFIG_64BIT -static char temp_stack[10240]; +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +static char temp_stack[4096]; #endif -/* XXX: this macro should move to asm-x86/segment.h and be shared with the - boot code... */ -#define GDT_ENTRY(flags, base, limit) \ - (((u64)(base & 0xff000000) << 32) | \ - ((u64)flags << 40) | \ - ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffffff) << 16) | \ - ((u64)(limit & 0x0000ffff))) - /** * acpi_save_state_mem - save kernel state * @@ -94,7 +87,7 @@ int acpi_save_state_mem(void) #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = read_cr4_safe(); header->realmode_flags = acpi_realmode_flags; header->real_magic = 0x12345678; @@ -105,7 +98,9 @@ int acpi_save_state_mem(void) #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP - stack_start.sp = temp_stack + 4096; + stack_start.sp = temp_stack + sizeof(temp_stack); + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; @@ -158,6 +153,10 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_no_s4_hw_signature(); +#endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b553..a84ac7b570e6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { extern char __vsyscall_0; const unsigned char *const *find_nop_table(void) { - return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 < 6 ? 
k8_nops : p6_nops; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return k8_nops; } #else /* CONFIG_X86_64 */ -static const struct nop { - int cpuid; - const unsigned char *const *noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { X86_FEATURE_P4, p6_nops }, - { X86_FEATURE_P3, p6_nops }, - { -1, NULL } -}; - const unsigned char *const *find_nop_table(void) { - const unsigned char *const *noptable = intel_nops; - int i; - - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - return noptable; + if (boot_cpu_has(X86_FEATURE_K8)) + return k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + return k7_nops; + else if (boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return intel_nops; } #endif /* CONFIG_X86_64 */ @@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) continue; if (*ptr > text_end) continue; - text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ + /* turn DS segment override prefix into lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); }; } static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; - char insn[1]; if (noreplace_smp) return; - add_nops(insn, 1); for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; if (*ptr > text_end) continue; - text_poke(*ptr, insn, 1); + /* turn lock prefix into DS segment override prefix */ + text_poke(*ptr, ((unsigned char []){0x3E}), 1); }; } @@ -454,7 +444,7 @@ void __init alternative_instructions(void) _text, _etext); /* Only switch to UP mode if we don't immediately boot others */ - if (num_possible_cpus() == 1 || setup_max_cpus <= 1) + if (num_present_cpus() == 1 || setup_max_cpus <= 1) alternatives_smp_switch(0); } #endif diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f2766d84c7a0..a8fd9ebdc8e2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -23,36 +23,149 @@ #include <linux/scatterlist.h> #include <linux/iommu-helper.h> #include <asm/proto.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) +#define EXIT_LOOP_COUNT 10000000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); -struct command { +/* A list of preallocated protection domains */ +static LIST_HEAD(iommu_pd_list); +static DEFINE_SPINLOCK(iommu_pd_list_lock); + +/* + * general struct to manage commands sent to an IOMMU + */ +struct iommu_cmd { u32 data[4]; }; static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +/* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) { return iommu->cap & IOMMU_CAP_NPCACHE; } -static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +/**************************************************************************** + * + * Interrupt handling functions + * + ****************************************************************************/ + +static void iommu_print_event(void *__evt) +{ + u32 *event = __evt; + int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; + int devid = (event[0] >> EVENT_DEVID_SHIFT) &
EVENT_DEVID_MASK; + int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; + int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; + u64 address = (u64)(((u64)event[3]) << 32) | event[2]; + + printk(KERN_ERR "AMD IOMMU: Event logged ["); + + switch (type) { + case EVENT_TYPE_ILL_DEV: + printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + case EVENT_TYPE_IO_FAULT: + printk("IO_PAGE_FAULT device=%02x:%02x.%x " + "domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domid, address, flags); + break; + case EVENT_TYPE_DEV_TAB_ERR: + printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + case EVENT_TYPE_PAGE_TAB_ERR: + printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domid, address, flags); + break; + case EVENT_TYPE_ILL_CMD: + printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + break; + case EVENT_TYPE_CMD_HARD_ERR: + printk("COMMAND_HARDWARE_ERROR address=0x%016llx " + "flags=0x%04x]\n", address, flags); + break; + case EVENT_TYPE_IOTLB_INV_TO: + printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " + "address=0x%016llx]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address); + break; + case EVENT_TYPE_INV_DEV_REQ: + printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + default: + printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); + } +} + +static void iommu_poll_events(struct amd_iommu *iommu) +{ + u32 head, tail; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + + head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + + while (head != tail) { + iommu_print_event(iommu->evt_buf + head); + head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; + } + + writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + + spin_unlock_irqrestore(&iommu->lock, flags); +} + +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ + struct amd_iommu *iommu; + + list_for_each_entry(iommu, &amd_iommu_list, list) + iommu_poll_events(iommu); + + return IRQ_HANDLED; +} + +/**************************************************************************** + * + * IOMMU command queuing functions + * + ****************************************************************************/ + +/* + * Writes the command to the IOMMUs command buffer and informs the + * hardware about the new command. Must be called with iommu->lock held. + */ +static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { u32 tail, head; u8 *target; tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - target = (iommu->cmd_buf + tail); + target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); @@ -63,7 +176,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return 0; } -static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +/* + * General queuing function for commands. Takes iommu->lock and calls + * __iommu_queue_command(). 
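The queuing path described above is a classic single-producer ring buffer: read the tail pointer, copy the command in, advance the tail modulo the buffer size, and fail if the new tail would catch up with the hardware's head. A minimal userspace model of that arithmetic; the names and the fixed 16-byte command size are illustrative assumptions, not the driver's API:

#include <stdint.h>
#include <string.h>

#define CMD_SIZE     16
#define CMD_BUF_SIZE 4096                /* multiple of CMD_SIZE */

struct cmd_ring {
	uint8_t  buf[CMD_BUF_SIZE];
	uint32_t head;                   /* advanced by the consumer */
	uint32_t tail;                   /* advanced by the producer */
};

/* returns 0 on success, -1 if the ring is full */
static int ring_queue(struct cmd_ring *r, const void *cmd)
{
	uint32_t next = (r->tail + CMD_SIZE) % CMD_BUF_SIZE;

	if (next == r->head)             /* full: would overrun the head */
		return -1;
	memcpy(r->buf + r->tail, cmd, CMD_SIZE);
	r->tail = next;                  /* the driver writes this back to MMIO */
	return 0;
}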
+ */ +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { unsigned long flags; int ret; @@ -75,35 +192,59 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return ret; } +/* + * This function is called whenever we need to ensure that the IOMMU has + * completed execution of all commands we sent. It sends a + * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs + * us about that by writing a value to a physical address we pass with + * the command. + */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret; - struct command cmd; - volatile u64 ready = 0; - unsigned long ready_phys = virt_to_phys(&ready); + int ret = 0, ready = 0; + unsigned status = 0; + struct iommu_cmd cmd; + unsigned long flags, i = 0; memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; - cmd.data[1] = HIGH_U32(ready_phys); - cmd.data[2] = 1; /* value written to 'ready' */ + cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); iommu->need_sync = 0; - ret = iommu_queue_command(iommu, &cmd); + spin_lock_irqsave(&iommu->lock, flags); + + ret = __iommu_queue_command(iommu, &cmd); if (ret) - return ret; + goto out; + + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; + /* wait for the bit to become one */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + } + + /* set bit back to zero */ + status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; + writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - while (!ready) - cpu_relax(); + if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) + printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); +out: + spin_unlock_irqrestore(&iommu->lock, flags); return 0; } +/* + * Command send function for invalidating a device table entry + */ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { - struct command cmd; + struct iommu_cmd cmd; + int ret; BUG_ON(iommu == NULL); @@ -111,37 +252,50 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } +/* + * Generic command send function for invalidating TLB entries + */ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { - struct command cmd; + struct iommu_cmd cmd; + int ret; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); cmd.data[1] |= domid; - cmd.data[2] = LOW_U32(address); - cmd.data[3] = HIGH_U32(address); - if (s) + cmd.data[2] = lower_32_bits(address); + cmd.data[3] = upper_32_bits(address); + if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) + if (pde) /* PDE bit - we want to flush everything, not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } +/* + * TLB invalidation function which is called from the mapping functions. + * It invalidates a single PTE if the range to flush is within a single + * page. Otherwise it flushes the whole TLB of the IOMMU.
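The flush decision that follows hinges on how many 4kb pages a byte range covers; iommu_num_pages() (which this patch switches to) rounds the range up to whole pages. A standalone sketch of the same computation and of the size-bit decision, with num_pages() as a local stand-in:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* same rounding as iommu_num_pages(addr, size, PAGE_SIZE) */
static unsigned long num_pages(unsigned long addr, unsigned long size)
{
	return ((addr & ~PAGE_MASK) + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	/* 8 bytes straddling a page boundary still cost two pages */
	printf("%lu\n", num_pages(0x1ffc, 8));            /* prints 2 */
	/* more than one page -> iommu_flush_pages sets the size bit */
	printf("s = %d\n", num_pages(0x1000, 8192) > 1);  /* s = 1 */
	return 0;
}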
+ */ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = to_pages(address, size); + unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -159,6 +313,28 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +/* Flush the whole IO/TLB for a given protection domain */ +static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +{ + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); +} + +/**************************************************************************** + * + * The functions below are used to create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * Generic mapping function. It maps a physical address into a DMA + * address space. It allocates the page table pages if necessary. + * In the future it can be extended to a generic mapping function + * supporting all features of AMD IOMMU page tables like level skipping + * and full 64 bit address spaces. + */ static int iommu_map(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, @@ -209,6 +385,10 @@ static int iommu_map(struct protection_domain *dom, return 0; } +/* + * This function checks if a specific unity mapping entry is needed for + * this specific IOMMU. + */ static int iommu_for_unity_map(struct amd_iommu *iommu, struct unity_map_entry *entry) { @@ -223,6 +403,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, return 0; } +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default DMA domain of that IOMMU if necessary. + */ static int iommu_init_unity_mappings(struct amd_iommu *iommu) { struct unity_map_entry *entry; @@ -239,6 +425,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu) return 0; } +/* + * This function actually applies the mapping to the page table of the + * dma_ops domain. + */ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e) { @@ -261,6 +451,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, return 0; } +/* + * Inits the unity mappings required for a specific device + */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, u16 devid) { @@ -278,33 +471,48 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, return 0; } -static unsigned long dma_mask_to_pages(unsigned long mask) -{ - return (mask >> PAGE_SHIFT) + - (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); -} +/**************************************************************************** + * + * The next functions belong to the address allocator for the dma_ops + * interface functions. They work like the allocators in the other IOMMU + * drivers. It's basically a bitmap which marks the allocated pages in + * the aperture. Maybe it could be enhanced in the future to a more + * efficient allocator. + * + ****************************************************************************/ +/* + * The address allocator core function.
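Before the core function, a sketch of the allocation strategy it implements: next-fit scanning of a page bitmap, one wrap-around retry from the start, and a need_flush mark whenever previously freed addresses may be reused. This is a userspace model under those assumptions; the kernel code delegates the scan to iommu_area_alloc() rather than open-coding it, and uses a real bit-per-page bitmap instead of the byte-per-page map here:

#include <stdbool.h>
#include <string.h>

#define APERTURE_PAGES 1024

struct aperture {
	unsigned char map[APERTURE_PAGES];  /* 1 = allocated */
	unsigned long next_bit;
	bool need_flush;
};

static long find_range(struct aperture *a, unsigned long start,
		       unsigned long pages)
{
	unsigned long i, run = 0;

	for (i = start; i < APERTURE_PAGES; i++) {
		run = a->map[i] ? 0 : run + 1;
		if (run == pages)
			return i - pages + 1;
	}
	return -1;
}

static long alloc_pages_range(struct aperture *a, unsigned long pages)
{
	long addr = find_range(a, a->next_bit, pages);

	if (addr < 0) {                       /* wrap around and retry */
		addr = find_range(a, 0, pages);
		a->need_flush = true;         /* may reuse freed addresses */
	}
	if (addr >= 0) {
		memset(a->map + addr, 1, pages);
		a->next_bit = addr + pages;
	}
	return addr;
}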
+ * + * called with domain->lock held + */ static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, - unsigned int pages) + unsigned int pages, + unsigned long align_mask, + u64 dma_mask) { - unsigned long limit = dma_mask_to_pages(*dev->dma_mask); + unsigned long limit; unsigned long address; - unsigned long size = dom->aperture_size >> PAGE_SHIFT; unsigned long boundary_size; boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; - limit = limit < size ? limit : size; + limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, + dma_mask >> PAGE_SHIFT); - if (dom->next_bit >= limit) + if (dom->next_bit >= limit) { dom->next_bit = 0; + dom->need_flush = true; + } address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, 0); + 0 , boundary_size, align_mask); - if (address == -1) + if (address == -1) { address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, 0); + 0, boundary_size, align_mask); + dom->need_flush = true; + } if (likely(address != -1)) { dom->next_bit = address + pages; @@ -317,6 +525,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, return address; } +/* + * The address free function. + * + * called with domain->lock held + */ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) @@ -325,6 +538,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, iommu_area_free(dom->bitmap, address, pages); } +/**************************************************************************** + * + * The next functions belong to the domain allocation. A domain is + * allocated for every IOMMU as the default domain. If device isolation + * is enabled, every device gets its own domain. The most important thing + * about domains is the page table mapping the DMA address space they + * contain. + * + ****************************************************************************/ + static u16 domain_id_alloc(void) { unsigned long flags; @@ -342,6 +565,10 @@ static u16 domain_id_alloc(void) return id; } +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges). + */ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) @@ -351,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, if (start_page + pages > last_page) pages = last_page - start_page; - set_bit_string(dom->bitmap, start_page, pages); + iommu_area_reserve(dom->bitmap, start_page, pages); } static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) @@ -382,6 +609,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) free_page((unsigned long)p1); } +/* + * Free a domain, only used if something went wrong in the + * allocation path and we need to free an already allocated page table + */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { if (!dom) @@ -396,6 +627,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) kfree(dom); } +/* + * Allocates a new protection domain usable for the dma_ops functions.
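domain_id_alloc() above is a bitmap ID allocator. A self-contained model of the idea; the real code uses find_first_zero_bit() under amd_iommu_devtable_lock, and the reservation of domain 0 as the "never allocated" value happens later in amd_iommu_init():

#include <stdint.h>

#define MAX_DOMAIN_ID 65536

static uint32_t pd_bitmap[MAX_DOMAIN_ID / 32] = { 1 }; /* reserve ID 0 */

static int domain_id_alloc_model(void)
{
	int id;

	for (id = 0; id < MAX_DOMAIN_ID; id++) {
		if (!(pd_bitmap[id / 32] & (1u << (id % 32)))) {
			pd_bitmap[id / 32] |= 1u << (id % 32);
			return id;
		}
	}
	return 0; /* exhausted: 0 doubles as the error value */
}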
+ * It also initializes the page table and the address allocator data + * structures required for the dma_ops interface + */ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { @@ -436,14 +672,24 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->bitmap[0] = 1; dma_dom->next_bit = 0; + dma_dom->need_flush = false; + dma_dom->target_dev = 0xffff; + + /* Initialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = to_pages(iommu->exclusion_start, - iommu->exclusion_length); + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); dma_ops_reserve_addresses(dma_dom, startpage, pages); } + /* + * At the last step, build the page tables so we don't need to + * allocate page table pages in the dma_ops mapping/unmapping + * path. + */ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), GFP_KERNEL); @@ -472,6 +718,10 @@ free_dma_dom: return NULL; } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ static struct protection_domain *domain_for_device(u16 devid) { struct protection_domain *dom; @@ -484,6 +734,10 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +/* + * If a device is not yet associated with a domain, this function assigns + * it to a domain and makes that assignment visible to the hardware + */ static void set_device_domain(struct amd_iommu *iommu, struct protection_domain *domain, u16 devid) @@ -492,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu, u64 pte_root = virt_to_phys(domain->pt_root); - pte_root |= (domain->mode & 0x07) << 9; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - amd_iommu_dev_table[devid].data[0] = pte_root; - amd_iommu_dev_table[devid].data[1] = pte_root >> 32; + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_pd_table[devid] = domain; @@ -508,6 +763,58 @@ static void set_device_domain(struct amd_iommu *iommu, iommu->need_sync = 1; } +/***************************************************************************** + * + * The next functions belong to the dma_ops mapping/unmapping code. + * + *****************************************************************************/ + +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers.
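set_device_domain() above packs the page table root, the paging mode and the permission bits into one 64-bit word and splits it across the first two 32-bit data words of the device table entry. A sketch of that packing: the mode field position and the P/TV bits follow the constants visible in the hunk, while the IR/IW positions are inferred and should be treated as illustrative rather than as a hardware spec:

#include <stdint.h>

#define DEV_ENTRY_MODE_MASK_  0x07ULL
#define DEV_ENTRY_MODE_SHIFT_ 9
#define PTE_P_  (1ULL << 0)    /* entry valid */
#define PTE_TV_ (1ULL << 1)    /* translation information valid */
#define PTE_IR_ (1ULL << 61)   /* read permission (assumed position) */
#define PTE_IW_ (1ULL << 62)   /* write permission (assumed position) */

static void pack_dev_entry(uint32_t data[4], uint64_t pt_root_phys,
			   unsigned mode, uint16_t domain_id)
{
	uint64_t e = pt_root_phys;

	e |= (mode & DEV_ENTRY_MODE_MASK_) << DEV_ENTRY_MODE_SHIFT_;
	e |= PTE_IR_ | PTE_IW_ | PTE_P_ | PTE_TV_;

	data[0] = (uint32_t)e;          /* lower_32_bits() */
	data[1] = (uint32_t)(e >> 32);  /* upper_32_bits() */
	data[2] = domain_id;
}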
+ */ +static bool check_device(struct device *dev) +{ + if (!dev || !dev->dma_mask) + return false; + + return true; +} + +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid) { + ret = entry; + list_del(&ret->list); + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + +/* + * In the dma_ops path we only have the struct device. This function + * finds the corresponding IOMMU, the protection domain and the + * requestor id for a given device. + * If the device is not yet associated with a domain this is also done + * in this function. + */ static int get_device_resources(struct device *dev, struct amd_iommu **iommu, struct protection_domain **domain, @@ -517,26 +824,30 @@ static int get_device_resources(struct device *dev, struct pci_dev *pcidev; u16 _bdf; - BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); + *iommu = NULL; + *domain = NULL; + *bdf = 0xffff; + + if (dev->bus != &pci_bus_type) + return 0; pcidev = to_pci_dev(dev); - _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - if (_bdf >= amd_iommu_last_bdf) { - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; + /* device not translated by any IOMMU in the system? */ + if (_bdf > amd_iommu_last_bdf) return 0; - } *bdf = amd_iommu_alias_table[_bdf]; *iommu = amd_iommu_rlookup_table[*bdf]; if (*iommu == NULL) return 0; - dma_dom = (*iommu)->default_dom; *domain = domain_for_device(*bdf); if (*domain == NULL) { + dma_dom = find_protection_domain(*bdf); + if (!dma_dom) + dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; set_device_domain(*iommu, *domain, *bdf); printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " @@ -547,6 +858,10 @@ static int get_device_resources(struct device *dev, return 1; } +/* + * This is the generic map function. It maps one 4kb page at paddr to + * the given address in the DMA address space for the domain. + */ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address, @@ -578,6 +893,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, return (dma_addr_t)address; } +/* + * The generic unmapping function for on page in the DMA address space. + */ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) @@ -597,22 +915,35 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, *pte = 0ULL; } +/* + * This function contains common code for mapping of a physically + * contiguous memory region into DMA address space. It is uses by all + * mapping functions provided by this IOMMU driver. + * Must be called with the domain lock held. 
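The __map_single() path that follows turns a physical byte range into page-level operations: split off the in-page offset, round the length up to whole pages, and (for coherent allocations) derive an alignment mask from the allocation order. The arithmetic, checked standalone with get_order_() as a local stand-in for the kernel's get_order():

#include <assert.h>
#include <stddef.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* smallest n such that 2^n pages cover `size` bytes */
static unsigned get_order_(size_t size)
{
	unsigned n = 0;

	while ((PAGE_SIZE << n) < size)
		n++;
	return n;
}

int main(void)
{
	unsigned long paddr = 0x12345678;
	size_t size = 0x3000;

	unsigned long offset = paddr & ~PAGE_MASK;
	unsigned long pages = (offset + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* aligned case: low page-index bits that must be zero */
	unsigned long align_mask = (1UL << get_order_(size)) - 1;

	assert(offset == 0x678);
	assert(pages == 4);       /* 0x678 + 0x3000 spans four 4K pages */
	assert(align_mask == 3);  /* 0x3000 rounds up to a 4-page order */
	return 0;
}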
+ */ static dma_addr_t __map_single(struct device *dev, struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, - int dir) + int dir, + bool align, + u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start; unsigned int pages; + unsigned long align_mask = 0; int i; - pages = to_pages(paddr, size); + pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; - address = dma_ops_alloc_addresses(dev, dma_dom, pages); + if (align) + align_mask = (1UL << get_order(size)) - 1; + + address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, + dma_mask); if (unlikely(address == bad_dma_address)) goto out; @@ -624,10 +955,20 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; + if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { + iommu_flush_tlb(iommu, dma_dom->domain.id); + dma_dom->need_flush = false; + } else if (unlikely(iommu_has_npcache(iommu))) + iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + out: return address; } +/* + * Does the reverse of the __map_single function. Must be called with + * the domain lock held too + */ static void __unmap_single(struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, @@ -640,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = to_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; @@ -650,8 +991,14 @@ static void __unmap_single(struct amd_iommu *iommu, } dma_ops_free_addresses(dma_dom, dma_addr, pages); + + if (amd_iommu_unmap_flush) + iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); } +/* + * The exported map_single function for dma_ops. + */ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) { @@ -660,21 +1007,26 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, struct protection_domain *domain; u16 devid; dma_addr_t addr; + u64 dma_mask; + + if (!check_device(dev)) + return bad_dma_address; + + dma_mask = *dev->dma_mask; get_device_resources(dev, &iommu, &domain, &devid); if (iommu == NULL || domain == NULL) + /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + dma_mask); if (addr == bad_dma_address) goto out; - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, addr, size); - - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -683,6 +1035,9 @@ out: return addr; } +/* + * The exported unmap_single function for dma_ops. 
+ */ static void unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int dir) { @@ -691,21 +1046,25 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, struct protection_domain *domain; u16 devid; - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!check_device(dev) || + !get_device_resources(dev, &iommu, &domain, &devid)) + /* device not handled by any AMD IOMMU */ return; spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, dir); - iommu_flush_pages(iommu, domain->id, dma_addr, size); - - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); } +/* + * This is a special map_sg function which is used if we should map a + * device which is not handled by an AMD IOMMU in the system. + */ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -720,6 +1079,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, return nelems; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static int map_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -731,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct scatterlist *s; phys_addr_t paddr; int mapped_elems = 0; + u64 dma_mask; + + if (!check_device(dev)) + return 0; + + dma_mask = *dev->dma_mask; get_device_resources(dev, &iommu, &domain, &devid); @@ -743,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, paddr = sg_phys(s); s->dma_address = __map_single(dev, iommu, domain->priv, - paddr, s->length, dir); + paddr, s->length, dir, false, + dma_mask); if (s->dma_address) { s->dma_length = s->length; mapped_elems++; } else goto unmap; - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, s->dma_address, - s->dma_length); } - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -775,6 +1142,10 @@ unmap: goto out; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static void unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -785,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, u16 devid; int i; - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!check_device(dev) || + !get_device_resources(dev, &iommu, &domain, &devid)) return; spin_lock_irqsave(&domain->lock, flags); @@ -793,17 +1165,18 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, s, nelems, i) { __unmap_single(iommu, domain->priv, s->dma_address, s->dma_length, dir); - iommu_flush_pages(iommu, domain->id, s->dma_address, - s->dma_length); s->dma_address = s->dma_length = 0; } - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); } +/* + * The exported alloc_coherent function for dma_ops. 
+ */ static void *alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -813,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; phys_addr_t paddr; + u64 dma_mask = dev->coherent_dma_mask; + + if (!check_device(dev)) + return NULL; + + if (!get_device_resources(dev, &iommu, &domain, &devid)) + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return 0; - memset(virt_addr, 0, size); paddr = virt_to_phys(virt_addr); - get_device_resources(dev, &iommu, &domain, &devid); - if (!iommu || !domain) { *dma_addr = (dma_addr_t)paddr; return virt_addr; } + if (!dma_mask) + dma_mask = *dev->dma_mask; + spin_lock_irqsave(&domain->lock, flags); *dma_addr = __map_single(dev, iommu, domain->priv, paddr, - size, DMA_BIDIRECTIONAL); + size, DMA_BIDIRECTIONAL, true, dma_mask); if (*dma_addr == bad_dma_address) { free_pages((unsigned long)virt_addr, get_order(size)); @@ -839,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out; } - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, *dma_addr, size); - - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -851,6 +1229,9 @@ out: return virt_addr; } +/* + * The exported free_coherent function for dma_ops. + */ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { @@ -859,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; + if (!check_device(dev)) + return; + get_device_resources(dev, &iommu, &domain, &devid); if (!iommu || !domain) @@ -867,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_flush_pages(iommu, domain->id, dma_addr, size); - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); @@ -879,6 +1262,32 @@ free_mem: } /* + * This function is called by the DMA layer to find out if we can handle a + * particular device. It is part of the dma_ops. + */ +static int amd_iommu_dma_supported(struct device *dev, u64 mask) +{ + u16 bdf; + struct pci_dev *pcidev; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return 0; + + pcidev = to_pci_dev(dev); + + bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + + /* Out of our scope? */ + if (bdf > amd_iommu_last_bdf) + return 0; + + return 1; +} + +/* + * The function for pre-allocating protection domains. + * * If the driver core informs the DMA layer if a driver grabs a device * we don't need to preallocate the protection domains anymore. * For now we have to. 
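amd_iommu_dma_supported() above relies on the 16-bit requestor id encoding, bus number in the high byte and devfn in the low byte, which this patch centralizes in calc_devid(); it also makes the table bound check inclusive (`>` instead of `>=`), matching the other hunks. A small standalone illustration with assumed example values:

#include <stdint.h>
#include <stdio.h>

static uint16_t calc_devid_(uint8_t bus, uint8_t devfn)
{
	return ((uint16_t)bus << 8) | devfn;
}

int main(void)
{
	uint16_t last_bdf = calc_devid_(0x05, 0xff);    /* example bound */
	uint16_t dev = calc_devid_(0x02, (3 << 3) | 1); /* device 02:03.1 */

	/* devices past the table bound are simply not translated */
	printf("devid=0x%04x handled=%d\n", dev, dev <= last_bdf);
	return 0;
}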
@@ -893,7 +1302,7 @@ void prealloc_protection_domains(void) while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { devid = (dev->bus->number << 8) | dev->devfn; - if (devid >= amd_iommu_last_bdf) + if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; if (domain_for_device(devid)) @@ -905,10 +1314,9 @@ void prealloc_protection_domains(void) if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); - set_device_domain(iommu, &dma_dom->domain, devid); - printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", - dma_dom->domain.id); - print_devid(devid, 1); + dma_dom->target_dev = devid; + + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -919,14 +1327,23 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { .unmap_single = unmap_single, .map_sg = map_sg, .unmap_sg = unmap_sg, + .dma_supported = amd_iommu_dma_supported, }; +/* + * The function which clues the AMD IOMMU driver into dma_ops. + */ int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; int order = amd_iommu_aperture_order; int ret; + /* + * first allocate a default protection domain for every IOMMU we + * found in the system. Devices not assigned to any other + * protection domain will be assigned to the default one. + */ list_for_each_entry(iommu, &amd_iommu_list, list) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) @@ -936,6 +1353,10 @@ int __init amd_iommu_init_dma_ops(void) goto free_domains; } + /* + * If device isolation is enabled, pre-allocate the protection + * domains for each device. + */ if (amd_iommu_isolate) prealloc_protection_domains(); @@ -947,6 +1368,7 @@ int __init amd_iommu_init_dma_ops(void) gart_iommu_aperture = 0; #endif + /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2a13e430437d..0cdcda35a05f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -22,23 +22,17 @@ #include <linux/gfp.h> #include <linux/list.h> #include <linux/sysdev.h> +#include <linux/interrupt.h> +#include <linux/msi.h> #include <asm/pci-direct.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> -#include <asm/gart.h> +#include <asm/iommu.h> /* * definitions for the ACPI scanning code */ -#define UPDATE_LAST_BDF(x) do {\ - if ((x) > amd_iommu_last_bdf) \ - amd_iommu_last_bdf = (x); \ - } while (0); - -#define DEVID(bus, devfn) (((bus) << 8) | (devfn)) -#define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 -#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x)))) #define ACPI_IVHD_TYPE 0x10 #define ACPI_IVMD_TYPE_ALL 0x20 @@ -71,6 +65,17 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 +/* + * ACPI table definitions + * + * These data structures are laid over the table to parse the important values + * out of it. + */ + +/* + * structure describing one IOMMU in the ACPI table. Typically followed by one + * or more ivhd_entrys. + */ struct ivhd_header { u8 type; u8 flags; @@ -83,6 +88,10 @@ struct ivhd_header { u32 reserved; } __attribute__((packed)); +/* + * A device entry describing which devices a specific IOMMU translates and + * which requestor ids they use. + */ struct ivhd_entry { u8 type; u16 devid; @@ -90,6 +99,10 @@ struct ivhd_entry { u32 ext; } __attribute__((packed)); +/* + * An AMD IOMMU memory definition structure. 
It defines things like exclusion + * ranges for devices and regions that should be unity mapped. + */ struct ivmd_header { u8 type; u8 flags; @@ -103,22 +116,81 @@ struct ivmd_header { static int __initdata amd_iommu_detected; -u16 amd_iommu_last_bdf; -struct list_head amd_iommu_unity_map; -unsigned amd_iommu_aperture_order = 26; -int amd_iommu_isolate; +u16 amd_iommu_last_bdf; /* largest PCI device id we have + to handle */ +LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings + we find in ACPI */ +unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +int amd_iommu_isolate; /* if 1, device isolation is enabled */ +bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ + +LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the + system */ -struct list_head amd_iommu_list; +/* + * Pointer to the device table which is shared by all AMD IOMMUs; + * it is indexed by the PCI device id or the HT unit id and contains + * information about the domain the device belongs to as well as the + * page table root pointer. + */ struct dev_table_entry *amd_iommu_dev_table; + +/* + * The alias table is a driver specific data structure which contains the + * mappings of the PCI device ids to the actual requestor ids on the IOMMU. + * More than one device can share the same requestor id. + */ u16 *amd_iommu_alias_table; + +/* + * The rlookup table is used to find the IOMMU which is responsible + * for a specific device. It is also indexed by the PCI device id. + */ struct amd_iommu **amd_iommu_rlookup_table; + +/* + * The pd table (protection domain table) is used to find the protection domain + * data structure a device belongs to. Indexed with the PCI device id too. + */ struct protection_domain **amd_iommu_pd_table; + +/* + * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap + * to know which ones are already in use. + */ unsigned long *amd_iommu_pd_alloc_bitmap; -static u32 dev_table_size; -static u32 alias_table_size; -static u32 rlookup_table_size; +static u32 dev_table_size; /* size of the device table */ +static u32 alias_table_size; /* size of the alias table */ +static u32 rlookup_table_size; /* size of the rlookup table */ + +static inline void update_last_devid(u16 devid) +{ + if (devid > amd_iommu_last_bdf) + amd_iommu_last_bdf = devid; +} + +static inline unsigned long tbl_size(int entry_size) +{ + unsigned shift = PAGE_SHIFT + + get_order(amd_iommu_last_bdf * entry_size); + + return 1UL << shift; +} +/**************************************************************************** + * + * AMD IOMMU MMIO register space handling functions + * + * These functions are used to program the IOMMU device registers in + * MMIO space required by this driver. + * + ****************************************************************************/ + +/* + * This function sets the exclusion range in the IOMMU.
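tbl_size() just above sizes each per-device table by rounding amd_iommu_last_bdf * entry_size up to a power-of-two number of pages; get_order() returns the smallest order whose page count covers the size. Modeled standalone (get_order_() is a local stand-in, and the example values are illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned get_order_(unsigned long size)
{
	unsigned n = 0;

	while ((PAGE_SIZE << n) < size)
		n++;
	return n;
}

static unsigned long tbl_size_(unsigned long last_bdf, int entry_size)
{
	return 1UL << (PAGE_SHIFT + get_order_(last_bdf * entry_size));
}

int main(void)
{
	/* e.g. last devid 0x2ff with 32-byte device table entries */
	printf("%lu bytes\n", tbl_size_(0x2ff, 32)); /* 32768: 8 pages */
	return 0;
}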
DMA accesses to the + * exclusion range are passed through untranslated + */ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; @@ -137,9 +209,10 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Programs the physical address of the device table into the IOMMU hardware */ static void __init iommu_set_device_table(struct amd_iommu *iommu) { - u32 entry; + u64 entry; BUG_ON(iommu->mmio_base == NULL); @@ -149,6 +222,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Generic functions to enable/disable certain features of the IOMMU. */ static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -162,20 +236,35 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; - ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); ctrl &= ~(1 << bit); writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } +/* Function to enable the hardware */ void __init iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); - print_devid(iommu->devid, 0); - printk(" cap 0x%hx\n", iommu->cap_ptr); + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " + "at %02x:%02x.%x cap 0x%hx\n", + iommu->dev->bus->number, + PCI_SLOT(iommu->dev->devfn), + PCI_FUNC(iommu->dev->devfn), + iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +/* Function to enable IOMMU event logging and event interrupts */ +void __init iommu_enable_event_logging(struct amd_iommu *iommu) +{ + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); +} + +/* + * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in + * the system has one. + */ static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; @@ -199,16 +288,41 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); } +/**************************************************************************** + * + * The functions below belong to the first pass of AMD IOMMU ACPI table + * parsing. In this pass we try to find out the highest device id this + * code has to handle. Upon this information the size of the shared data + * structures is determined later. 
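The first parsing pass described above only has to walk the variable-length IVHD entries and remember the highest device id it sees. A compact model of that walk, using the 4 << (type >> 6) length rule from ivhd_entry_length() below; the offset of the devid field follows the packed ivhd_entry layout (an assumption here), and the real code additionally filters on the entry type:

#include <stdint.h>

/* entry size is encoded in the top two bits of the type byte */
static int ivhd_entry_len_(const uint8_t *e)
{
	return 0x04 << (e[0] >> 6);
}

static uint16_t scan_last_devid(const uint8_t *p, const uint8_t *end)
{
	uint16_t last = 0;

	while (p < end) {
		/* devid at offset 1, little-endian (packed ivhd_entry) */
		uint16_t devid = (uint16_t)(p[1] | (p[2] << 8));

		if (devid > last)   /* real code checks the type first */
			last = devid;
		p += ivhd_entry_len_(p);
	}
	return last;
}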
+ * + ****************************************************************************/ + +/* + * This function calculates the length of a given IVHD entry + */ +static inline int ivhd_entry_length(u8 *ivhd) +{ + return 0x04 << (*ivhd >> 6); +} + +/* + * This function reads the last device id the IOMMU has to handle from the PCI + * capability header for this IOMMU + */ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) { u32 cap; cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); return 0; } +/* + * After reading the highest device id from the IOMMU PCI capability header + * this function checks whether there is a higher device id defined in the ACPI table + */ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) { u8 *p = (void *)h, *end = (void *)h; @@ -229,12 +343,13 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) case IVHD_DEV_RANGE_END: case IVHD_DEV_ALIAS: case IVHD_DEV_EXT_SELECT: - UPDATE_LAST_BDF(dev->devid); + /* all the above subfield types refer to device ids */ + update_last_devid(dev->devid); break; default: break; } - p += 0x04 << (*p >> 6); + p += ivhd_entry_length(p); } WARN_ON(p != end); @@ -242,6 +357,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) return 0; } +/* + * Iterate over all IVHD entries in the ACPI table and find the highest device + * id which we need to handle. This is the first of three functions which parse + * the ACPI table. So we check the checksum here. + */ static int __init find_last_devid_acpi(struct acpi_table_header *table) { int i; @@ -277,19 +397,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The following functions belong to the code path which parses the ACPI table + * the second time. In this ACPI parsing iteration we allocate IOMMU specific + * data structures, initialize the device/alias/rlookup table and also + * basically initialize the hardware. + * + ****************************************************************************/ + +/* + * Allocates the command buffer. This buffer is per AMD IOMMU.
We can + * write commands to that buffer later and the IOMMU will execute them + * asynchronously + */ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { - u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, + u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry = 0; + u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - memset(cmd_buf, 0, CMD_BUFFER_SIZE); - entry = (u64)virt_to_phys(cmd_buf); entry |= MMIO_CMD_SIZE_512; memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, @@ -302,11 +434,35 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - if (iommu->cmd_buf) - free_pages((unsigned long)iommu->cmd_buf, - get_order(CMD_BUFFER_SIZE)); + free_pages((unsigned long)iommu->cmd_buf, + get_order(iommu->cmd_buf_size)); +} + +/* allocates the memory where the IOMMU will log its events to */ +static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(EVT_BUFFER_SIZE)); + + if (iommu->evt_buf == NULL) + return NULL; + + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); + + iommu->evt_buf_size = EVT_BUFFER_SIZE; + + return iommu->evt_buf; +} + +static void __init free_event_buffer(struct amd_iommu *iommu) +{ + free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); } +/* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { int i = (bit >> 5) & 0x07; @@ -315,7 +471,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } -static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) +/* Writes the specific IOMMU for a device into the rlookup table */ +static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) +{ + amd_iommu_rlookup_table[devid] = iommu; +} + +/* + * This function takes the device specific flags read from the ACPI + * table and sets up the device table entry with that information + */ +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + u16 devid, u32 flags, u32 ext_flags) { if (flags & ACPI_DEVFLAG_INITPASS) set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); @@ -331,13 +498,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); -} -static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) -{ - amd_iommu_rlookup_table[devid] = iommu; + set_iommu_for_device(iommu, devid); } +/* + * Reads the device exclusion range from ACPI and initialize IOMMU with + * it + */ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) { struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; @@ -346,27 +514,45 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) return; if (iommu) { + /* + * We only can configure exclusion ranges per IOMMU, not + * per device. But we can enable the exclusion range per + * device. 
This is done here + */ set_dev_entry_bit(m->devid, DEV_ENTRY_EX); iommu->exclusion_start = m->range_start; iommu->exclusion_length = m->range_length; } } +/* + * This function reads some important data from the IOMMU PCI space and + * initializes the driver data structure with it. It reads the hardware + * capabilities and the first/last device entries + */ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { - int bus = PCI_BUS(iommu->devid); - int dev = PCI_SLOT(iommu->devid); - int fn = PCI_FUNC(iommu->devid); int cap_ptr = iommu->cap_ptr; - u32 range; - - iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); - - range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); - iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + u32 range, misc; + + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, + &iommu->cap); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, + &range); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET, + &misc); + + iommu->first_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_FD(range)); + iommu->last_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_LD(range)); + iommu->evt_msi_num = MMIO_MSI_NUM(misc); } +/* + * Takes a pointer to an AMD IOMMU entry in the ACPI table and + * initializes the hardware and our data structures with it. + */ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, struct ivhd_header *h) { @@ -374,7 +560,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, u8 *end = p, flags = 0; u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; u32 ext_flags = 0; - bool alias = 0; + bool alias = false; struct ivhd_entry *e; /* @@ -414,22 +600,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALL: for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) - set_dev_entry_from_acpi(dev_i, e->flags, 0); + set_dev_entry_from_acpi(iommu, dev_i, + e->flags, 0); break; case IVHD_DEV_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: devid_start = e->devid; flags = e->flags; ext_flags = 0; - alias = 0; + alias = false; break; case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: @@ -437,24 +624,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, flags = e->flags; devid_to = e->ext >> 8; ext_flags = 0; - alias = 1; + alias = true; break; case IVHD_DEV_EXT_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, e->ext); + set_dev_entry_from_acpi(iommu, devid, e->flags, + e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: devid_start = e->devid; flags = e->flags; ext_flags = e->ext; - alias = 0; + alias = false; break; case IVHD_DEV_RANGE_END: devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi( + set_dev_entry_from_acpi(iommu, amd_iommu_alias_table[dev_i], flags, ext_flags); } @@ -463,10 +651,11 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, break; } - p += 0x04 << (e->type >> 6); + p += ivhd_entry_length(p); } } +/* Initializes the device->iommu mapping for the driver */ 
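The range handling in init_iommu_from_acpi() above reduces to: remember the start id when a *_RANGE_START or ALIAS_RANGE entry arrives, then on RANGE_END apply the flags (and the alias target, if any) to every id in between, always through the alias table. A sketch under those assumptions; it presumes the alias table was pre-initialized to the identity mapping, as amd_iommu_init() does later:

#include <stdint.h>
#include <stdbool.h>

#define MAX_DEVID 0x10000

static uint16_t alias_table[MAX_DEVID]; /* assumed: alias_table[i] == i */
static uint8_t  dev_flags[MAX_DEVID];

static void apply_range(uint16_t start, uint16_t end, uint8_t flags,
			bool alias, uint16_t alias_to)
{
	unsigned id;

	for (id = start; id <= end; id++) {
		if (alias)
			alias_table[id] = alias_to;
		/* flags are applied to the alias, as in the code above */
		dev_flags[alias_table[id]] = flags;
	}
}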
static int __init init_iommu_devices(struct amd_iommu *iommu) { u16 i; @@ -480,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu) static void __init free_iommu_one(struct amd_iommu *iommu) { free_command_buffer(iommu); + free_event_buffer(iommu); iommu_unmap_mmio_space(iommu); } @@ -494,6 +684,11 @@ static void __init free_iommu_all(void) } } +/* + * This function glues the initialization steps for one IOMMU together, + * and also allocates the command buffer and programs the + * hardware. It does NOT enable the IOMMU. This is done afterwards. + */ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); @@ -502,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) /* * Copy data from ACPI table entry to the iommu struct */ - iommu->devid = h->devid; + iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff); + if (!iommu->dev) + return 1; + iommu->cap_ptr = h->cap_ptr; + iommu->pci_seg = h->pci_seg; iommu->mmio_phys = h->mmio_phys; iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); if (!iommu->mmio_base) @@ -514,13 +713,23 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->cmd_buf) return -ENOMEM; + iommu->evt_buf = alloc_event_buffer(iommu); + if (!iommu->evt_buf) + return -ENOMEM; + + iommu->int_enabled = false; + init_iommu_from_pci(iommu); init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); - return 0; + return pci_enable_device(iommu->dev); } +/* + * Iterates over all IOMMU entries in the ACPI table, allocates the + * IOMMU structure and initializes it with init_iommu_one() + */ static int __init init_iommu_all(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; @@ -528,8 +737,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) struct amd_iommu *iommu; int ret; - INIT_LIST_HEAD(&amd_iommu_list); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -555,6 +762,103 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The following functions initialize the MSI interrupts for all IOMMUs + * in the system. It's a bit challenging because there could be multiple + * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per + * pci_dev.
+ * + ****************************************************************************/ + +static int __init iommu_setup_msix(struct amd_iommu *iommu) +{ + struct amd_iommu *curr; + struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ + int nvec = 0, i; + + list_for_each_entry(curr, &amd_iommu_list, list) { + if (curr->dev == iommu->dev) { + entries[nvec].entry = curr->evt_msi_num; + entries[nvec].vector = 0; + curr->int_enabled = true; + nvec++; + } + } + + if (pci_enable_msix(iommu->dev, entries, nvec)) { + pci_disable_msix(iommu->dev); + return 1; + } + + for (i = 0; i < nvec; ++i) { + int r = request_irq(entries->vector, amd_iommu_int_handler, + IRQF_SAMPLE_RANDOM, + "AMD IOMMU", + NULL); + if (r) + goto out_free; + } + + return 0; + +out_free: + for (i -= 1; i >= 0; --i) + free_irq(entries->vector, NULL); + + pci_disable_msix(iommu->dev); + + return 1; +} + +static int __init iommu_setup_msi(struct amd_iommu *iommu) +{ + int r; + struct amd_iommu *curr; + + list_for_each_entry(curr, &amd_iommu_list, list) { + if (curr->dev == iommu->dev) + curr->int_enabled = true; + } + + + if (pci_enable_msi(iommu->dev)) + return 1; + + r = request_irq(iommu->dev->irq, amd_iommu_int_handler, + IRQF_SAMPLE_RANDOM, + "AMD IOMMU", + NULL); + + if (r) { + pci_disable_msi(iommu->dev); + return 1; + } + + return 0; +} + +static int __init iommu_init_msi(struct amd_iommu *iommu) +{ + if (iommu->int_enabled) + return 0; + + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) + return iommu_setup_msix(iommu); + else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + return iommu_setup_msi(iommu); + + return 1; +} + +/**************************************************************************** + * + * The next functions belong to the third pass of parsing the ACPI + * table. In this last pass the memory mapping requirements are + * gathered (like exclusion and unity mapping reanges). 
+ * + ****************************************************************************/ + static void __init free_unity_maps(void) { struct unity_map_entry *entry, *next; @@ -565,6 +869,7 @@ static void __init free_unity_maps(void) } } +/* called when we find an exclusion range definition in ACPI */ static int __init init_exclusion_range(struct ivmd_header *m) { int i; @@ -574,7 +879,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) set_device_exclusion_range(m->devid, m); break; case ACPI_IVMD_TYPE_ALL: - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) set_device_exclusion_range(i, m); break; case ACPI_IVMD_TYPE_RANGE: @@ -588,6 +893,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) return 0; } +/* called for unity map ACPI definition */ static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; @@ -619,13 +925,12 @@ static int __init init_unity_map_range(struct ivmd_header *m) return 0; } +/* iterates over all memory definitions we find in the ACPI table */ static int __init init_memory_definitions(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; struct ivmd_header *m; - INIT_LIST_HEAD(&amd_iommu_unity_map); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -642,12 +947,32 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +/* + * Init the device table to not allow DMA access for devices and + * suppress all page faults + */ +static void init_device_table(void) +{ + u16 devid; + + for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { + set_dev_entry_bit(devid, DEV_ENTRY_VALID); + set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); + } +} + +/* + * This function finally enables all IOMMUs found in the system after + * they have been initialized + */ static void __init enable_iommus(void) { struct amd_iommu *iommu; list_for_each_entry(iommu, &amd_iommu_list, list) { iommu_set_exclusion_range(iommu); + iommu_init_msi(iommu); + iommu_enable_event_logging(iommu); iommu_enable(iommu); } } @@ -678,6 +1003,34 @@ static struct sys_device device_amd_iommu = { .cls = &amd_iommu_sysdev_class, }; +/* + * This is the core init function for AMD IOMMU hardware in the system. + * This function is called from the generic x86 DMA layer initialization + * code. + * + * This function basically parses the ACPI table for AMD IOMMU (IVRS) + * three times: + * + * 1 pass) Find the highest PCI device id the driver has to handle. + * Upon this information the size of the data structures is + * determined that needs to be allocated. + * + * 2 pass) Initialize the data structures just allocated with the + * information in the ACPI table about available AMD IOMMUs + * in the system. It also maps the PCI devices in the + * system to specific IOMMUs + * + * 3 pass) After the basic data structures are allocated and + * initialized we update them with information about memory + * remapping requirements parsed out of the ACPI table in + * this last pass. + * + * After that the hardware is initialized and ready to go. In the last + * step we do some Linux specific things like registering the driver in + * the dma_ops interface and initializing the suspend/resume support + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. 
+ */ int __init amd_iommu_init(void) { int i, ret = 0; @@ -699,14 +1052,14 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; - dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); - alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); - rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); + alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); + rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); ret = -ENOMEM; /* Device table - directly used by all IOMMUs */ - amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(dev_table_size)); if (amd_iommu_dev_table == NULL) goto out; @@ -730,27 +1083,26 @@ int __init amd_iommu_init(void) * Protection Domain table - maps devices to protection domains * This table has the same size as the rlookup_table */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_pd_table == NULL) goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); if (amd_iommu_pd_alloc_bitmap == NULL) goto free; + /* init the device table */ + init_device_table(); + /* - * memory is allocated now; initialize the device table with all zeroes - * and let all alias entries point to itself + * let all alias entries point to itself */ - memset(amd_iommu_dev_table, 0, dev_table_size); - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; - memset(amd_iommu_pd_table, 0, rlookup_table_size); - memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8); - /* * never allocate domain 0 because its used as the non-allocated and * error value placeholder @@ -768,15 +1120,15 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; - ret = amd_iommu_init_dma_ops(); + ret = sysdev_class_register(&amd_iommu_sysdev_class); if (ret) goto free; - ret = sysdev_class_register(&amd_iommu_sysdev_class); + ret = sysdev_register(&device_amd_iommu); if (ret) goto free; - ret = sysdev_register(&device_amd_iommu); + ret = amd_iommu_init_dma_ops(); if (ret) goto free; @@ -791,28 +1143,29 @@ int __init amd_iommu_init(void) else printk("disabled\n"); + if (amd_iommu_unmap_flush) + printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); + else + printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); + out: return ret; free: - if (amd_iommu_pd_alloc_bitmap) - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, + get_order(MAX_DOMAIN_ID/8)); - if (amd_iommu_pd_table) - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_pd_table, + get_order(rlookup_table_size)); - if (amd_iommu_rlookup_table) - free_pages((unsigned long)amd_iommu_rlookup_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_rlookup_table, + get_order(rlookup_table_size)); - if (amd_iommu_alias_table) - free_pages((unsigned long)amd_iommu_alias_table, - get_order(alias_table_size)); + free_pages((unsigned long)amd_iommu_alias_table, + get_order(alias_table_size)); - if (amd_iommu_dev_table) - free_pages((unsigned 
long)amd_iommu_dev_table, - get_order(dev_table_size)); + free_pages((unsigned long)amd_iommu_dev_table, + get_order(dev_table_size)); free_iommu_all(); @@ -821,6 +1174,13 @@ free: goto out; } +/**************************************************************************** + * + * Early detect code. This code runs at IOMMU detection time in the DMA + * layer. It just looks if there is an IVRS ACPI table to detect AMD + * IOMMUs + * + ****************************************************************************/ static int __init early_amd_iommu_detect(struct acpi_table_header *table) { return 0; @@ -828,7 +1188,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || iommu_detected) + if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { @@ -841,11 +1201,20 @@ void __init amd_iommu_detect(void) } } +/**************************************************************************** + * + * Parsing functions for the AMD IOMMU specific kernel command line + * options. + * + ****************************************************************************/ + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strcmp(str, "isolate") == 0) + if (strncmp(str, "isolate", 7) == 0) amd_iommu_isolate = 1; + if (strncmp(str, "fullflush", 11) == 0) + amd_iommu_unmap_flush = true; } return 1; @@ -853,20 +1222,10 @@ static int __init parse_amd_iommu_options(char *str) static int __init parse_amd_iommu_size_options(char *str) { - for (; *str; ++str) { - if (strcmp(str, "32M") == 0) - amd_iommu_aperture_order = 25; - if (strcmp(str, "64M") == 0) - amd_iommu_aperture_order = 26; - if (strcmp(str, "128M") == 0) - amd_iommu_aperture_order = 27; - if (strcmp(str, "256M") == 0) - amd_iommu_aperture_order = 28; - if (strcmp(str, "512M") == 0) - amd_iommu_aperture_order = 29; - if (strcmp(str, "1G") == 0) - amd_iommu_aperture_order = 30; - } + unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); + + if ((order > 24) && (order < 31)) + amd_iommu_aperture_order = order; return 1; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9f907806c1a5..9a32b37ee2ee 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <asm/e820.h> #include <asm/io.h> +#include <asm/iommu.h> #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> @@ -454,11 +455,11 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_ERR + printk(KERN_INFO "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_ERR + printk(KERN_INFO "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_ERR + printk(KERN_INFO "This costs you %d MB of RAM\n", 32 << fallback_aper_order); diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c index a437d027f20b..04a7f960bbc0 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic.c @@ -23,11 +23,13 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> +#include <linux/ioport.h> #include <linux/cpu.h> #include <linux/clockchips.h> #include <linux/acpi_pmtmr.h> #include <linux/module.h> #include <linux/dmi.h> +#include <linux/dmar.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -36,8 +38,14 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/hpet.h> +#include <asm/pgalloc.h> #include 
<asm/i8253.h> #include <asm/nmi.h> +#include <asm/idle.h> +#include <asm/proto.h> +#include <asm/timex.h> +#include <asm/apic.h> +#include <asm/i8259.h> #include <mach_apic.h> #include <mach_apicdef.h> @@ -50,20 +58,60 @@ # error SPURIOUS_APIC_VECTOR definition error #endif -unsigned long mp_lapic_addr; - +#ifdef CONFIG_X86_32 /* * Knob to control our willingness to enable the local APIC. * * +1=force-enable */ static int force_enable_local_apic; -int disable_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +#endif -/* Local APIC timer verification ok */ -static int local_apic_timer_verify_ok; +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC +int x2apic; +/* x2apic enabled before OS handover */ +int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif + +unsigned long mp_lapic_addr; +int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int local_apic_timer_disabled; +static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -75,7 +123,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; int pic_mode; @@ -112,9 +160,6 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - static unsigned long apic_phys; /* @@ -130,7 +175,11 @@ static inline int lapic_get_version(void) */ static inline int lapic_is_integrated(void) { +#ifdef CONFIG_X86_64 + return 1; +#else return APIC_INTEGRATED(lapic_get_version()); +#endif } /* @@ -145,13 +194,18 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } -void apic_wait_icr_idle(void) +/* + * Paravirt kernels also might be using these below ops. So we still + * use generic apic_read()/apic_write(), which might be pointing to different + * ops in PARAVIRT case. 
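The comment above is the heart of the xAPIC/x2APIC split that follows: every register access goes through one ops table, and switching modes retargets a single pointer. In x2APIC mode each 16-byte xAPIC MMIO register maps to MSR 0x800 + (offset >> 4), so the ICR at offset 0x300 becomes the single 64-bit MSR 0x830. A compilable sketch with stub backends (the stubs are stand-ins, not real MMIO/MSR accessors):

    #include <stdint.h>

    struct apic_ops_sketch {
        uint32_t (*read)(uint32_t reg);
        uint64_t (*icr_read)(void);
    };

    static uint32_t mmio_read(uint32_t reg) { (void)reg; return 0; } /* one 32-bit MMIO load    */
    static uint64_t mmio_icr_read(void)     { return 0; }            /* two loads: ICR2, ICR    */
    static uint32_t msr_read(uint32_t reg)  { (void)reg; return 0; } /* rdmsr(0x800 + (reg>>4)) */
    static uint64_t msr_icr_read(void)      { return 0; }            /* one 64-bit rdmsr(0x830) */

    static const struct apic_ops_sketch xapic_sketch  = { mmio_read, mmio_icr_read };
    static const struct apic_ops_sketch x2apic_sketch = { msr_read,  msr_icr_read };

    /* boot code flips one pointer instead of patching every call site */
    static const struct apic_ops_sketch *ops_sketch = &xapic_sketch;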
+ */ +void xapic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -u32 safe_apic_wait_icr_idle(void) +u32 safe_xapic_wait_icr_idle(void) { u32 send_status; int timeout; @@ -167,19 +221,88 @@ u32 safe_apic_wait_icr_idle(void) return send_status; } +void xapic_icr_write(u32 low, u32 id) +{ + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); +} + +u64 xapic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +static struct apic_ops xapic_ops = { + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = xapic_icr_read, + .icr_write = xapic_icr_write, + .wait_icr_idle = xapic_wait_icr_idle, + .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +}; + +struct apic_ops __read_mostly *apic_ops = &xapic_ops; +EXPORT_SYMBOL_GPL(apic_ops); + +#ifdef HAVE_X2APIC +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; +#endif + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ void __cpuinit enable_NMI_through_LVT0(void) { - unsigned int v = APIC_DM_NMI; + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; - /* Level triggered for 82489DX */ + /* Level triggered for 82489DX (32bit mode) */ if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + + apic_write(APIC_LVT0, v); } +#ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs */ @@ -187,15 +310,20 @@ int get_physical_broadcast(void) { return modern_apic() ? 0xff : 0xf; } +#endif /** * lapic_get_maxlvt - get the maximum number of local vector table entries */ int lapic_get_maxlvt(void) { - unsigned int v = apic_read(APIC_LVR); + unsigned int v; - /* 82489DXs do not report # of LVT entries. */ + v = apic_read(APIC_LVR); + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? 
GET_APIC_MAXLVT(v) : 2; } @@ -203,7 +331,7 @@ int lapic_get_maxlvt(void) * Local APIC timer */ -/* Clock divisor is set to 16 */ +/* Clock divisor */ #define APIC_DIVISOR 16 /* @@ -229,27 +357,61 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) if (!irqen) lvtt_value |= APIC_LVT_MASKED; - apic_write_around(APIC_LVTT, lvtt_value); + apic_write(APIC_LVTT, lvtt_value); /* * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); + +/* * Program the next event, relative to now */ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { - apic_write_around(APIC_TMICT, delta); + apic_write(APIC_TMICT, delta); return 0; } @@ -262,8 +424,8 @@ static void lapic_timer_setup(enum clock_event_mode mode, unsigned long flags; unsigned int v; - /* Lapic used for broadcast ? */ - if (!local_apic_timer_verify_ok) + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) return; local_irq_save(flags); @@ -278,7 +440,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); + apic_write(APIC_LVTT, v); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -302,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask) * Setup the local APIC timer for this CPU. Copy the initilized values * of the boot CPU and register the clock event in the framework. */ -static void __devinit setup_APIC_timer(void) +static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); @@ -372,39 +534,53 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) +static int __init calibrate_by_pmtimer(long deltapm, long *delta) +{ + const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; + const long pm_thresh = pm_100ms / 100; + unsigned long mult; + u64 res; + +#ifndef CONFIG_X86_PM_TIMER + return -1; +#endif + + apic_printk(APIC_VERBOSE, "... 
PM timer delta = %ld\n", deltapm); + + /* Check, if the PM timer is available */ + if (!deltapm) + return -1; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64)deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64)(*delta)) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long)res, *delta); + *delta = (long)res; + } + + return 0; +} + +static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; void (*real_handler)(struct clock_event_device *dev); unsigned long deltaj; - long delta, deltapm; + long delta; int pm_referenced = 0; - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (local_apic_timer_disabled) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - local_irq_disable(); /* Replace the global interrupt handler */ @@ -412,10 +588,10 @@ void __init setup_boot_APIC_clock(void) global_clock_event->event_handler = lapic_cal_handler; /* - * Setup the APIC counter to 1e9. There is no way the lapic + * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ - __setup_APIC_LVTT(1000000000, 0, 0); + __setup_APIC_LVTT(0xffffffff, 0, 0); /* Let the interrupts run */ local_irq_enable(); @@ -432,34 +608,9 @@ void __init setup_boot_APIC_clock(void) delta = lapic_cal_t1 - lapic_cal_t2; apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... 
PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } + /* we trust the PM based calibration if possible */ + pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, + &delta); /* Calculate the scaled math multiplication factor */ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, @@ -489,8 +640,6 @@ void __init setup_boot_APIC_clock(void) calibration_result / (1000000 / HZ), calibration_result % (1000000 / HZ)); - local_apic_timer_verify_ok = 1; - /* * Do a sanity check on the APIC calibration result */ @@ -498,13 +647,15 @@ void __init setup_boot_APIC_clock(void) local_irq_enable(); printk(KERN_WARNING "APIC frequency too slow, disabling apic timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; + return -1; } - /* We trust the pm timer based calibration */ + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* + * PM timer calibration failed or not turned on + * so lets try APIC timer based calibration + */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -536,34 +687,68 @@ void __init setup_boot_APIC_clock(void) if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); else - local_apic_timer_verify_ok = 0; + levt->features |= CLOCK_EVT_FEAT_DUMMY; } else local_irq_enable(); - if (!local_apic_timer_verify_ok) { + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); /* No broadcast on UP ! */ - if (num_possible_cpus() == 1) - return; - } else { - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; } + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. 
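The cross-check done by calibrate_by_pmtimer() above is plain integer arithmetic: the ACPI PM timer runs at 3.579545 MHz, so a 100ms window is pm_100ms = 357954 ticks; if the measured deltapm strays more than 1% from that, the lapic count is rescaled by pm_100ms / deltapm. A runnable rendition (the example numbers are made up):

    #include <stdint.h>
    #include <stdio.h>

    #define PMTMR_TICKS_PER_SEC 3579545UL   /* ACPI PM timer frequency */

    static long rescale_lapic_delta(long delta, long deltapm)
    {
        const long pm_100ms  = PMTMR_TICKS_PER_SEC / 10;  /* 357954 ticks */
        const long pm_thresh = pm_100ms / 100;            /* +/- 1% window */

        if (deltapm > pm_100ms - pm_thresh && deltapm < pm_100ms + pm_thresh)
            return delta;                   /* within 1%: trust the lapic count */
        return (long)(((int64_t)delta * pm_100ms) / deltapm);
    }

    int main(void)
    {
        /* lapic counted 1250000 while the PM timer saw a ~105ms window */
        printf("%ld\n", rescale_lapic_delta(1250000, 375852)); /* ~1190475 */
        return 0;
    }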
+ */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=%d!\n", nmi_watchdog); + /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } -void __devinit setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { setup_APIC_timer(); } @@ -598,7 +783,11 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ +#ifdef CONFIG_X86_64 + add_pda(apic_timer_irqs, 1); +#else per_cpu(irq_stat, cpu).apic_timer_irqs++; +#endif evt->event_handler(evt); } @@ -625,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -638,35 +830,6 @@ int setup_profiling_timer(unsigned int multiplier) } /* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} - -/* * Local APIC start and shutdown */ @@ -693,45 +856,41 @@ void clear_local_APIC(void) */ if (maxlvt >= 3) { v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); } /* * Careful: we have to set masks only first to deassert * any level-triggered sources. 
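That ordering rule deserves emphasis: mask LVTERR first with a non-zero vector (masking a zero-vector entry can itself raise an APIC error), then OR the mask bit into each remaining LVT so level-triggered sources are deasserted, and only afterwards write the final cleaned-out state. A stand-alone sketch, with an in-memory register file standing in for the MMIO accesses and illustrative indices:

    #include <stdint.h>

    #define APIC_LVT_MASKED (1u << 16)

    static uint32_t lvt_regs[8];                    /* stand-in register file */
    static uint32_t lvt_read(int i)              { return lvt_regs[i]; }
    static void     lvt_write(int i, uint32_t v) { lvt_regs[i] = v; }

    static void clear_lvts_sketch(int err_idx, int first, int count)
    {
        /* 1) mask LVTERR first, with any non-zero vector */
        lvt_write(err_idx, 0xfe | APIC_LVT_MASKED);

        /* 2) set the mask bit while keeping current contents,
         *    deasserting any level-triggered source */
        for (int i = first; i < first + count; i++)
            lvt_write(i, lvt_read(i) | APIC_LVT_MASKED);

        /* 3) only now leave clean state for other OSs */
        for (int i = first; i < first + count; i++)
            lvt_write(i, APIC_LVT_MASKED);
    }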
*/ v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); } /* lets not touch this if we didn't frob it */ -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif /* * Clean APIC state for other OSs: */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + apic_write(APIC_LVTPC, APIC_LVT_MASKED); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); -#endif /* Integrated APIC (!82489DX) ? */ if (lapic_is_integrated()) { if (maxlvt > 3) @@ -746,7 +905,7 @@ void clear_local_APIC(void) */ void disable_local_APIC(void) { - unsigned long value; + unsigned int value; clear_local_APIC(); @@ -756,8 +915,9 @@ void disable_local_APIC(void) */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); +#ifdef CONFIG_X86_32 /* * When LAPIC was disabled by the BIOS and enabled by the kernel, * restore the disabled state. @@ -769,6 +929,7 @@ void disable_local_APIC(void) l &= ~MSR_IA32_APICBASE_ENABLE; wrmsr(MSR_IA32_APICBASE, l, h); } +#endif } /* @@ -785,11 +946,15 @@ void lapic_shutdown(void) return; local_irq_save(flags); - clear_local_APIC(); - if (enabled_via_apicbase) +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif disable_local_APIC(); + local_irq_restore(flags); } @@ -834,6 +999,12 @@ int __init verify_local_APIC(void) */ reg0 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; /* * The next two are just to see if we have sane values. @@ -859,14 +1030,15 @@ void __init sync_Arb_IDs(void) */ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; + /* * Wait for idle. 
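The new check verify_local_APIC() gains above is a classic register writability round-trip: flip the ID bits, read back, restore. In xAPIC mode the ID occupies bits 31:24 of the ID register. A sketch of just that logic, with a plain variable standing in for the MMIO register:

    #include <stdint.h>

    #define APIC_ID_MASK (0xFFu << 24)   /* xAPIC ID field, bits 31:24 */

    static uint32_t id_reg;              /* stand-in for the real register */
    static uint32_t id_read(void)        { return id_reg; }
    static void     id_write(uint32_t v) { id_reg = v; }

    static int apic_id_is_writable(void)
    {
        uint32_t reg0 = id_read();
        id_write(reg0 ^ APIC_ID_MASK);   /* flip the ID bits */
        uint32_t reg1 = id_read();
        id_write(reg0);                  /* restore the original ID */
        return reg1 == (reg0 ^ APIC_ID_MASK);
    }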
*/ apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* @@ -874,7 +1046,7 @@ void __init sync_Arb_IDs(void) */ void __init init_bsp_APIC(void) { - unsigned long value; + unsigned int value; /* * Don't do the setup now if we have a SMP BIOS as the @@ -895,60 +1067,66 @@ void __init init_bsp_APIC(void) value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; +#ifdef CONFIG_X86_32 /* This bit is reserved on P4/Xeon and should be cleared */ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else +#endif value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up the virtual wire mode. */ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } static void __cpuinit lapic_setup_esr(void) { - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); + unsigned int oldvalue, value, maxlvt; - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write_around(APIC_LVTERR, value); + if (!lapic_is_integrated()) { + printk(KERN_INFO "No ESR for 82489DX.\n"); + return; + } + + if (esr_disable) { /* - * spec says clear errors after enabling vector. + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - if (esr_disable) - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - else - printk(KERN_INFO "No ESR for 82489DX.\n"); + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; } + + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + + /* + * spec says clear errors after enabling vector. 
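The ESR dance in the rewritten lapic_setup_esr() follows a fixed protocol: on parts with more than three LVT entries, write the ESR once before reading it (the Pentium erratum 3AP), program the error vector, then write and read again so the before/after values can be compared. Reduced to a sketch with stubbed accessors:

    static unsigned int esr_reg;                    /* stand-in register */
    static unsigned int esr_read(void)    { return esr_reg; }
    static void esr_write(unsigned int v) { (void)v; esr_reg = 0; } /* a write clears it */

    static unsigned int read_esr_sketch(int maxlvt)
    {
        if (maxlvt > 3)          /* dummy write first: Pentium erratum 3AP */
            esr_write(0);
        return esr_read();
    }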
+ */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08x after: 0x%08x\n", + oldvalue, value); } @@ -957,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void) */ void __cpuinit setup_local_APIC(void) { - unsigned long value, integrated; + unsigned int value; int i, j; +#ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { + if (lapic_is_integrated() && esr_disable) { apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); } +#endif - integrated = lapic_is_integrated(); + preempt_disable(); /* * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. */ if (!apic_id_registered()) - WARN_ON_ONCE(1); + BUG(); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -989,7 +1170,7 @@ void __cpuinit setup_local_APIC(void) */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); + apic_write(APIC_TASKPRI, value); /* * After a crash, we no longer service the interrupts and a pending @@ -1020,6 +1201,7 @@ void __cpuinit setup_local_APIC(void) */ value |= APIC_SPIV_APIC_ENABLED; +#ifdef CONFIG_X86_32 /* * Some unknown Intel IO/APIC (or APIC) errata is biting us with * certain networking cards. If high frequency interrupts are @@ -1040,14 +1222,19 @@ void __cpuinit setup_local_APIC(void) * See also the comment in end_level_ioapic_irq(). --macro */ - /* Enable focus processor (bit==0) */ + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif /* * Set spurious IRQ vector */ value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up LVT0, LVT1: @@ -1069,7 +1256,7 @@ void __cpuinit setup_local_APIC(void) apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); } - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); /* * only the BP should see the LINT1 NMI signal, obviously. 
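The LINT0/LINT1 policy being programmed here is small enough to state as two pure functions: the boot CPU keeps the 8259A wire (ExtINT) and the NMI line live, every other CPU masks both. The is_bsp flag is a stand-in for the MP-table and processor-id checks in the real code:

    #define APIC_DM_EXTINT  0x700u
    #define APIC_DM_NMI     0x400u
    #define APIC_LVT_MASKED (1u << 16)

    /* LVT0: the virtual-wire ExtINT line from the 8259A */
    static unsigned int lint0_value(int is_bsp)
    {
        return is_bsp ? APIC_DM_EXTINT
                      : (APIC_DM_EXTINT | APIC_LVT_MASKED);
    }

    /* LVT1: only the BP should see the NMI signal */
    static unsigned int lint1_value(int is_bsp)
    {
        return is_bsp ? APIC_DM_NMI
                      : (APIC_DM_NMI | APIC_LVT_MASKED);
    }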
@@ -1078,25 +1265,178 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!integrated) /* 82489DX */ + if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); + + preempt_enable(); } void __cpuinit end_local_APIC_setup(void) { - unsigned long value; - lapic_setup_esr(); - /* Disable the local apic timer */ - value = apic_read(APIC_LVTT); - value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, value); + +#ifdef CONFIG_X86_32 + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } +#endif setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } +#ifdef HAVE_X2APIC +void check_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + + if (msr & X2APIC_ENABLE) { + printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + x2apic_preenabled = x2apic = 1; + apic_ops = &x2apic_ops; + } +} + +void enable_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (!(msr & X2APIC_ENABLE)) { + printk("Enabling x2apic\n"); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); + } +} + +void enable_IR_x2apic(void) +{ +#ifdef CONFIG_INTR_REMAP + int ret; + unsigned long flags; + + if (!cpu_has_x2apic) + return; + + if (!x2apic_preenabled && disable_x2apic) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); + return; + } + + if (x2apic_preenabled && disable_x2apic) + panic("Bios already enabled x2apic, can't enforce nox2apic"); + + if (!x2apic_preenabled && skip_ioapic_setup) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); + return; + } + + ret = dmar_table_init(); + if (ret) { + printk(KERN_INFO + "dmar_table_init() failed with %d:\n", ret); + + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else + printk(KERN_INFO + "Not enabling x2apic,Intr-remapping\n"); + return; + } + + local_irq_save(flags); + mask_8259A(); + + ret = save_mask_IO_APIC_setup(); + if (ret) { + printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); + goto end; + } + + ret = enable_intr_remapping(1); + + if (ret && x2apic_preenabled) { + local_irq_restore(flags); + panic("x2apic enabled by bios. But IR enabling failed"); + } + + if (ret) + goto end_restore; + + if (!x2apic) { + x2apic = 1; + apic_ops = &x2apic_ops; + enable_x2apic(); + } + +end_restore: + if (ret) + /* + * IR enabling failed + */ + restore_IO_APIC_setup(); + else + reinit_intr_remapped_IO_APIC(x2apic_preenabled); + +end: + unmask_8259A(); + local_irq_restore(flags); + + if (!ret) { + if (!x2apic_preenabled) + printk(KERN_INFO + "Enabled x2apic and interrupt-remapping\n"); + else + printk(KERN_INFO + "Enabled Interrupt-remapping\n"); + } else + printk(KERN_ERR + "Failed to enable Interrupt-remapping and x2apic\n"); +#else + if (!cpu_has_x2apic) + return; + + if (x2apic_preenabled) + panic("x2apic enabled prior OS handover," + " enable CONFIG_INTR_REMAP"); + + printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); +#endif + + return; +} +#endif /* HAVE_X2APIC */ + +#ifdef CONFIG_X86_64 +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. 
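check_x2apic() and enable_x2apic() above both reduce to one MSR operation: bit 10 (EXTD) of IA32_APIC_BASE (MSR 0x1b) selects x2APIC mode, and the kernel sets it only if the BIOS has not already. A sketch with stubbed rdmsr/wrmsr; the initial value is a plausible example, not read from hardware:

    #include <stdint.h>

    #define MSR_IA32_APICBASE 0x1b
    #define X2APIC_ENABLE     (1u << 10)    /* EXTD bit */

    static uint64_t apicbase = 0x00000000fee00900ull;  /* base | enable | BSP */
    static uint64_t rdmsr_stub(uint32_t msr)             { (void)msr; return apicbase; }
    static void     wrmsr_stub(uint32_t msr, uint64_t v) { (void)msr; apicbase = v; }

    static void enable_x2apic_sketch(void)
    {
        uint64_t base = rdmsr_stub(MSR_IA32_APICBASE);
        if (!(base & X2APIC_ENABLE))                    /* BIOS may have set it */
            wrmsr_stub(MSR_IA32_APICBASE, base | X2APIC_ENABLE);
    }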
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_physical_apicid = 0; + return 0; +} +#else /* * Detect and initialize APIC */ @@ -1175,12 +1515,46 @@ no_apic: printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +#endif + +#ifdef CONFIG_X86_64 +void __init early_init_lapic_mapping(void) +{ + unsigned long phys_addr; + + /* + * If no local APIC can be found then go out + * : it means there is no mpatable and MADT + */ + if (!smp_found_config) + return; + + phys_addr = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, phys_addr); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_physical_apicid = read_apic_id(); +} +#endif /** * init_apic_mappings - initialize APIC mappings */ void __init init_apic_mappings(void) { +#ifdef HAVE_X2APIC + if (x2apic) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } +#endif + /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -1193,30 +1567,36 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, - apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); - + boot_cpu_physical_apicid = read_apic_id(); } /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ - int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { - if (disable_apic) - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - +#ifdef CONFIG_X86_64 + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } +#else if (!smp_found_config && !cpu_has_apic) return -1; @@ -1225,39 +1605,68 @@ int __init APIC_init_uniprocessor(void) */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } +#endif - verify_local_APIC(); +#ifdef HAVE_X2APIC + enable_IR_x2apic(); +#endif +#ifdef CONFIG_X86_64 + setup_apic_routing(); +#endif + verify_local_APIC(); connect_bsp_APIC(); +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else /* * Hack: In case of kdump, after a crash, kernel might be booting * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid * might be zero if read from MP tables. Get it from LAPIC. 
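In smp_spurious_interrupt() a little further down, the ACK decision hinges on the in-service register: ISR is 8 x 32 bits, so the word index is vector >> 5 and the bit index vector & 0x1f, and the spurious vector is ACKed only if its ISR bit is actually set. Modeled on an in-memory copy of ISR (the MMIO word spacing is left out):

    #include <stdint.h>

    static uint32_t isr_words[8];           /* stand-in for the 256-bit ISR */

    static int should_ack(unsigned int vector)
    {
        uint32_t w = isr_words[vector >> 5];    /* which 32-bit word   */
        return (w >> (vector & 0x1f)) & 1;      /* which bit within it */
    }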
*/ -#ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); +#ifdef CONFIG_X86_64 + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +#endif + #ifdef CONFIG_X86_IO_APIC if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) #endif localise_nmi_watchdog(); end_local_APIC_setup(); + #ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif #endif + +#ifdef CONFIG_X86_64 + setup_boot_APIC_clock(); + check_nmi_watchdog(); +#else setup_boot_clock(); +#endif return 0; } @@ -1271,8 +1680,11 @@ int __init APIC_init_uniprocessor(void) */ void smp_spurious_interrupt(struct pt_regs *regs) { - unsigned long v; + u32 v; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1283,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); +#ifdef CONFIG_X86_64 + add_pda(irq_spurious_count, 1); +#else /* see sw-dev-man vol 3, chapter 7.4.13.5 */ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); __get_cpu_var(irq_stat).irq_spurious_count++; +#endif irq_exit(); } @@ -1295,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs) */ void smp_error_interrupt(struct pt_regs *regs) { - unsigned long v, v1; + u32 v, v1; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1315,64 +1734,17 @@ void smp_error_interrupt(struct pt_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); irq_exit(); } -#ifdef CONFIG_SMP -void __init smp_intr_init(void) -{ - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); - - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. 
- */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); -} -#endif - -/* - * Initialize APIC interrupts - */ -void __init apic_intr_init(void) -{ -#ifdef CONFIG_SMP - smp_intr_init(); -#endif - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -} - /** * connect_bsp_APIC - attach the APIC to the interrupt system */ void __init connect_bsp_APIC(void) { +#ifdef CONFIG_X86_32 if (pic_mode) { /* * Do not trust the local APIC being empty at bootup. @@ -1387,6 +1759,7 @@ void __init connect_bsp_APIC(void) outb(0x70, 0x22); outb(0x01, 0x23); } +#endif enable_apic_mode(); } @@ -1399,6 +1772,9 @@ void __init connect_bsp_APIC(void) */ void disconnect_bsp_APIC(int virt_wire_setup) { + unsigned int value; + +#ifdef CONFIG_X86_32 if (pic_mode) { /* * Put the board back into PIC mode (has an effect only on @@ -1410,56 +1786,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) "entering PIC mode.\n"); outb(0x70, 0x22); outb(0x00, 0x23); - } else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - } + return; + } +#endif + + /* Go back to Virtual Wire compatibility mode */ + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + if (!virt_wire_setup) { /* - * For LVT1 make it edge triggered, active high, nmi and - * enabled + * For LVT0 make it edge triggered, active high, + * external and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); } -} -unsigned int __cpuinitdata maxcpus = NR_CPUS; + /* + * For 
LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} void __cpuinit generic_processor_info(int apicid, int version) { int cpu; cpumask_t tmp_map; - physid_mask_t phys_cpu; /* * Validate version @@ -1472,36 +1845,29 @@ void __cpuinit generic_processor_info(int apicid, int version) } apic_version[apicid] = version; - phys_cpu = apicid_to_cpu_present(apicid); - physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); - if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - num_processors++; cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); - if (apicid == boot_cpu_physical_apicid) + physid_set(apicid, phys_cpu_present_map); + if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; - + } if (apicid > max_physical_apicid) max_physical_apicid = apicid; +#ifdef CONFIG_X86_32 /* * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y * but we need to work other dependencies like SMP_SUSPEND etc @@ -1521,7 +1887,9 @@ void __cpuinit generic_processor_info(int apicid, int version) def_to_bigsmp = 1; } } -#ifdef CONFIG_SMP +#endif + +#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) /* are we being called early in kernel startup? */ if (early_per_cpu_ptr(x86_cpu_to_apicid)) { u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); @@ -1534,16 +1902,29 @@ void __cpuinit generic_processor_info(int apicid, int version) per_cpu(x86_bios_cpu_apicid, cpu) = apicid; } #endif + cpu_set(cpu, cpu_possible_map); cpu_set(cpu, cpu_present_map); } +#ifdef CONFIG_X86_64 +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} +#endif + /* * Power management */ #ifdef CONFIG_PM static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ int active; /* r/w apic fields */ unsigned int apic_id; @@ -1584,7 +1965,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif @@ -1608,16 +1989,23 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! 
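The reworked generic_processor_info() above assigns logical CPU numbers with one exception baked in: the first free slot in the present mask is used, but the boot CPU's APIC id always claims slot 0 so x86_bios_cpu_apicid stays ordered. A plain-array sketch, with NR_CPUS shrunk for illustration:

    #define NR_CPUS_SKETCH 8

    static unsigned char cpu_present_sketch[NR_CPUS_SKETCH];

    static int pick_logical_cpu(int apicid, int boot_apicid)
    {
        if (apicid == boot_apicid)
            return 0;                 /* BSP is always logical cpu 0 */
        for (int i = 0; i < NR_CPUS_SKETCH; i++)
            if (!cpu_present_sketch[i])
                return i;             /* first_cpu(~cpu_present_map) */
        return -1;                    /* NR_CPUS reached: processor ignored */
    }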
- */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); +#ifdef HAVE_X2APIC + if (x2apic) + enable_x2apic(); + else +#endif + { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); @@ -1627,7 +2015,7 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif @@ -1641,7 +2029,9 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; } @@ -1661,7 +2051,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __devinit apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -1687,30 +2077,101 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ +#ifdef CONFIG_X86_64 /* - * APIC command line parameters + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. */ -static int __init parse_lapic(char *arg) +__cpuinit int apic_is_clustered_box(void) { - force_enable_local_apic = 1; - return 0; + int i, clusters, zeros; + unsigned id; + u16 *bios_cpu_apicid; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + /* + * there is not this kind of box with AMD CPU yet. + * Some AMD box with quadcore cpu and 8 sockets apicid + * will be [4, 0x23] or [8, 0x27] could be thought to + * vsmp box still need checking... + */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) + return 0; + + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. 
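The counting rule just described, runnable on a plain byte-per-cluster map: a hole between two populated clusters still counts (clusters += 1 + zeros), a trailing hole does not. More than two clusters is taken as multi-chassis and the TSC is not trusted:

    static int count_clusters(const unsigned char *map, int nclusters)
    {
        int clusters = 0, zeros = 0;
        for (int i = 0; i < nclusters; i++) {
            if (map[i]) {
                clusters += 1 + zeros;   /* close any bounded gap */
                zeros = 0;
            } else {
                ++zeros;                 /* counts only if bounded by ones */
            }
        }
        return clusters;                 /* > 2 => likely multi-chassis */
    }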
+ */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards + */ + if (is_vsmp_box() && clusters > 1) + return 1; + + /* + * If clusters > 2, then should be multi-chassis. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); } -early_param("lapic", parse_lapic); +#endif -static int __init parse_nolapic(char *arg) +/* + * APIC command line parameters + */ +static int __init setup_disableapic(char *arg) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } -early_param("nolapic", parse_nolapic); +early_param("disableapic", setup_disableapic); -static int __init parse_disable_lapic_timer(char *arg) +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) { - local_apic_timer_disabled = 1; - return 0; + return setup_disableapic(arg); } -early_param("nolapic_timer", parse_disable_lapic_timer); +early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) { @@ -1719,15 +2180,43 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static int __init apic_set_verbosity(char *str) +static int __init parse_disable_apic_timer(char *arg) { - if (strcmp("debug", str) == 0) + disable_apic_timer = 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) + else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; - return 1; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; } -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); static int __init lapic_insert_resource(void) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c deleted file mode 100644 index 1e3d32e27c14..000000000000 --- a/arch/x86/kernel/apic_64.c +++ /dev/null @@ -1,1393 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. 
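The apic= handler above also shows the behavioral difference of the __setup-to-early_param conversion: an early_param callback returns 0 for success and nonzero to flag a bad argument, and may now see a missing argument. Reduced to portable C (return values mirror 0/-EINVAL; the 64-bit special case for a bare "apic" is left out):

    #include <string.h>

    enum { APIC_QUIET, APIC_VERBOSE_SK, APIC_DEBUG_SK };
    static int verbosity_sketch = APIC_QUIET;

    static int set_apic_verbosity(const char *arg)
    {
        if (!arg)
            return -1;                          /* -EINVAL in the kernel */
        if (strcmp(arg, "debug") == 0)
            verbosity_sketch = APIC_DEBUG_SK;
        else if (strcmp(arg, "verbose") == 0)
            verbosity_sketch = APIC_VERBOSE_SK;
        else
            return -1;                          /* unknown level: reject */
        return 0;
    }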
- */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/ioport.h> -#include <linux/clockchips.h> -#include <linux/acpi_pmtmr.h> -#include <linux/module.h> - -#include <asm/atomic.h> -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/hpet.h> -#include <asm/pgalloc.h> -#include <asm/nmi.h> -#include <asm/idle.h> -#include <asm/proto.h> -#include <asm/timex.h> -#include <asm/apic.h> - -#include <mach_ipi.h> -#include <mach_apic.h> - -static int disable_apic_timer __cpuinitdata; -static int apic_calibrate_pmtmr __initdata; -int disable_apic; - -/* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; -EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); - -/* - * Debug level, exported for io_apic.c - */ -int apic_verbosity; - -/* Have we found an MP table */ -int smp_found_config; - -static struct resource lapic_resource = { - .name = "Local APIC", - .flags = IORESOURCE_MEM | IORESOURCE_BUSY, -}; - -static unsigned int calibration_result; - -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); -static void apic_pm_activate(void); - -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - -static unsigned long apic_phys; - -unsigned long mp_lapic_addr; - -unsigned int __cpuinitdata maxcpus = NR_CPUS; -/* - * Get the LAPIC version - */ -static inline int lapic_get_version(void) -{ - return GET_APIC_VERSION(apic_read(APIC_LVR)); -} - -/* - * Check, if the APIC is integrated or a seperate chip - */ -static inline int lapic_is_integrated(void) -{ - return 1; -} - -/* - * Check, whether this is a modern or a first generation APIC - */ -static int modern_apic(void) -{ - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - return lapic_get_version() >= 0x14; -} - -void apic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 safe_apic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); - - return send_status; -} - -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - apic_write(APIC_LVT0, v); -} - -/** - * lapic_get_maxlvt - get the maximum number of local vector table entries - */ -int lapic_get_maxlvt(void) -{ - unsigned int v, maxlvt; - - v = apic_read(APIC_LVR); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; -} - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. 
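__setup_APIC_LVTT(), whose deleted 64-bit copy follows, is essentially pure value computation: build the LVTT word from vector, periodic bit and mask bit, select divide-by-16 in TDCR, and load the initial count. A sketch that keeps just the computation; register writes are omitted, the vector value is the common one for this era, and note that the surviving 32-bit variant scales the count by APIC_DIVISOR while the deleted 64-bit one did not:

    #include <stdint.h>

    #define LOCAL_TIMER_VECTOR      0xef
    #define APIC_LVT_TIMER_PERIODIC (1u << 17)
    #define APIC_LVT_MASKED         (1u << 16)
    #define APIC_TDR_DIV_16         0x3

    struct lvtt_prog { uint32_t lvtt; uint32_t tdcr_div; uint32_t tmict; };

    static struct lvtt_prog make_lvtt(uint32_t clocks, int oneshot, int irqen)
    {
        struct lvtt_prog p;
        p.lvtt = LOCAL_TIMER_VECTOR;
        if (!oneshot)
            p.lvtt |= APIC_LVT_TIMER_PERIODIC;
        if (!irqen)
            p.lvtt |= APIC_LVT_MASKED;      /* no interrupt during calibration */
        p.tdcr_div = APIC_TDR_DIV_16;       /* divide PICLK by 16 */
        p.tmict = oneshot ? 0 : clocks / 16; /* APIC_DIVISOR == 16 */
        return p;
    }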
During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks); -} - -/* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} - -/* - * Program the next event, relative to now - */ -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - apic_write(APIC_TMICT, delta); - return 0; -} - -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long flags; - unsigned int v; - - /* Lapic used as dummy for broadcast ? */ - if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; - - local_irq_save(flags); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } - - local_irq_restore(flags); -} - -/* - * Local APIC timer broadcast function - */ -static void lapic_timer_broadcast(cpumask_t mask) -{ -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#endif -} - -/* - * Setup the local APIC timer for this CPU. Copy the initilized values - * of the boot CPU and register the clock event in the framework. - */ -static void setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. 
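The deleted 64-bit calibrate_APIC_clock() below needs no clockevents framework at all: it runs the APIC countdown and the TSC side by side, then converts counted APIC ticks to a frequency through the known TSC rate, result = apic_ticks * 1000 * tsc_khz / tsc_ticks, in Hz. Worked through with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t tsc_khz    = 2400000;    /* 2.4 GHz TSC */
        uint64_t apic_ticks = 10000000;   /* APIC counted down this much...     */
        uint64_t tsc_ticks  = 120000000;  /* ...while the TSC advanced this (50 ms) */

        /* same formula as the deleted code: result is in Hz */
        uint64_t result = apic_ticks * 1000u * tsc_khz / tsc_ticks;

        printf("Detected %llu.%03llu MHz APIC timer.\n",
               (unsigned long long)(result / 1000 / 1000),
               (unsigned long long)(result / 1000 % 1000));  /* 200.000 MHz */
        return 0;
    }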
- * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static void __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = result / HZ; -} - -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) -{ - /* - * The local apic timer can be disabled via the kernel commandline. - * Register the lapic timer as a dummy clock event source on SMP - * systems, so the broadcast mechanism is used. On UP systems simply - * ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); - - setup_APIC_timer(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(); -} - -/* - * The guts of the apic timer interrupt - */ -static void local_apic_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. 
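Stepping back to the TSC leg of calibrate_APIC_clock() above, the arithmetic is plain cross-timing: APIC timer ticks consumed, scaled by the TSC frequency, give the APIC timer frequency in Hz, and dividing by HZ yields the ticks-per-jiffy value stored in calibration_result. A worked example with made-up counter readings:

#include <stdio.h>

int main(void)
{
	/* Made-up sample readings, purely illustrative. */
	unsigned long long apic_ticks = 15625000;  /* TMCCT counts consumed  */
	unsigned long long tsc_ticks  = 250000000; /* TSC delta, same window */
	unsigned long long tsc_khz    = 2000000;   /* i.e. a 2 GHz TSC       */
	unsigned long long hz         = 1000;      /* CONFIG_HZ              */

	/* Same expression as the hunk above:
	 * result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start) */
	unsigned long long result = apic_ticks * 1000 * tsc_khz / tsc_ticks;

	printf("Detected %llu.%03llu MHz APIC timer.\n",
	       result / 1000 / 1000, result / 1000 % 1000);
	printf("calibration_result (ticks per jiffy): %llu\n", result / hz);
	return 0;
}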
- * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - local_apic_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - - -/* - * Local APIC start and shutdown - */ - -/** - * clear_local_APIC - shutdown the local APIC - * - * This is called, when a CPU is disabled and before rebooting, so the state of - * the local APIC has no dangling leftovers. Also used to cleanout any BIOS - * leftovers during boot. - */ -void clear_local_APIC(void) -{ - int maxlvt; - u32 v; - - /* APIC hasn't been mapped yet */ - if (!apic_phys) - return; - - maxlvt = lapic_get_maxlvt(); - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* - * Clean APIC state for other OSs: - */ - apic_write(APIC_LVTT, APIC_LVT_MASKED); - apic_write(APIC_LVT0, APIC_LVT_MASKED); - apic_write(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write(APIC_LVTPC, APIC_LVT_MASKED); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); -} - -/** - * disable_local_APIC - clear and disable the local APIC - */ -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write(APIC_SPIV, value); -} - -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - - disable_local_APIC(); - - local_irq_restore(flags); -} - -/* - * This is to verify that we're looking at a real local APIC. 
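clear_local_APIC() above is deliberately two-phased: every LVT entry is first masked (LVTERR first, because masking an entry whose vector is still zero can itself raise an APIC error), and only then are the entries overwritten, so level-triggered sources are deasserted before the state is cleaned for the next OS. The shape of that sequence, modeled over fake registers:

#include <stdio.h>
#include <stdint.h>

#define LVT_MASKED	(1u << 16)
#define NLVT		4

int main(void)
{
	/* Fake LVT entries holding arbitrary vectors (0x31..0x34). */
	uint32_t lvt[NLVT] = { 0x31, 0x32, 0x33, 0x34 };
	int i;

	/* Phase 1: only set the mask bits (the real code masks LVTERR
	 * first), leaving vectors intact so nothing fires half-cleared. */
	for (i = 0; i < NLVT; i++)
		lvt[i] |= LVT_MASKED;

	/* Phase 2: with all sources quiesced, clean the entries. */
	for (i = 0; i < NLVT; i++)
		lvt[i] = LVT_MASKED;

	for (i = 0; i < NLVT; i++)
		printf("LVT%d = %#x\n", i, (unsigned)lvt[i]);
	return 0;
}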
- * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = read_apic_id(); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = read_apic_id(); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -/** - * sync_Arb_IDs - synchronize APIC bus arbitration IDs - */ -void __init sync_Arb_IDs(void) -{ - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - if (modern_apic()) - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); -} - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - value = apic_read(APIC_LVR); - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - apic_write(APIC_LVT1, value); -} - -/** - * setup_local_APIC - setup the local APIC - */ -void __cpuinit setup_local_APIC(void) -{ - unsigned int value; - int i, j; - - preempt_disable(); - value = apic_read(APIC_LVR); - - BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. 
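Back in verify_local_APIC() above, the probe works by XORing a register with an all-ones mask, writing it back, and re-reading: a genuinely read-only register (the version register) must come back unchanged, while a read/write one (the ID register) must come back flipped. The idiom in isolation, over a simulated register:

#include <stdio.h>
#include <stdint.h>

static uint32_t fake_reg = 0x00050014;	/* invented register contents */
static const int read_only = 1;		/* set to 0 to model an ID-style register */

static uint32_t reg_read(void) { return fake_reg; }

static void reg_write(uint32_t v)
{
	if (!read_only)
		fake_reg = v;
}

int main(void)
{
	uint32_t before = reg_read(), after;

	reg_write(before ^ 0xffffffffu);	/* try to flip every bit */
	after = reg_read();

	printf("probe says: %s\n", after == before ?
	       "read-only (what a real version register must be)" :
	       "read/write (what a real ID register must be)");
	return 0;
}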
- */
- value = apic_read(APIC_TASKPRI);
- value &= ~APIC_TPRI_MASK;
- apic_write(APIC_TASKPRI, value);
-
- /*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought the
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serviced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
- */
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
- }
- }
-
- /*
- * Now that we are all set up, enable the APIC
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- /*
- * Enable APIC
- */
- value |= APIC_SPIV_APIC_ENABLED;
-
- /* We always use processor focus */
-
- /*
- * Set spurious IRQ vector
- */
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
-
- /*
- * Set up LVT0, LVT1:
- *
- * set up through-local-APIC on the BP's LINT0. This is not
- * strictly necessary in pure symmetric-IO mode, but sometimes
- * we delegate interrupts to the 8259A.
- */
- /*
- * TODO: set up through-local-APIC from through-I/O-APIC? --macro
- */
- value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && !value) {
- value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
- } else {
- value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
- }
- apic_write(APIC_LVT0, value);
-
- /*
- * only the BP should see the LINT1 NMI signal, obviously.
- */
- if (!smp_processor_id())
- value = APIC_DM_NMI;
- else
- value = APIC_DM_NMI | APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
- preempt_enable();
-}
-
-static void __cpuinit lapic_setup_esr(void)
-{
- unsigned maxlvt = lapic_get_maxlvt();
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
-}
-
-void __cpuinit end_local_APIC_setup(void)
-{
- lapic_setup_esr();
- setup_apic_nmi_watchdog(NULL);
- apic_pm_activate();
-}
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- * On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.)
- */
-static int __init detect_init_APIC(void)
-{
- if (!cpu_has_apic) {
- printk(KERN_INFO "No local APIC present\n");
- return -1;
- }
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_physical_apicid = 0;
- return 0;
-}
-
-void __init early_init_lapic_mapping(void)
-{
- unsigned long phys_addr;
-
- /*
- * If no local APIC can be found then bail out:
- * it means there is no MP table and no MADT
- */
- if (!smp_found_config)
- return;
-
- phys_addr = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, phys_addr);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
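The crash-leftover sweep in setup_local_APIC() above walks the eight 32-bit in-service registers from the highest vector down and issues one EOI per set bit. The same loop over a fake ISR array, with the stuck timer vector 0x31 from the comment as the sample:

#include <stdio.h>
#include <stdint.h>

#define ISR_WORDS 8	/* 8 x 32 bits covers vectors 0..255 */

static uint32_t isr[ISR_WORDS];

static void ack_irq(int vector) { printf("EOI for vector 0x%02x\n", vector); }

int main(void)
{
	int i, j;

	isr[1] |= 1u << 17;	/* vector 0x31 = word 1, bit 17, still in service */

	/* Sweep high to low, one EOI per set ISR bit. */
	for (i = ISR_WORDS - 1; i >= 0; i--)
		for (j = 31; j >= 0; j--)
			if (isr[i] & (1u << j))
				ack_irq(i * 32 + j);
	return 0;
}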
- */ - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); -} - -/** - * init_apic_mappings - initialize APIC mappings - */ -void __init init_apic_mappings(void) -{ - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor(void) -{ - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } - - verify_local_APIC(); - - connect_bsp_APIC(); - - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); - - setup_local_APIC(); - - /* - * Now enable IO-APICs, actually call clear_IO_APIC - * We need clear_IO_APIC before enabling vector on BP - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); - - if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) - localise_nmi_watchdog(); - end_local_APIC_setup(); - - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - setup_boot_APIC_clock(); - check_nmi_watchdog(); - return 0; -} - -/* - * Local APIC interrupts - */ - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - exit_idle(); - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - - add_pda(irq_spurious_count, 1); - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - exit_idle(); - irq_enter(); - /* First tickle the hardware, only then report what went on. 
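The ISR indexing in smp_spurious_interrupt() above looks cryptic but is simple address math: the in-service bits are spread over 0x10-spaced 32-bit registers, so a vector's register offset is (vector / 32) * 0x10, which is exactly (vector & ~0x1f) >> 1, and its bit position is vector % 32. Worked through for a sample vector:

#include <stdio.h>

int main(void)
{
	unsigned vector = 0xff;	/* sample vector, e.g. a spurious one */

	/* ISR registers sit 0x10 apart and cover 32 vectors each, so:
	 *   offset = (vector / 32) * 0x10 == (vector & ~0x1f) >> 1
	 *   bit    =  vector % 32        ==  vector & 0x1f       */
	unsigned offset = (vector & ~0x1fu) >> 1;
	unsigned bit = vector & 0x1f;

	printf("vector %#x -> ISR register at offset %#x, bit %u\n",
	       vector, offset, bit);
	return 0;
}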
-- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -/** - * * connect_bsp_APIC - attach the APIC to the interrupt system - * */ -void __init connect_bsp_APIC(void) -{ - enable_apic_mode(); -} - -void disconnect_bsp_APIC(int virt_wire_setup) -{ - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - -void __cpuinit generic_processor_info(int apicid, int version) -{ - int cpu; - cpumask_t tmp_map; - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(apicid, phys_cpu_present_map); - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - - /* are we being called early in kernel startup? 
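The error bits listed in smp_error_interrupt() above translate mechanically into a decoder. A userspace sketch over a sample (invented) ESR value:

#include <stdio.h>

static const char *esr_bits[8] = {
	"Send CS error", "Receive CS error",
	"Send accept error", "Receive accept error",
	"Reserved", "Send illegal vector",
	"Received illegal vector", "Illegal register address",
};

int main(void)
{
	unsigned esr = 0x40;	/* sample value: received illegal vector */
	int i;

	for (i = 0; i < 8; i++)
		if (esr & (1u << i))
			printf("APIC error bit %d: %s\n", i, esr_bits[i]);
	return 0;
}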
*/ - if (early_per_cpu_ptr(x86_cpu_to_apicid)) { - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - - cpu_to_apicid[cpu] = apicid; - bios_cpu_apicid[cpu] = apicid; - } else { - per_cpu(x86_cpu_to_apicid, cpu) = apicid; - per_cpu(x86_bios_cpu_apicid, cpu) = apicid; - } - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -/* - * Power management - */ -#ifdef CONFIG_PM - -static struct { - /* 'active' is true if the local APIC was enabled by us and - not the BIOS; this signifies that we are also responsible - for disabling it before entering apm/acpi suspend */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - apic_pm_state.apic_id = read_apic_id(); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_INTEL - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - local_irq_save(flags); - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_INTEL - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; -} - -static struct sysdev_class lapic_sysclass = { - .name = "lapic", - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id 
= 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } - else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } - else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) - return 1; - - /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. 
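The cluster-counting loop in apic_is_clustered_box() above only credits a run of empty clusters once a later populated cluster proves the run is interior, which is how partially populated chassis avoid undercounting. The same loop over a toy bitmap:

#include <stdio.h>

int main(void)
{
	/* Toy cluster bitmap: clusters 0, 1 and 3 populated, 2 empty. */
	int clustermap[] = { 1, 1, 0, 1, 0, 0 };
	int n = sizeof(clustermap) / sizeof(clustermap[0]);
	int i, clusters = 0, zeros = 0;

	/* Count zeros only when they are bounded by ones. */
	for (i = 0; i < n; i++) {
		if (clustermap[i]) {
			clusters += 1 + zeros;
			zeros = 0;
		} else {
			++zeros;
		}
	}
	/* Prints 4: the interior gap at index 2 is counted, the
	 * trailing zeros are not. */
	printf("clusters = %d\n", clusters);
	return 0;
}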
- */ - return (clusters > 2); -} - -/* - * APIC command line parameters - */ -static int __init apic_set_verbosity(char *str) -{ - if (str == NULL) { - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; - } - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -static __init int setup_disableapic(char *str) -{ - disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - return 0; -} -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) -{ - return setup_disableapic(str); -} -early_param("nolapic", setup_nolapic); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static __init int setup_noapictimer(char *str) -{ - if (str[0] != ' ' && str[0] != 0) - return 0; - disable_apic_timer = 1; - return 1; -} -__setup("noapictimer", setup_noapictimer); - -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); - -static int __init lapic_insert_resource(void) -{ - if (!apic_phys) - return -1; - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - return 0; -} - -/* - * need call insert after e820_reserve_resources() - * that is using request_resource - */ -late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9b441331e9..5145a6e72bbb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -219,7 +219,6 @@ #include <linux/time.h> #include <linux/sched.h> #include <linux/pm.h> -#include <linux/pm_legacy.h> #include <linux/capability.h> #include <linux/device.h> #include <linux/kernel.h> @@ -229,12 +228,12 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> -#include <linux/smp_lock.h> #include <asm/system.h> #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/i8253.h> +#include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> @@ -2218,7 +2217,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled()) { + if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bacf5deeec2d..7fcf63d22f8b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -18,9 +18,11 @@ #include <asm/ia32.h> #include <asm/bootparam.h> +#include <xen/interface/xen.h> + #define __NO_STUBS 1 #undef __SYSCALL -#undef _ASM_X86_64_UNISTD_H_ +#undef _ASM_X86_UNISTD_64_H #define __SYSCALL(nr, sym) [nr] = 1, static char syscalls[] = { #include <asm/unistd.h> @@ -131,5 +133,14 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + + BLANK(); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); +#ifdef CONFIG_XEN 
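For context on the asm-offsets hunk here: assembly sources cannot evaluate offsetof(), so the OFFSET() macro makes the C compiler emit each field offset where the build can scrape it into asm-offsets.h. Stripped of the kernel plumbing, what gets recorded is just an offsetof() result; the struct below is a simplified stand-in, not the real xen vcpu_info definition:

#include <stddef.h>
#include <stdio.h>

/* Stand-in for struct vcpu_info from xen/interface/xen.h;
 * the field order here is illustrative only. */
struct vcpu_info_like {
	unsigned char evtchn_upcall_pending;
	unsigned char evtchn_upcall_mask;
};

int main(void)
{
	/* Roughly what OFFSET(XEN_vcpu_info_mask, vcpu_info,
	 * evtchn_upcall_mask) records, printed instead of emitted as asm. */
	printf("XEN_vcpu_info_mask = %zu\n",
	       offsetof(struct vcpu_info_like, evtchn_upcall_mask));
	return 0;
}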
+ BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#undef ENTRY +#endif return 0; } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c new file mode 100644 index 000000000000..f0dfe6f17e7e --- /dev/null +++ b/arch/x86/kernel/bios_uv.c @@ -0,0 +1,141 @@ +/* + * BIOS run time interface routines. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include <linux/efi.h> +#include <asm/efi.h> +#include <linux/io.h> +#include <asm/uv/bios.h> +#include <asm/uv/uv_hub.h> + +struct uv_systab uv_systab; + +s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) +{ + struct uv_systab *tab = &uv_systab; + + if (!tab->function) + /* + * BIOS does not support UV systab + */ + return BIOS_STATUS_UNIMPLEMENTED; + + return efi_call6((void *)__va(tab->function), + (u64)which, a1, a2, a3, a4, a5); +} + +s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) +{ + unsigned long bios_flags; + s64 ret; + + local_irq_save(bios_flags); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + local_irq_restore(bios_flags); + + return ret; +} + +s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) +{ + s64 ret; + + preempt_disable(); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + preempt_enable(); + + return ret; +} + + +long sn_partition_id; +EXPORT_SYMBOL_GPL(sn_partition_id); +long uv_coherency_id; +EXPORT_SYMBOL_GPL(uv_coherency_id); +long uv_region_size; +EXPORT_SYMBOL_GPL(uv_region_size); +int uv_type; + + +s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, + long *region) +{ + s64 ret; + u64 v0, v1; + union partition_info_u part; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, + (u64)(&v0), (u64)(&v1), 0, 0); + if (ret != BIOS_STATUS_SUCCESS) + return ret; + + part.val = v0; + if (uvtype) + *uvtype = part.hub_version; + if (partid) + *partid = part.partition_id; + if (coher) + *coher = part.coherence_id; + if (region) + *region = part.region_size; + return ret; +} + + +s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) +{ + return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, + (u64)ticks_per_second, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_freq_base); + + +#ifdef CONFIG_EFI +void uv_bios_init(void) +{ + struct uv_systab *tab; + + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + (efi.uv_systab == (unsigned long)NULL)) { + printk(KERN_CRIT "No EFI UV System Table.\n"); + uv_systab.function = (unsigned long)NULL; + return; + } + + tab = (struct uv_systab *)ioremap(efi.uv_systab, + sizeof(struct uv_systab)); + if (strncmp(tab->signature, "UVST", 4) != 0) + printk(KERN_ERR "bad signature in UV system 
table!"); + + /* + * Copy table to permanent spot for later use. + */ + memcpy(&uv_systab, tab, sizeof(struct uv_systab)); + iounmap(tab); + + printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision); +} +#else /* !CONFIG_EFI */ + +void uv_bios_init(void) { } +#endif + diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore new file mode 100644 index 000000000000..667df55a4399 --- /dev/null +++ b/arch/x86/kernel/cpu/.gitignore @@ -0,0 +1 @@ +capflags.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index ee76eaad3001..82ec6075c057 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -3,22 +3,30 @@ # obj-y := intel_cacheinfo.o addon_cpuid_features.o -obj-y += proc.o feature_names.o - -obj-$(CONFIG_X86_32) += common.o bugs.o -obj-$(CONFIG_X86_64) += common_64.o bugs_64.o -obj-$(CONFIG_X86_32) += amd.o -obj-$(CONFIG_X86_64) += amd_64.o -obj-$(CONFIG_X86_32) += cyrix.o -obj-$(CONFIG_X86_32) += centaur.o -obj-$(CONFIG_X86_64) += centaur_64.o -obj-$(CONFIG_X86_32) += transmeta.o -obj-$(CONFIG_X86_32) += intel.o -obj-$(CONFIG_X86_64) += intel_64.o -obj-$(CONFIG_X86_32) += umc.o +obj-y += proc.o capflags.o powerflags.o common.o + +obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_64) += bugs_64.o + +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_AMD) += amd.o +obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o +obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o +obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o +obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o +obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ + +cpufeature = $(src)/../../include/asm/cpufeature.h + +targets += capflags.c +$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE + $(call if_changed,mkcapflags) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 84a8220a6072..0d9c993aa93e 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -7,6 +7,8 @@ #include <asm/pat.h> #include <asm/processor.h> +#include <mach_apic.h> + struct cpuid_bit { u16 feature; u8 reg; @@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) } } +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. 
+ */
+ c->initial_apicid = edx;
+
+ /*
+ * Populate HT related information from sub-leaf level 0.
+ */
+ core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+
+ sub_index = 1;
+ do {
+ cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+
+ /*
+ * Check for the Core type in the implemented sub leaves.
+ */
+ if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+ core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+ break;
+ }
+
+ sub_index++;
+ } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+ core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
+
+#ifdef CONFIG_X86_32
+ c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
+ & core_select_mask;
+ c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+#else
+ c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
+ c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
+#endif
+ c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+
+
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ if (c->x86_max_cores > 1)
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ return;
+#endif
+}
+
 #ifdef CONFIG_X86_PAT
 void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 {
@@ -56,9 +144,22 @@
 switch (c->x86_vendor) {
 case X86_VENDOR_INTEL:
- if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
+ /*
+ * There is a known erratum on Pentium III and Core Solo
+ * and Core Duo CPUs.
+ * " Page with PAT set to WC while associated MTRR is UC
+ * may consolidate to UC "
+ * Because of this erratum, it is better to stick with
+ * setting WC in MTRR rather than using PAT on these CPUs.
+ *
+ * Enable PAT WC only on P4, Core 2 or later CPUs.
+ */
+ if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
 return;
- break;
+
+ pat_disable("PAT WC disabled due to known CPU erratum.");
+ return;
+
 case X86_VENDOR_AMD:
 case X86_VENDOR_CENTAUR:
 case X86_VENDOR_TRANSMETA:
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..8f1e31db2ad5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
 #include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/mm.h>
+
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#ifdef CONFIG_X86_64
+# include <asm/numa_64.h>
+# include <asm/mmconfig.h>
+# include <asm/cacheflush.h>
+#endif
+
 #include <mach_apic.h>
+
 #include "cpu.h"
+#ifdef CONFIG_X86_32
 /*
  * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
  * misexecution of code under Linux. Owners of such processors should
@@ -24,23 +33,273 @@
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
-int force_mwait __cpuinitdata;
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
 {
- if (cpuid_eax(0x80000000) >= 0x80000007) {
- c->x86_power = cpuid_edx(0x80000007);
- if (c->x86_power & (1<<8))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+/*
+ * General Systems BIOSen alias the cpu frequency registers
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
+ * drivers subsequently pokes it, and changes the CPU speed.
+ * Workaround: Remove the unneeded alias.
+ */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model == 9 || c->x86_model == 10) { + if (inl (CBAR) & CBAR_ENB) + outl (0 | CBAR_KEY, CBAR); } } -static void __cpuinit init_amd(struct cpuinfo_x86 *c) + +static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) { u32 l, h; int mbytes = num_physpages >> (20-PAGE_SHIFT); - int r; + if (c->x86_model < 6) { + /* Based on AMD doc 20734R - June 2000 */ + if (c->x86_model == 0) { + clear_cpu_cap(c, X86_FEATURE_APIC); + set_cpu_cap(c, X86_FEATURE_PGE); + } + return; + } + + if (c->x86_model == 6 && c->x86_mask == 1) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + unsigned long d, d2; + + printk(KERN_INFO "AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. + */ + + n = K6_BUG_LOOP; + f_vide = vide; + rdtscl(d); + while (n--) + f_vide(); + rdtscl(d2); + d = d2-d; + + if (d > 20*K6_BUG_LOOP) + printk("system stability may be impaired when more than 32 MB are used.\n"); + else + printk("probably OK (after B9730xxxx).\n"); + printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model == 8 && c->x86_mask < 8)) { + /* We can only write allocate on the low 508Mb */ + if (mbytes > 508) + mbytes = 508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF) == 0) { + unsigned long flags; + l = (1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + return; + } + + if ((c->x86_model == 8 && c->x86_mask > 7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if (mbytes > 4092) + mbytes = 4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000) == 0) { + unsigned long flags; + l = ((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + return; + } + + if (c->x86_model == 10) { + /* AMD Geode LX is model 10 */ + /* placeholder for any needed mods */ + return; + } +} + +static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + rdmsr(MSR_K7_HWCR, l, h); + l &= ~0x00008000; + wrmsr(MSR_K7_HWCR, l, h); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk ("CPU: CLK_CTL MSR was %x. 
Reprogramming to %x\n", l, + ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + set_cpu_cap(c, X86_FEATURE_K7); +} +#endif + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +static int __cpuinit nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * Assumes number of cores is a power of two. + */ +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits; + + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; +#endif +} + +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +{ +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + int cpu = smp_processor_id(); + int node; + unsigned apicid = hard_smp_processor_id(); + + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = c->initial_apicid; + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. 
Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#else + /* Set MTRR capability flag if appropriate */ + if (c->x86 == 5) + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); +#endif +} + +static void __cpuinit init_amd(struct cpuinfo_x86 *c) +{ #ifdef CONFIG_SMP unsigned long long value; @@ -51,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ - if (c->x86 == 15) { + if (c->x86 == 0xf) { rdmsrl(MSR_K7_HWCR, value); value |= 1 << 6; wrmsrl(MSR_K7_HWCR, value); @@ -61,213 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) early_init_amd(c); /* - * FIXME: We should handle the K5 here. Set up the write - * range and also turn on MSR 83 bits 4 and 31 (write alloc, - * no bus pipeline) - */ - - /* * Bit 31 in normal CPUID used for nonstandard 3DNow ID; * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_cpu_cap(c, 0*32+31); - r = get_model_name(c); - - switch (c->x86) { - case 4: - /* - * General Systems BIOSen alias the cpu frequency registers - * of the Elan at 0x000df000. Unfortuantly, one of the Linux - * drivers subsequently pokes it, and changes the CPU speed. - * Workaround : Remove the unneeded alias. - */ -#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ -#define CBAR_ENB (0x80000000) -#define CBAR_KEY (0X000000CB) - if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); - } - break; - case 5: - if (c->x86_model < 6) { - /* Based on AMD doc 20734R - June 2000 */ - if (c->x86_model == 0) { - clear_cpu_cap(c, X86_FEATURE_APIC); - set_cpu_cap(c, X86_FEATURE_PGE); - } - break; - } - - if (c->x86_model == 6 && c->x86_mask == 1) { - const int K6_BUG_LOOP = 1000000; - int n; - void (*f_vide)(void); - unsigned long d, d2; - - printk(KERN_INFO "AMD K6 stepping B detected - "); - - /* - * It looks like AMD fixed the 2.6.2 bug and improved indirect - * calls at the same time. - */ - - n = K6_BUG_LOOP; - f_vide = vide; - rdtscl(d); - while (n--) - f_vide(); - rdtscl(d2); - d = d2-d; - - if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); - else - printk("probably OK (after B9730xxxx).\n"); - printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); - } - - /* K6 with old style WHCR */ - if (c->x86_model < 8 || - (c->x86_model == 8 && c->x86_mask < 8)) { - /* We can only write allocate on the low 508Mb */ - if (mbytes > 508) - mbytes = 508; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0x0000FFFF) == 0) { - unsigned long flags; - l = (1<<0)|((mbytes/4)<<1); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", - mbytes); - } - break; - } - - if ((c->x86_model == 8 && c->x86_mask > 7) || - c->x86_model == 9 || c->x86_model == 13) { - /* The more serious chips .. 
*/ - - if (mbytes > 4092) - mbytes = 4092; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0xFFFF0000) == 0) { - unsigned long flags; - l = ((mbytes>>2)<<22)|(1<<16); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", - mbytes); - } - - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); - break; - } - - if (c->x86_model == 10) { - /* AMD Geode LX is model 10 */ - /* placeholder for any needed mods */ - break; - } - break; - case 6: /* An Athlon/Duron */ +#ifdef CONFIG_X86_64 + /* On C+ stepping K8 rep microcode works well for copy/memset */ + if (c->x86 == 0xf) { + u32 level; - /* - * Bit 15 of Athlon specific MSR 15, needs to be 0 - * to enable SSE on Palomino/Morgan/Barton CPU's. - * If the BIOS didn't enable it already, enable it here. - */ - if (c->x86_model >= 6 && c->x86_model <= 10) { - if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - rdmsr(MSR_K7_HWCR, l, h); - l &= ~0x00008000; - wrmsr(MSR_K7_HWCR, l, h); - set_cpu_cap(c, X86_FEATURE_XMM); - } - } - - /* - * It's been determined by AMD that Athlons since model 8 stepping 1 - * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx - * As per AMD technical note 27212 0.2 - */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { - rdmsr(MSR_K7_CLK_CTL, l, h); - if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); - wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); - } - } - break; + level = cpuid_eax(1); + if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); } + if (c->x86 == 0x10 || c->x86 == 0x11) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + + /* + * FIXME: We should handle the K5 here. Set up the write + * range and also turn on MSR 83 bits 4 and 31 (write alloc, + * no bus pipeline) + */ switch (c->x86) { - case 15: - /* Use K8 tuning for Fam10h and Fam11h */ - case 0x10: - case 0x11: - set_cpu_cap(c, X86_FEATURE_K8); + case 4: + init_amd_k5(c); break; - case 6: - set_cpu_cap(c, X86_FEATURE_K7); + case 5: + init_amd_k6(c); + break; + case 6: /* An Athlon/Duron */ + init_amd_k7(c); break; } + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); +#endif + + /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - display_cacheinfo(c); - - if (cpuid_eax(0x80000000) >= 0x80000008) - c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + if (!c->x86_model_id[0]) { + switch (c->x86) { + case 0xf: + /* Should distinguish Models here, but this is only + a fallback anyways. */ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } -#ifdef CONFIG_X86_HT - /* - * On a AMD multi core setup the lower bits of the APIC id - * distinguish the cores. - */ - if (c->x86_max_cores > 1) { - int cpu = smp_processor_id(); - unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; + display_cacheinfo(c); - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1); - c->phys_proc_id >>= bits; - printk(KERN_INFO "CPU %d(%d) -> Core %d\n", - cpu, c->x86_max_cores, c->cpu_core_id); + /* Multi core CPU? 
*/ + if (c->extended_cpuid_level >= 0x80000008) { + amd_detect_cmp(c); + srat_detect_node(c); } + +#ifdef CONFIG_X86_32 + detect_ht(c); #endif - if (cpuid_eax(0x80000000) >= 0x80000006) { - if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) + if (c->extended_cpuid_level >= 0x80000006) { + if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) num_cache_leaves = 4; else num_cache_leaves = 3; } - /* K6s reports MCEs but don't actually have all the MSRs */ - if (c->x86 < 6) - clear_cpu_cap(c, X86_FEATURE_MCE); + if (c->x86 >= 0xf && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has_xmm2) + if (cpu_has_xmm2) { + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } + +#ifdef CONFIG_X86_64 + if (c->x86 == 0x10) { + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); + } + + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < + (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || + ((tseg>>PMD_SHIFT) < + (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); + } + } +#endif } +#ifdef CONFIG_X86_32 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ @@ -280,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int } return size; } +#endif static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, +#ifdef CONFIG_X86_32 .c_models = { { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = { @@ -296,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = { } }, }, + .c_size_cache = amd_size_cache, +#endif .c_early_init = early_init_amd, .c_init = init_amd, - .c_size_cache = amd_size_cache, + .c_x86_vendor = X86_VENDOR_AMD, }; -cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); +cpu_dev_register(amd_cpu_dev); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c deleted file mode 100644 index 7c36fb8a28d4..000000000000 --- a/arch/x86/kernel/cpu/amd_64.c +++ /dev/null @@ -1,222 +0,0 @@ -#include <linux/init.h> -#include <linux/mm.h> - -#include <asm/numa_64.h> -#include <asm/mmconfig.h> -#include <asm/cacheflush.h> - -#include <mach_apic.h> - -#include "cpu.h" - -int force_mwait __cpuinitdata; - -#ifdef CONFIG_NUMA -static int __cpuinit nearby_node(int apicid) -{ - int i, node; - - for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - return first_node(node_online_map); /* Shouldn't happen */ -} -#endif - -/* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. - * Assumes number of cores is a power of two. 
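The comment above (and its duplicate removed from amd_64.c just below) describes a few lines of bit arithmetic in amd_detect_cmp() and early_init_amd_mc(): derive the core-id field width from the core count when the CPU does not report it, then split the initial APIC id into core and socket parts. Standalone, with sample values:

#include <stdio.h>

int main(void)
{
	unsigned apicid = 0x13;		/* sample initial APIC id */
	unsigned max_cores = 4;		/* sample cores per socket */
	unsigned bits = 0;

	/* Recompute the core-id width when CPUID doesn't report it,
	 * as in early_init_amd_mc(). */
	while ((1u << bits) < max_cores)
		bits++;

	printf("core %u in socket %u\n",
	       apicid & ((1u << bits) - 1),	/* low bits: core id   */
	       apicid >> bits);			/* high bits: socket id */
	return 0;
}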
- */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits; -#ifdef CONFIG_NUMA - int cpu = smp_processor_id(); - int node = 0; - unsigned apicid = hard_smp_processor_id(); -#endif - bits = c->x86_coreid_bits; - - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - - int ht_nodeid = c->initial_apicid; - - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -#endif -} - -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; - -#endif -} - -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) -{ - early_init_amd_mc(c); - - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -} - -static void __cpuinit init_amd(struct cpuinfo_x86 *c) -{ - unsigned level; - -#ifdef CONFIG_SMP - unsigned long value; - - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) { - rdmsrl(MSR_K8_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K8_HWCR, value); - } -#endif - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_cpu_cap(c, 0*32+31); - - /* On C+ stepping K8 rep microcode works well for copy/memset */ - if (c->x86 == 0xf) { - level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - } - if (c->x86 == 0x10 || c->x86 == 0x11) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - - level = get_model_name(c); - if (!level) { - switch (c->x86) { - case 0xf: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - display_cacheinfo(c); - - /* Multi core CPU? 
*/ - if (c->extended_cpuid_level >= 0x80000008) - amd_detect_cmp(c); - - if (c->extended_cpuid_level >= 0x80000006 && - (cpuid_edx(0x80000006) & 0xf000)) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - - if (c->x86 >= 0xf && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_K8); - - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - - if (c->x86 == 0x10) { - /* do this for boot cpu */ - if (c == &boot_cpu_data) - check_enable_amd_mmconf_dmi(); - - fam10h_check_enable_mmcfg(); - } - - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -} - -static struct cpu_dev amd_cpu_dev __cpuinitdata = { - .c_vendor = "AMD", - .c_ident = { "AuthenticAMD" }, - .c_early_init = early_init_amd, - .c_init = init_amd, -}; - -cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); - diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b1c56bb338f..c8e315f1aa83 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -50,6 +50,8 @@ static double __initdata y = 3145727.0; */ static void __init check_fpu(void) { + s32 fdiv_bug; + if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); @@ -74,8 +76,10 @@ static void __init check_fpu(void) "fistpl %0\n\t" "fwait\n\t" "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) + : "=m" (*&fdiv_bug) : "m" (*&x), "m" (*&y)); + + boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk("Hmm, FPU with FDIV bug.\n"); } @@ -131,13 +135,7 @@ static void __init check_popad(void) * (for due to lack of "invlpg" and working WP on a i386) * - In order to run on anything without a TSC, we need to be * compiled for a i486. - * - In order to support the local APIC on a buggy Pentium machine, - * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, - * which happens implicitly if compiled for a Pentium or lower - * (unless an advanced selection of CPU features is used) as an - * otherwise config implies a properly working local APIC without - * the need to do extra reads from the APIC. -*/ + */ static void __init check_config(void) { @@ -151,21 +149,6 @@ static void __init check_config(void) if (boot_cpu_data.x86 == 3) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif - -/* - * If we were told we had a good local APIC, check for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). 
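The check_fpu() hunk nearby routes the FDIV probe result through a local s32 because boot_cpu_data.fdiv_bug is a bitfield, and a bitfield cannot be an asm output. The probe itself is the classic Pentium FDIV test; a C-level sketch of the same arithmetic (the kernel issues fdiv/fmul in inline asm, so treat this as an approximation; volatile keeps the compiler from folding the division at build time):

#include <stdio.h>

int main(void)
{
        volatile double x = 4195835.0, y = 3145727.0;
        double r = x - (x / y) * y;     /* 0 on a correct FPU */

        printf(r != 0.0 ? "Hmm, FPU with FDIV bug.\n" : "FPU ok\n");
        return 0;
}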
- */ -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL - && cpu_has_apic - && boot_cpu_data.x86 == 5 - && boot_cpu_data.x86_model == 2 - && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) - panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); -#endif } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a55..89bfdd9cacc6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) if (c->x86_model >= 6 && c->x86_model < 9) set_cpu_cap(c, X86_FEATURE_3DNOW); - get_model_name(c); display_cacheinfo(c); } @@ -314,6 +313,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, + .c_x86_vendor = X86_VENDOR_CENTAUR, }; -cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 1d181c40e2e1..a1625f5a1e78 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { + early_init_centaur(c); + if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_REP_GOOD); } set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, .c_init = init_centaur, + .c_x86_vendor = X86_VENDOR_CENTAUR, }; -cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c new file mode 100644 index 000000000000..2056ccf572cc --- /dev/null +++ b/arch/x86/kernel/cpu/cmpxchg.c @@ -0,0 +1,72 @@ +/* + * cmpxchg*() fallbacks for CPU not supporting these instructions + */ + +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/module.h> + +#ifndef CONFIG_X86_CMPXCHG +unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) +{ + u8 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u8 *)ptr; + if (prev == old) + *(u8 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u8); + +unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) +{ + u16 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386.
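The new cmpxchg.c emulates compare-and-exchange by briefly disabling interrupts, which is only safe on uniprocessor 386-class machines. Callers build the usual retry loop on top; a user-space sketch with a GCC builtin standing in for the kernel's cmpxchg() macro (which resolves to the helpers in this file on a 386):

#include <stdint.h>
#include <stdio.h>

/* Lock-free saturating increment: retry while another path won the race. */
static uint32_t saturating_inc(uint32_t *v, uint32_t limit)
{
        uint32_t old, new_val;

        do {
                old = *v;
                if (old >= limit)
                        return old;             /* already saturated */
                new_val = old + 1;
        } while (__sync_val_compare_and_swap(v, old, new_val) != old);

        return new_val;
}

int main(void)
{
        uint32_t ctr = 9;

        printf("%u\n", saturating_inc(&ctr, 10));       /* 10 */
        printf("%u\n", saturating_inc(&ctr, 10));       /* stays 10 */
        return 0;
}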
Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u16 *)ptr; + if (prev == old) + *(u16 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u16); + +unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) +{ + u32 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u32 *)ptr; + if (prev == old) + *(u32 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u32); +#endif + +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..25581dcb280e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,27 +1,62 @@ #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> #include <linux/string.h> +#include <linux/bootmem.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/kgdb.h> +#include <linux/topology.h> #include <linux/delay.h> #include <linux/smp.h> -#include <linux/module.h> #include <linux/percpu.h> -#include <linux/bootmem.h> -#include <asm/processor.h> #include <asm/i387.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/linkage.h> #include <asm/mmu_context.h> #include <asm/mtrr.h> #include <asm/mce.h> #include <asm/pat.h> +#include <asm/asm.h> +#include <asm/numa.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/mpspec.h> #include <asm/apic.h> #include <mach_apic.h> +#include <asm/genapic.h> #endif +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/proto.h> +#include <asm/sections.h> +#include <asm/setup.h> + #include "cpu.h" +static struct cpu_dev *this_cpu __cpuinitdata; + +#ifdef CONFIG_X86_64 +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) 
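The 64-bit gdt_page initializer that follows encodes every segment as a raw dword pair. A sketch that unpacks the high dword shows why the long-mode kernel code segment is 0x00af9b00 while its 32-bit twin is 0x00cf9b00 (field positions follow the descriptor format in the SDM; only L and D differ between the two):

#include <stdint.h>
#include <stdio.h>

static void decode_gdt_high(uint32_t hi)
{
        printf("%08x: type=%x S=%u DPL=%u P=%u L=%u D=%u G=%u\n", hi,
               (hi >> 8) & 0xf,         /* segment type */
               (hi >> 12) & 1,          /* 1 = code/data descriptor */
               (hi >> 13) & 3,          /* privilege level */
               (hi >> 15) & 1,          /* present */
               (hi >> 21) & 1,          /* L: 64-bit code segment */
               (hi >> 22) & 1,          /* D/B: default operand size */
               (hi >> 23) & 1);         /* G: 4 KB limit granularity */
}

int main(void)
{
        decode_gdt_high(0x00af9b00);    /* GDT_ENTRY_KERNEL_CS: L=1, D=0 */
        decode_gdt_high(0x00cf9b00);    /* GDT_ENTRY_KERNEL32_CS: L=0, D=1 */
        return 0;
}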
*/ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, +} }; +#else +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, @@ -55,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; +#endif EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - +#ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. 
+ */ + asm volatile ("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { + /* Disable processor serial number */ + unsigned long lo, hi; + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +/* Probe for the CPUID instruction */ +static inline int have_cpuid_p(void) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +/* + * Naming convention should be: <Name> [(<Codename>)] + * This table only is used unless init_<vendor>() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +#ifdef CONFIG_X86_32 + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +#endif +} + +static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else /* Not much we can do here... 
*/ /* Check if at least it has cpuid */ if (c->cpuid_level == -1) { @@ -75,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) else if (c->x86 == 3) strcpy(c->x86_model_id, "386"); } +#endif } static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); -int __cpuinit get_model_name(struct cpuinfo_x86 *c) +static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; - if (cpuid_eax(0x80000000) < 0x80000004) - return 0; + if (c->extended_cpuid_level < 0x80000004) + return; v = (unsigned int *) c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); @@ -115,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) while (q <= &c->x86_model_id[48]) *q++ = '\0'; /* Zero-pad the rest */ } - - return 1; } - void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, ecx, edx, l2size; + unsigned int n, dummy, ebx, ecx, edx, l2size; - n = cpuid_eax(0x80000000); + n = c->extended_cpuid_level; if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24)+(edx>>24); + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif } if (n < 0x80000006) /* Some chips just has a large L1. */ return; - ecx = cpuid_ecx(0x80000006); + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); l2size = ecx >> 16; +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else /* do processor-specific cache resizing */ if (this_cpu->c_size_cache) l2size = this_cpu->c_size_cache(c, l2size); @@ -149,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (l2size == 0) return; /* Again, no L2 cache is possible */ +#endif c->x86_cache_size = l2size; printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); + l2size, ecx & 0xFF); } -/* - * Naming convention should be: <Name> [(<Codename>)] - * This table only is used unless init_<vendor>() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * - */ - -/* Look up CPU names by table lookup. 
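table_lookup_model(), moved earlier in common.c by this patch, only fires when the brand-string leaves (0x80000002..4) are absent. A sketch of the table walk; the struct is simplified (the kernel's cpu_model_info also carries a vendor field) and the model names are illustrative, not taken from this diff:

#include <stdio.h>

struct cpu_model_info {
        int family;
        const char *model_names[16];
};

static const struct cpu_model_info models[] = {
        { 4, { [3] = "486 DX/2", [8] = "486 DX/4" } },  /* illustrative */
        { 0, { 0 } }
};

static const char *lookup_model(int family, int model)
{
        const struct cpu_model_info *info;

        if (model >= 16)                /* same range check as the kernel */
                return NULL;
        for (info = models; info->family; info++)
                if (info->family == family)
                        return info->model_names[model];
        return NULL;
}

int main(void)
{
        const char *name = lookup_model(4, 8);

        printf("%s\n", name ? name : "unknown");
        return 0;
}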
*/ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { - struct cpu_model_info *info; +#ifdef CONFIG_X86_HT + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; - if (c->x86_model >= 16) - return NULL; /* Range check */ + if (!cpu_has(c, X86_FEATURE_HT)) + return; - if (!this_cpu) - return NULL; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; - info = this_cpu->c_models; + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of siblings %d", + smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); +#ifdef CONFIG_X86_64 + c->phys_proc_id = phys_pkg_id(index_msb); +#else + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); +#endif + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + +#ifdef CONFIG_X86_64 + c->cpu_core_id = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); +#else + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); +#endif } - return NULL; /* Not found */ -} +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } +#endif +} -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; static int printed; for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - if (!early) - this_cpu = cpu_devs[i]; - return; - } + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; } } + if (!printed) { printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } + c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; } - -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - - -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - - -/* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) -{ - u32 f1, f2; - - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); - - return ((f1^f2) & flag) != 0; -} - - -/* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) -{ - return 
flag_is_changeable_p(X86_EFLAGS_ID); -} - -void __init cpu_detect(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, @@ -267,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c) (unsigned int *)&c->x86_vendor_id[4]); c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; + c->x86_model += ((tfms >> 16) & 0xf) << 4; if (cap0 & (1<<19)) { - c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; } } } -static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) + +static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; - unsigned int ebx; + u32 ebx; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - if (have_cpuid_p()) { - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - } + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); } + } + +#ifdef CONFIG_X86_64 + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; } +#endif + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); } +static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -320,109 +520,113 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) * WARNING: this function is only called on the BP. Don't add code here * that is supposed to run on all CPUs. 
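cpu_detect() above composes family and model from both the base and extended fields of CPUID leaf 1 EAX: the extended family is added only when the base family is 0xf, and the extended model is prepended for family 6 and up. A standalone sketch of that decode:

#include <stdio.h>

static void decode_sig(unsigned tfms, unsigned *fam, unsigned *model,
                       unsigned *stepping)
{
        *fam = (tfms >> 8) & 0xf;
        *model = (tfms >> 4) & 0xf;
        *stepping = tfms & 0xf;
        if (*fam == 0xf)
                *fam += (tfms >> 20) & 0xff;            /* extended family */
        if (*fam >= 0x6)
                *model += ((tfms >> 16) & 0xf) << 4;    /* extended model */
}

int main(void)
{
        unsigned f, m, s;

        /* 0x10676 should decode as family 6, model 0x17, stepping 6,
         * i.e. a Penryn-class Core 2. */
        decode_sig(0x00010676, &f, &m, &s);
        printf("family %#x model %#x stepping %#x\n", f, m, s);
        return 0;
}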
*/ -static void __init early_cpu_detect(void) +static void __init early_identify_cpu(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; +#else c->x86_clflush_size = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + /* cyrix could have cpuid enabled via c_identify()*/ if (!have_cpuid_p()) return; cpu_detect(c); - get_cpu_vendor(c, 1); + get_cpu_vendor(c); + + get_cpu_cap(c); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); - early_get_cap(c); + validate_pat_support(c); } -static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +void __init early_cpu_init(void) { - u32 tfms, xlvl; - unsigned int ebx; - - if (have_cpuid_p()) { - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c, 0); - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; - c->initial_apicid = (ebx >> 24) & 0xFF; -#ifdef CONFIG_X86_HT - c->apicid = phys_pkg_id(c->initial_apicid, 0); - c->phys_proc_id = c->initial_apicid; -#else - c->apicid = c->initial_apicid; -#endif - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ + struct cpu_dev **cdev; + int count = 0; + + printk("KERNEL supported cpus:\n"); + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + struct cpu_dev *cpudev = *cdev; + unsigned int j; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(" %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); } - - init_scattered_cpuid_features(c); } + early_identify_cpu(&boot_cpu_data); } -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6; unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. In the latter case it doesn't even *fail* + * reliably, so probing for it doesn't even work. Disable it completely + * unless we can find a reliable way to detect all the broken cases. 
+ */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { - /* Disable processor serial number */ - unsigned long lo, hi; - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); - - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } + clear_cpu_cap(c, X86_FEATURE_NOPL); } -static int __init x86_serial_nr_setup(char *s) +static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { - disable_x86_serial_nr = 0; - return 1; -} -__setup("serialnumber", x86_serial_nr_setup); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + get_cpu_cap(c); + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT + c->apicid = phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + +#ifdef CONFIG_X86_HT + c->phys_proc_id = c->initial_apicid; +#endif + } + + get_model_name(c); /* Default name */ + + init_scattered_cpuid_features(c); + detect_nopl(c); +} /* * This does the hard work of actually picking apart the CPU stuff... @@ -434,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; c->x86_vendor = X86_VENDOR_UNKNOWN; - c->cpuid_level = -1; /* CPUID not detected */ c->x86_model = c->x86_mask = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; + c->x86_coreid_bits = 0; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; +#else + c->cpuid_level = -1; /* CPUID not detected */ c->x86_clflush_size = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); - if (!have_cpuid_p()) { - /* - * First of all, decide if this is a 486 or higher - * It's a 486 if we can modify the AC flag - */ - if (flag_is_changeable_p(X86_EFLAGS_AC)) - c->x86 = 4; - else - c->x86 = 3; - } - generic_identify(c); if (this_cpu->c_identify) this_cpu->c_identify(c); +#ifdef CONFIG_X86_64 + c->apicid = phys_pkg_id(0); +#endif + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -491,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -499,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } @@ -507,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) for (i = 0; i < NCAPINTS; i++) c->x86_capability[i] &= ~cleared_cpu_caps[i]; +#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. 
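generic_identify() above now pulls the initial APIC ID from CPUID leaf 1 EBX[31:24]; detect_ht() reads the logical processor count from EBX[23:16], and cpu_detect() takes the CLFLUSH line size from EBX[15:8]. A sketch of the field extraction with a made-up EBX value:

#include <stdio.h>

static void decode_leaf1_ebx(unsigned ebx)
{
        printf("initial APIC ID : %u\n", (ebx >> 24) & 0xff);
        printf("logical CPUs    : %u\n", (ebx >> 16) & 0xff);
        printf("clflush size    : %u bytes\n", ((ebx >> 8) & 0xff) * 8);
}

int main(void)
{
        decode_leaf1_ebx(0x01020800);   /* hypothetical leaf-1 EBX value */
        return 0;
}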
*/ mcheck_init(c); +#endif select_idle_routine(c); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + numa_add_cpu(smp_processor_id()); +#endif } +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +} +#endif + void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); +#ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); +#else + vgetcpu_set_mode(); +#endif } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); +#ifdef CONFIG_X86_32 enable_sep_cpu(); +#endif mtrr_ap_init(); } -#ifdef CONFIG_X86_HT -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return; - - smp_num_siblings = (ebx & 0xff0000) >> 16; +struct msr_range { + unsigned min; + unsigned max; +}; - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { +static struct msr_range msr_range_array[] __cpuinitdata = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; +static void __cpuinit print_cpu_msr(void) +{ + unsigned index; + u64 val; + int i; + unsigned index_min, index_max; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } + } +} - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings) ; - - core_bits = get_count_order(c->x86_max_cores); +static int show_msr __cpuinitdata; +static __init int setup_show_msr(char *arg) +{ + int num; - c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); + get_option(&arg, &num); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } + if (num > 0) + show_msr = num; + return 1; } -#endif +__setup("show_msr=", setup_show_msr); static __init int setup_noclflush(char *arg) { @@ -590,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else if (c->cpuid_level >= 0) vendor = c->x86_vendor_id; - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) - printk("%s ", vendor); + if (vendor && !strstr(c->x86_model_id, vendor)) + printk(KERN_CONT "%s ", vendor); - if (!c->x86_model_id[0]) - printk("%d86", c->x86); + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); else - printk("%s", c->x86_model_id); + printk(KERN_CONT "%d86", c->x86); if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); + printk(KERN_CONT " stepping %02x\n", c->x86_mask); else - printk("\n"); + printk(KERN_CONT "\n"); + +#ifdef CONFIG_SMP + if (c->cpu_index < show_msr) + print_cpu_msr(); +#else + if (show_msr) + print_cpu_msr(); +#endif } static __init 
int setup_disablecpuid(char *arg) @@ -617,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid); cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -void __init early_cpu_init(void) +#ifdef CONFIG_X86_64 +struct x8664_pda **_cpu_pda __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; + +void __cpuinit pda_init(int cpu) { - struct cpu_vendor_dev *cvdev; + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + loadsegment(fs, 0); + loadsegment(gs, 0); + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = (unsigned long)stack_thread_info() - + PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; + } else { + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } + + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } +} + +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; +extern asmlinkage void ignore_sysret(void); - early_cpu_detect(); - validate_pat_support(&boot_cpu_data); +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); } +unsigned long kernel_eflags; + +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +#else + /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { @@ -637,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) regs->fs = __KERNEL_PERCPU; return regs; } - -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) -{ - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -} +#endif /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. 
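syscall_init() in the hunk above packs two segment selectors into MSR_STAR: SYSCALL loads CS from bits 47:32 (with SS implied as CS plus 8), and SYSRET builds the user segments from bits 63:48. A sketch of the packing; the selector values are assumptions matching the x86-64 GDT layout in this patch (__KERNEL_CS = 0x10, __USER32_CS = 0x23):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t kernel_cs = 0x10, user32_cs = 0x23;    /* assumed */
        uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

        printf("MSR_STAR   = %#018llx\n", (unsigned long long)star);
        printf("SYSCALL CS = %#llx (SS = CS + 8)\n",
               (unsigned long long)((star >> 32) & 0xffff));
        return 0;
}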
+ * A lot of state is already set up in PDA init for 64 bit */ +#ifdef CONFIG_X86_64 +void __cpuinit cpu_init(void) +{ + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + pda_init(cpu); + else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(); + load_idt((const struct desc_ptr *)&idt_descr); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + if (cpu != 0 && x2apic) + enable_x2apic(); + + /* + * set up and load the per-CPU TSS + */ + if (!orig_ist->ist[0]) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; + } + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, &current->thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } +#endif + + fpu_init(); + + raw_local_save_flags(kernel_eflags); + + if (is_uv_system()) + uv_cpu_init(); +} + +#else + void __cpuinit cpu_init(void) { int cpu = smp_processor_id(); @@ -709,19 +1124,21 @@ void __cpuinit cpu_init(void) /* * Force FPU initialization: */ - current_thread_info()->status = 0; + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); -} -#ifdef CONFIG_HOTPLUG_CPU -void __cpuinit cpu_uninit(void) -{ - int cpu = raw_smp_processor_id(); - cpu_clear(cpu, cpu_initialized); + /* + * Boot processor to setup the FP and extended state context info.
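The 64-bit cpu_init() above fills the TSS I/O bitmap with ones (all ports denied) and deliberately writes one word past IO_BITMAP_LONGS, because the CPU may fetch up to 8 bits beyond the bitmap limit. A user-space sketch of the deny-by-default semantics, with the bitmap sized as in the kernel:

#include <stdio.h>
#include <string.h>

#define IO_BITMAP_BITS  65536
#define IO_BITMAP_LONGS (IO_BITMAP_BITS / (8 * sizeof(unsigned long)))
#define BITS_PER_LONG   (8 * sizeof(unsigned long))

/* One guard word past the end, mirroring the "<=" loop in cpu_init(). */
static unsigned long io_bitmap[IO_BITMAP_LONGS + 1];

static int port_allowed(unsigned port)
{
        /* A clear bit grants the port; a set bit makes the access fault. */
        return !(io_bitmap[port / BITS_PER_LONG] &
                 (1UL << (port % BITS_PER_LONG)));
}

int main(void)
{
        memset(io_bitmap, 0xff, sizeof(io_bitmap));      /* deny all I/O */
        io_bitmap[0x70 / BITS_PER_LONG] &= ~(1UL << (0x70 % BITS_PER_LONG));

        printf("port 0x70: %d\n", port_allowed(0x70));   /* 1: granted */
        printf("port 0x80: %d\n", port_allowed(0x80));   /* 0: denied */
        return 0;
}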
+ */ + if (!smp_processor_id()) + init_thread_xstate(); - /* lazy TLB state */ - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; + xsave_init(); } + + #endif diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c deleted file mode 100644 index 7b8cc72feb40..000000000000 --- a/arch/x86/kernel/cpu/common_64.c +++ /dev/null @@ -1,681 +0,0 @@ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/kgdb.h> -#include <linux/topology.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/msr.h> -#include <asm/io.h> -#include <asm/mmu_context.h> -#include <asm/mtrr.h> -#include <asm/mce.h> -#include <asm/pat.h> -#include <asm/numa.h> -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> -#include <asm/apic.h> -#include <mach_apic.h> -#endif -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/proto.h> -#include <asm/sections.h> -#include <asm/setup.h> -#include <asm/genapic.h> - -#include "cpu.h" - -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ -DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, -} }; -EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); - -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. 
*/ -void switch_to_new_gdt(void) -{ - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -} - -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; - -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ - display_cacheinfo(c); -} - -static struct cpu_dev __cpuinitdata default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", -}; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - -int __cpuinit get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - - if (c->extended_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - return 1; -} - - -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, ebx, ecx, edx; - - n = c->extended_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " - "D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24) + (edx>>24); - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; - } - - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } -} - -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; - - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->cpu_core_id = phys_pkg_id(index_msb) & - ((1 << core_bits) - 1); - } -out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } - -#endif -} - -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - int i; - static int printed; - - for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - this_cpu = cpu_devs[i]; - return; - } - } - } - if (!printed) { - printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -static void __init early_cpu_support_print(void) -{ - int i,j; - struct cpu_dev *cpu_devx; - - printk("KERNEL supported cpus:\n"); - for (i = 0; i < X86_VENDOR_NUM; i++) 
{ - cpu_devx = cpu_devs[i]; - if (!cpu_devx) - continue; - for (j = 0; j < 2; j++) { - if (!cpu_devx->c_ident[j]) - continue; - printk(" %s %s\n", cpu_devx->c_vendor, - cpu_devx->c_ident[j]); - } - } -} - -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); - -void __init early_cpu_init(void) -{ - struct cpu_vendor_dev *cvdev; - - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; - early_cpu_support_print(); - early_identify_cpu(&boot_cpu_data); -} - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms, xlvl; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; -#ifdef CONFIG_SMP - c->phys_proc_id = c->initial_apicid; -#endif - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. 
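The deleted early_identify_cpu() shows the validity test applied to every extended CPUID range: the max-leaf value returned for a range's base leaf must echo that base in its top 16 bits, otherwise the range (0x8000xxxx for the AMD-style leaves, 0x8086xxxx for Transmeta) does not exist on this CPU. A sketch:

#include <stdio.h>

static int ext_range_valid(unsigned xlvl, unsigned range_base)
{
        /* Top 16 bits of the reported max leaf must echo the range base. */
        return (xlvl & 0xffff0000) == range_base;
}

int main(void)
{
        /* Hypothetical cpuid_eax() return values: */
        printf("%d\n", ext_range_valid(0x80000008, 0x80000000)); /* 1 */
        printf("%d\n", ext_range_valid(0, 0x80860000));          /* 0 */
        return 0;
}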
*/ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - c->extended_cpuid_level = cpuid_eax(0x80000000); - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - - if (c->extended_cpuid_level >= 0x80000008) { - u32 eax = cpuid_eax(0x80000008); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - - /* Assume all 64-bit CPUs support 32-bit syscall */ - set_cpu_cap(c, X86_FEATURE_SYSCALL32); - - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); - - validate_pat_support(c); - - /* early_param could clear that, but recall get it set again */ - if (disable_apic) - clear_cpu_cap(c, X86_FEATURE_APIC); -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - - early_identify_cpu(c); - - init_scattered_cpuid_features(c); - - c->apicid = phys_pkg_id(0); - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - if (this_cpu->c_init) - this_cpu->c_init(c); - - detect_ht(c); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0; i < NCAPINTS; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif - select_idle_routine(c); - -#ifdef CONFIG_NUMA - numa_add_cpu(smp_processor_id()); -#endif - -} - -void __cpuinit identify_boot_cpu(void) -{ - identify_cpu(&boot_cpu_data); -} - -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) -{ - BUG_ON(c == &boot_cpu_data); - identify_cpu(c); - mtrr_ap_init(); -} - -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); - return 1; -} -__setup("noclflush", setup_noclflush); - -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -{ - if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); - else - printk(KERN_CONT "\n"); -} - -static __init int setup_disablecpuid(char *arg) -{ - int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; -} -__setup("clearcpuid=", setup_disablecpuid); - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); - -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; - -unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL_GPL(__supported_pte_mask); - -static int do_not_nx __cpuinitdata; - -/* noexec=on|off -Control non executable mappings for 64bit processes. 
- -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - -void pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); - - /* Setup up data that may be needed in __get_free_pages early */ - loadsegment(fs, 0); - loadsegment(gs, 0); - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = (unsigned long)stack_thread_info() - - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); - - if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) - pda->nodenumber = cpu_to_node(cpu); - } - - pda->irqstackptr += IRQSTACKSIZE-64; -} - -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); - -extern asmlinkage void ignore_sysret(void); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init(); -#endif - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - -unsigned long kernel_eflags; - -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. 
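check_efer() in the deleted file (and its caller in the new cpu_init()) clears _PAGE_NX from __supported_pte_mask when EFER.NX is unavailable or noexec=off was given; every protection value is then filtered through that mask. A sketch of the gating, with the bit positions assumed to match x86-64:

#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT 0x001ULL
#define _PAGE_RW      0x002ULL
#define _PAGE_NX      (1ULL << 63)      /* assumed x86-64 bit position */

int main(void)
{
        uint64_t supported_pte_mask = ~0ULL;
        uint64_t prot = _PAGE_PRESENT | _PAGE_RW | _PAGE_NX;

        printf("with NX   : %#llx\n",
               (unsigned long long)(prot & supported_pte_mask));

        supported_pte_mask &= ~_PAGE_NX;        /* no EFER.NX, or noexec=off */
        printf("without NX: %#llx\n",
               (unsigned long long)(prot & supported_pte_mask));
        return 0;
}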
- */ -void __cpuinit cpu_init(void) -{ - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) - pda_init(cpu); - else - estacks = boot_exception_stacks; - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk(KERN_INFO "Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - - switch_to_new_gdt(); - load_idt((const struct desc_ptr *)&idt_descr); - - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - - /* - * set up and load the per-CPU TSS - */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); - } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; - } - - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - load_sp0(t, &current->thread); - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else { -#endif - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB - /* If the kgdb is connected no debug regs should be altered.
*/ - } -#endif - - fpu_init(); - - raw_local_save_flags(kernel_eflags); - - if (is_uv_system()) - uv_cpu_init(); -} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4d894e8565fe..de4094a39210 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -21,23 +21,16 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 * c); void (*c_identify)(struct cpuinfo_x86 * c); unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); + int c_x86_vendor; }; -extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; +#define cpu_dev_register(cpu_devX) \ + static struct cpu_dev *__cpu_dev_##cpu_devX __used \ + __attribute__((__section__(".x86_cpu_dev.init"))) = \ + &cpu_devX; -struct cpu_vendor_dev { - int vendor; - struct cpu_dev *cpu_dev; -}; - -#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \ - static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \ - __attribute__((__section__(".x86cpuvendor.init"))) = \ - { cpu_vendor_id, cpu_dev } - -extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; +extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; -extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index cb7a5715596d..efae3b22a0ff 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -235,9 +235,9 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" + tristate "VIA C7 Enhanced PowerSaver" select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL + depends on X86_32 help This adds the CPUFreq driver for VIA C7 processors. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index b0c8208df9fa..8e48c5d4467d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd) cpumask_t saved_mask = current->cpus_allowed; unsigned int i; - for_each_cpu_mask(i, cmd->mask) { + for_each_cpu_mask_nr(i, cmd->mask) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); do_drv_write(cmd); } @@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask) * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and * no meaning should be associated with absolute values of these MSRs. 
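A sketch of the ratio the APERF/MPERF comment describes, with hypothetical counter deltas in place of real rdmsr reads. It assumes the deltas fit plain 64-bit arithmetic; the kernel version also has to cope with 32-bit builds, hence its extra shifting:

#include <stdint.h>
#include <stdio.h>

static unsigned int measured_khz(uint64_t aperf_delta, uint64_t mperf_delta,
                                 unsigned int max_freq_khz)
{
        unsigned int perf_percent;

        if (!mperf_delta)
                return 0;
        /* Ratio of actual (APERF) to reference (MPERF) cycles. */
        perf_percent = (unsigned int)(aperf_delta * 100 / mperf_delta);
        return max_freq_khz * perf_percent / 100;
}

int main(void)
{
        /* Hypothetical deltas: the CPU ran at ~80% of its maximum. */
        printf("%u kHz\n", measured_khz(800000, 1000000, 2400000));
        return 0;
}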
*/ -static unsigned int get_measured_perf(unsigned int cpu) +static unsigned int get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) { union { struct { @@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu) #endif - retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed_ptr(current, &saved_mask); @@ -451,7 +452,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -779,13 +780,20 @@ static int __init acpi_cpufreq_init(void) { int ret; + if (acpi_disabled) + return 0; + dprintk("acpi_cpufreq_init\n"); ret = acpi_cpufreq_early_init(); if (ret) return ret; - return cpufreq_register_driver(&acpi_cpufreq_driver); + ret = cpufreq_register_driver(&acpi_cpufreq_driver); + if (ret) + free_percpu(acpi_perf_data); + + return ret; } static void __exit acpi_cpufreq_exit(void) @@ -795,8 +803,6 @@ static void __exit acpi_cpufreq_exit(void) cpufreq_unregister_driver(&acpi_cpufreq_driver); free_percpu(acpi_perf_data); - - return; } module_param(acpi_pstate_strict, uint, 0644); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 94619c22f563..fe613c93b366 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -25,8 +25,8 @@ #include <linux/cpufreq.h> #include <asm/msr.h> -#include <asm/timex.h> -#include <asm/io.h> +#include <linux/timex.h> +#include <linux/io.h> #define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ #define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ @@ -44,7 +44,7 @@ struct s_elan_multiplier { * It is important that the frequencies * are listed in ascending order here! */ -struct s_elan_multiplier elan_multiplier[] = { +static struct s_elan_multiplier elan_multiplier[] = { {1000, 0x02, 0x18}, {2000, 0x02, 0x10}, {4000, 0x02, 0x08}, @@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) u8 clockspeed_reg; /* Clock Speed Register */ local_irq_disable(); - outb_p(0x80,REG_CSCIR); + outb_p(0x80, REG_CSCIR); clockspeed_reg = inb_p(REG_CSCDR); local_irq_enable(); @@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) } /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0)==0xA0) + if ((clockspeed_reg & 0xE0) == 0xA0) return 33000; - return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); + return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; } @@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) * There is no return value. 
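The Elan clock-speed decode above packs a power-of-two MHz value into bits 7..5 of the register, with one special case because 33 MHz is not 32 MHz. A standalone sketch of the same decode, printing the whole encoding space:

#include <stdio.h>

/* Bits 7..5 select the speed; the 33 MHz encoding is special-cased. */
static unsigned int elan_khz(unsigned char reg)
{
        if ((reg & 0xE0) == 0xA0)       /* 33 MHz is not 32 MHz... */
                return 33000;
        return (1 << ((reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
        unsigned int n;

        for (n = 0; n < 8; n++)
                printf("reg=0x%02X -> %u kHz\n", n << 5, elan_khz(n << 5));
        return 0;
}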
*/ -static void elanfreq_set_cpu_state (unsigned int state) +static void elanfreq_set_cpu_state(unsigned int state) { struct cpufreq_freqs freqs; @@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state) */ local_irq_disable(); - outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00,REG_CSCDR); + outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00, REG_CSCDR); local_irq_enable(); /* wait till internal pipelines and */ udelay(1000); /* buffers have cleaned up */ local_irq_disable(); /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80,REG_CSCIR); - outb_p(elan_multiplier[state].val80h,REG_CSCDR); + outb_p(0x80, REG_CSCIR); + outb_p(elan_multiplier[state].val80h, REG_CSCDR); /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40,REG_CSCIR); - outb_p(elan_multiplier[state].val40h,REG_CSCDR); + outb_p(0x40, REG_CSCIR); + outb_p(elan_multiplier[state].val40h, REG_CSCDR); udelay(10000); local_irq_enable(); @@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state) * for the hardware supported by the driver. */ -static int elanfreq_verify (struct cpufreq_policy *policy) +static int elanfreq_verify(struct cpufreq_policy *policy) { return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); } -static int elanfreq_target (struct cpufreq_policy *policy, +static int elanfreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) /* capability check */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) + (c->x86 != 4) || (c->x86_model != 10)) return -ENODEV; /* max freq */ @@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) max_freq = elanfreq_get_cpu_frequency(0); /* table init */ - for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { if (elanfreq_table[i].frequency > max_freq) elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; } @@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); return 0; @@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup); #endif -static struct freq_attr* elanfreq_attr[] = { +static struct freq_attr *elanfreq_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -284,9 +284,9 @@ static int __init elanfreq_init(void) /* Test if we have the right hardware */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) { + (c->x86 != 4) || (c->x86_model != 10)) { printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; + return -ENODEV; } return cpufreq_register_driver(&elanfreq_driver); } @@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void) } -module_param (max_freq, int, 0444); +module_param(max_freq, int, 0444); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 06fcce516d51..b0461856acfb 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -1,5 +1,5 @@ /* - * (C) 2001-2004 Dave Jones. 
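REG_CSCIR/REG_CSCDR (ports 0x22/0x23) form a classic index/data register pair: write a register index to one port, then move data through the other. A userspace model of that protocol against an in-memory register file, since real port I/O needs the hardware and I/O privilege:

#include <stdio.h>

static unsigned char regfile[256];      /* in-memory stand-in for the chip */
static unsigned char index_latch;

static void cscir_write(unsigned char idx) { index_latch = idx; }          /* port 0x22 */
static void cscdr_write(unsigned char val) { regfile[index_latch] = val; } /* port 0x23 */
static unsigned char cscdr_read(void)      { return regfile[index_latch]; }

int main(void)
{
        /* Mirrors the driver's sequence: select register 0x80, access it. */
        cscir_write(0x80);
        cscdr_write(0xA0);              /* pretend the chip reports 33 MHz */
        cscir_write(0x80);
        printf("CSC reg 0x80 = 0x%02X\n", cscdr_read());
        return 0;
}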
<davej@codemonkey.org.uk> + * (C) 2001-2004 Dave Jones. <davej@redhat.com> * (C) 2002 Padraig Brady. <padraig@antefacto.com> * * Licensed under the terms of the GNU GPL License version 2. @@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); module_param(revid_errata, int, 0644); MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); -MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 199e4e05e5dc..b8e05ee4f736 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software * Developer's Manual, Volume 3 */ - for_each_cpu_mask(i, policy->cpus) + for_each_cpu_mask_nr(i, policy->cpus) cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } if (c->x86 != 0xF) { - printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); + printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n"); return 0; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index eb9b62b0830c..c1ac5790c63e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -15,12 +15,11 @@ #include <linux/slab.h> #include <asm/msr.h> -#include <asm/timex.h> -#include <asm/io.h> +#include <linux/timex.h> +#include <linux/io.h> - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ static unsigned int busfreq; /* FSB, in 10 kHz */ static unsigned int max_multiplier; @@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void) * * Tries to change the PowerNow! 
multiplier */ -static void powernow_k6_set_state (unsigned int best_i) +static void powernow_k6_set_state(unsigned int best_i) { - unsigned long outvalue=0, invalue=0; + unsigned long outvalue = 0, invalue = 0; unsigned long msrval; struct cpufreq_freqs freqs; @@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); invalue = invalue & 0xf; outvalue = outvalue | invalue; - outl(outvalue ,(POWERNOW_IOPORT + 0x8)); + outl(outvalue , (POWERNOW_IOPORT + 0x8)); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy) * * sets a new CPUFreq policy */ -static int powernow_k6_target (struct cpufreq_policy *policy, +static int powernow_k6_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) busfreq = cpu_khz / max_multiplier; /* table init */ - for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { if (clock_ratio[i].index > max_multiplier) clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; else @@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); @@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) { unsigned int i; - for (i=0; i<8; i++) { - if (i==max_multiplier) + for (i = 0; i < 8; i++) { + if (i == max_multiplier) powernow_k6_set_state(i); } cpufreq_frequency_table_put_attr(policy->cpu); @@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu) return busfreq * powernow_k6_get_cpu_multiplier(); } -static struct freq_attr* powernow_k6_attr[] = { +static struct freq_attr *powernow_k6_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -227,7 +226,7 @@ static int __init powernow_k6_init(void) } if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); return -EINVAL; } @@ -243,13 +242,13 @@ static int __init powernow_k6_init(void) static void __exit powernow_k6_exit(void) { cpufreq_unregister_driver(&powernow_k6_driver); - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); } -MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE ("GPL"); +MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE("GPL"); module_init(powernow_k6_init); module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 0a61159d7b71..7c7d56b43136 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -1,6 +1,6 @@ /* * AMD K7 Powernow driver. 
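A sketch of the frequency-table arithmetic the K6 driver uses: the bus frequency is derived from the boot-time multiplier, each table entry is bus frequency times ratio, and ratios above the boot multiplier are unreachable and marked invalid. The clock numbers and the integer ratios below are hypothetical:

#include <stdio.h>

#define ENTRY_INVALID 0u        /* stand-in for CPUFREQ_ENTRY_INVALID */

int main(void)
{
        /* Hypothetical part: a 400 MHz K6 whose boot multiplier is 4. */
        unsigned int cpu_khz = 400000;
        unsigned int max_multiplier = 4;
        unsigned int busfreq = cpu_khz / max_multiplier;  /* FSB in kHz */
        unsigned int mult[] = { 2, 3, 4, 5 };
        unsigned int freq, i;

        for (i = 0; i < sizeof(mult) / sizeof(mult[0]); i++) {
                /* As in powernow_k6_cpu_init(): ratios above the boot
                 * multiplier cannot be reached and are marked invalid. */
                freq = (mult[i] > max_multiplier) ? ENTRY_INVALID
                                                  : busfreq * mult[i];
                printf("x%u -> %u kHz\n", mult[i], freq);
        }
        return 0;
}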
- * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs. + * (C) 2003 Dave Jones on behalf of SuSE Labs. * (C) 2003-2004 Dave Jones <davej@redhat.com> * * Licensed under the terms of the GNU GPL License version 2. @@ -692,7 +692,7 @@ static void __exit powernow_exit (void) module_param(acpi_force, int, 0444); MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); -MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h index f8a63b3664e3..35fb4eaf6e1c 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h @@ -1,5 +1,4 @@ /* - * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $ * (C) 2003 Dave Jones. * * Licensed under the terms of the GNU GPL License version 2. diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 206791eb46e3..d3dcd58b87cd 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -7,7 +7,7 @@ * Support : mark.langsdorf@amd.com * * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs + * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski <linux@brodo.de> * (C) 2004 Pavel Machek <pavel@suse.cz> * Licensed under the terms of the GNU GPL License version 2. @@ -45,7 +45,6 @@ #endif #define PFX "powernow-k8: " -#define BFX PFX "BIOS error: " #define VERSION "version 2.20.00" #include "powernow-k8.h" @@ -66,7 +65,6 @@ static u32 find_freq_from_fid(u32 fid) return 800 + (fid * 100); } - /* Return a frequency in KHz, given an input fid */ static u32 find_khz_freq_from_fid(u32 fid) { @@ -78,7 +76,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p return data[pstate].frequency; } - /* Return the vco fid for an input fid * * Each "low" fid has corresponding "high" fid, and you can get to "low" fids @@ -166,7 +163,6 @@ static void fidvid_msr_init(void) wrmsr(MSR_FIDVID_CTL, lo, hi); } - /* write the new fid value along with the other control fields to the msr */ static int write_new_fid(struct powernow_k8_data *data, u32 fid) { @@ -539,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 for (j = 0; j < data->numps; j++) { if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); + printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", + j, pst[j].vid); return -EINVAL; } if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ - printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ - printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (pst[j].fid > MAX_FID) { - printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid); + printk(KERN_ERR FW_BUG PFX "two low fids - %d : " 
+ "0x%x\n", j, pst[j].fid); return -EINVAL; } if (pst[j].fid < lastfid) lastfid = pst[j].fid; } if (lastfid & 1) { - printk(KERN_ERR BFX "lastfid invalid\n"); + printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); return -EINVAL; } if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO BFX "first fid not from lo freq table\n"); + printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n"); return 0; } @@ -675,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data) dprintk("table vers: 0x%x\n", psb->tableversion); if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR BFX "PSB table is not v1.4\n"); + printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); return -ENODEV; } dprintk("flags: 0x%x\n", psb->flags1); if (psb->flags1) { - printk(KERN_ERR BFX "unknown flags\n"); + printk(KERN_ERR FW_BUG PFX "unknown flags\n"); return -ENODEV; } @@ -708,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data) } } if (cpst != 1) { - printk(KERN_ERR BFX "numpst must be 1\n"); + printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); return -ENODEV; } @@ -966,7 +967,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -974,7 +975,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -997,7 +998,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1005,7 +1006,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -1133,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) "ACPI Processor module before starting this " "driver.\n"); #else - printk(KERN_ERR PFX "Your BIOS does not provide ACPI " - "_PSS objects in a way that Linux understands. " - "Please report this to the Linux ACPI maintainers" - " and complain to your BIOS vendor.\n"); + printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide" + " ACPI _PSS objects in a way that Linux " + "understands. Please report this to the Linux " + "ACPI maintainers and complain to your BIOS " + "vendor.\n"); #endif kfree(data); return -ENODEV; } if (pol->cpu != 0) { - printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " - "CPU0. Complain to your BIOS vendor.\n"); + printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " + "CPU other than CPU0. 
Complain to your BIOS " + "vendor.\n"); kfree(data); return -ENODEV; } @@ -1196,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* min/max the cpu is capable of */ if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR PFX "invalid powernow_table\n"); + printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); powernow_k8_cpu_exit_acpi(data); kfree(data->powernow_table); kfree(data); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 908dd347c67e..3b5f06423e77 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -26,9 +26,10 @@ #include <asm/cpufeature.h> #define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@lists.linux.org.uk" +#define MAINTAINER "cpufreq@vger.kernel.org" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) +#define dprintk(msg...) \ + cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) #define INTEL_MSR_RANGE (0xffff) @@ -66,11 +67,12 @@ struct cpu_model struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ }; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x); /* Operating points for current CPU */ -static struct cpu_model *centrino_model[NR_CPUS]; -static const struct cpu_id *centrino_cpu[NR_CPUS]; +static DEFINE_PER_CPU(struct cpu_model *, centrino_model); +static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); static struct cpufreq_driver centrino_driver; @@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) return -ENOENT; } - centrino_model[policy->cpu] = model; + per_cpu(centrino_model, policy->cpu) = model; dprintk("found \"%s\": max frequency: %dkHz\n", model->model_name, model->max_freq); @@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) } #else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + return -ENODEV; +} #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x) { if ((c->x86 == x->x86) && (c->x86_model == x->x86_model) && @@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) * for centrino, as some DSDTs are buggy. * Ideally, this can be done using the acpi_data structure. 
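The speedstep-centrino conversion below replaces NR_CPUS-sized arrays with DEFINE_PER_CPU/per_cpu(). A toy userspace model of the accessor shape only, to show why the driver code reads the way it does after the change; the real kernel implementation backs these macros with per-CPU data sections and offsets, not a flat array:

#include <stdio.h>

#define NR_CPUS_STANDIN 4

/* Toy accessors: declare storage once, always go through per_cpu(). */
#define DEFINE_PER_CPU(type, name) type name##_pcpu[NR_CPUS_STANDIN]
#define per_cpu(name, cpu)         (name##_pcpu[(cpu)])

struct cpu_model { const char *model_name; int max_freq; };

static DEFINE_PER_CPU(struct cpu_model *, centrino_model);

int main(void)
{
        static struct cpu_model m = { "hypothetical-model", 1600000 };

        per_cpu(centrino_model, 2) = &m;
        printf("cpu2: %s @ %d kHz\n",
               per_cpu(centrino_model, 2)->model_name,
               per_cpu(centrino_model, 2)->max_freq);
        return 0;
}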
*/ - if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { + if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { msr = (msr >> 8) & 0xff; return msr * 100000; } - if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) + if ((!per_cpu(centrino_model, cpu)) || + (!per_cpu(centrino_model, cpu)->op_points)) return 0; msr &= 0xffff; - for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == centrino_model[cpu]->op_points[i].index) - return centrino_model[cpu]->op_points[i].frequency; + for (i = 0; + per_cpu(centrino_model, cpu)->op_points[i].frequency + != CPUFREQ_TABLE_END; + i++) { + if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) + return per_cpu(centrino_model, cpu)-> + op_points[i].frequency; } if (failsafe) - return centrino_model[cpu]->op_points[i-1].frequency; + return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; else return 0; } @@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) int i; /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) + if (cpu->x86_vendor != X86_VENDOR_INTEL || + !cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) @@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) break; if (i != N_IDS) - centrino_cpu[policy->cpu] = &cpu_ids[i]; + per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - if (!centrino_cpu[policy->cpu]) { + if (!per_cpu(centrino_cpu, policy->cpu)) { dprintk("found unsupported CPU with " "Enhanced SpeedStep: send /proc/cpuinfo to " MAINTAINER "\n"); @@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) /* check to see if it stuck */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); if (!(l & (1<<16))) { - printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); + printk(KERN_INFO PFX + "couldn't enable Enhanced SpeedStep\n"); return -ENODEV; } } freq = get_cur_freq(policy->cpu); - - policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ + policy->cpuinfo.transition_latency = 10000; + /* 10uS transition latency */ policy->cur = freq; dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); + ret = cpufreq_frequency_table_cpuinfo(policy, + per_cpu(centrino_model, policy->cpu)->op_points); if (ret) return (ret); - cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); + cpufreq_frequency_table_get_attr( + per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); return 0; } @@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; - if (!centrino_model[cpu]) + if (!per_cpu(centrino_model, cpu)) return -ENODEV; cpufreq_frequency_table_put_attr(cpu); - centrino_model[cpu] = NULL; + per_cpu(centrino_model, cpu) = NULL; return 0; } @@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) */ static int centrino_verify (struct cpufreq_policy *policy) { - return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); + return cpufreq_frequency_table_verify(policy, + per_cpu(centrino_model, policy->cpu)->op_points); } /** * 
centrino_setpolicy - set a new CPUFreq policy * @policy: new policy * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) * * Sets a new CPUFreq policy. */ +struct allmasks { + cpumask_t online_policy_cpus; + cpumask_t saved_mask; + cpumask_t set_mask; + cpumask_t covered_cpus; +}; + static int centrino_target (struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy, unsigned int newstate = 0; unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; struct cpufreq_freqs freqs; - cpumask_t online_policy_cpus; - cpumask_t saved_mask; - cpumask_t set_mask; - cpumask_t covered_cpus; int retval = 0; unsigned int j, k, first_cpu, tmp; - - if (unlikely(centrino_model[cpu] == NULL)) - return -ENODEV; + CPUMASK_ALLOC(allmasks); + CPUMASK_PTR(online_policy_cpus, allmasks); + CPUMASK_PTR(saved_mask, allmasks); + CPUMASK_PTR(set_mask, allmasks); + CPUMASK_PTR(covered_cpus, allmasks); + + if (unlikely(allmasks == NULL)) + return -ENOMEM; + + if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { + retval = -ENODEV; + goto out; + } if (unlikely(cpufreq_frequency_table_target(policy, - centrino_model[cpu]->op_points, + per_cpu(centrino_model, cpu)->op_points, target_freq, relation, &newstate))) { - return -EINVAL; + retval = -EINVAL; + goto out; } #ifdef CONFIG_HOTPLUG_CPU /* cpufreq holds the hotplug lock, so we are safe from here on */ - cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); + cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus); #else - online_policy_cpus = policy->cpus; + *online_policy_cpus = policy->cpus; #endif - saved_mask = current->cpus_allowed; + *saved_mask = current->cpus_allowed; first_cpu = 1; - cpus_clear(covered_cpus); - for_each_cpu_mask(j, online_policy_cpus) { + cpus_clear(*covered_cpus); + for_each_cpu_mask_nr(j, *online_policy_cpus) { /* * Support for SMP systems. 
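The CPUMASK_ALLOC change in centrino_target() moves four cpumask_t variables off the kernel stack; with 4096 possible CPUs each mask is 512 bytes, so the four together cost 2 KB of stack. A userspace sketch of the same pattern under that sizing assumption: bundle the masks in one struct, allocate it once, fail with the -ENOMEM path if that fails, and hand out pointers to the members:

#include <stdio.h>
#include <stdlib.h>

/* 4096 possible CPUs => 512-byte masks. */
typedef struct {
        unsigned long bits[4096 / (8 * sizeof(unsigned long))];
} cpumask_model;

struct allmasks {
        cpumask_model online_policy_cpus;
        cpumask_model saved_mask;
        cpumask_model set_mask;
        cpumask_model covered_cpus;
};

int main(void)
{
        struct allmasks *allmasks = calloc(1, sizeof(*allmasks));
        cpumask_model *set_mask;

        if (!allmasks)
                return 1;                       /* mirrors the -ENOMEM path */
        set_mask = &allmasks->set_mask;         /* CPUMASK_PTR */
        set_mask->bits[0] |= 1UL << 3;          /* cpu_set(3, *set_mask) */
        printf("struct allmasks is %zu bytes, kept off the stack\n",
               sizeof(*allmasks));
        free(allmasks);                         /* CPUMASK_FREE */
        return 0;
}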
* Make sure we are running on CPU that wants to change freq */ - cpus_clear(set_mask); + cpus_clear(*set_mask); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - cpus_or(set_mask, set_mask, online_policy_cpus); + cpus_or(*set_mask, *set_mask, *online_policy_cpus); else - cpu_set(j, set_mask); + cpu_set(j, *set_mask); - set_cpus_allowed_ptr(current, &set_mask); + set_cpus_allowed_ptr(current, set_mask); preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { + if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { @@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - msr = centrino_model[cpu]->op_points[newstate].index; + msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); @@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - cpu_set(j, covered_cpus); + cpu_set(j, *covered_cpus); preempt_enable(); } - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(covered_cpus)) { - for_each_cpu_mask(j, covered_cpus) { + if (!cpus_empty(*covered_cpus)) + for_each_cpu_mask_nr(j, *covered_cpus) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); } - } tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, *online_policy_cpus) { freqs.cpu = j; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); + retval = 0; + goto out; migrate_end: preempt_enable(); - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); +out: + CPUMASK_FREE(allmasks); + return retval; } static struct freq_attr* centrino_attr[] = { diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 1b50244b1fdf..04d0376b64b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy, cpus_allowed = current->cpus_allowed; - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy, /* allow to be run on all CPUs */ set_cpus_allowed_ptr(current, &cpus_allowed); - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -431,7 +431,7 @@ static void __exit speedstep_exit(void) } -MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); +MODULE_AUTHOR 
("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 3fd7a67bb06a..ffd0f5ed071a 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void) setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* Load/Store Serialize to mem access disable (=reorder it) */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); + setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); /* set load/store serialize from 1GB to 4GB */ ccr3 |= 0xe0; setCx86(CX86_CCR3, ccr3); @@ -127,28 +132,11 @@ static void __cpuinit set_cx86_memwb(void) printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); -} - -static void __cpuinit set_cx86_inc(void) -{ - unsigned char ccr3; - - printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n"); - - ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - /* PCR1 -- Performance Control */ - /* Incrementor on, whatever that is */ - setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); - /* PCR0 -- Performance Control */ - /* Incrementor Margin 10 */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); } /* @@ -162,23 +150,40 @@ static void __cpuinit geode_configure(void) local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* FPU fast, DTE cache, Mem bypass */ - setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); set_cx86_reorder(); - set_cx86_inc(); local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + 
__do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -286,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); + setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); /* * GXm : 0x30 ... 0x5f GXm datasheet 51 @@ -296,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); - get_model_name(c); /* get CPU marketing name */ return; } else { /* MediaGX */ Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; @@ -309,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) if (dir1 > 7) { dir0_msn++; /* M II */ /* Enable MMX extensions (App note 108) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); } else { c->coma_bug = 1; /* 6x86MX, it has the bug. */ } @@ -424,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ local_irq_restore(flags); } @@ -434,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, + .c_x86_vendor = X86_VENDOR_CYRIX, }; -cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); +cpu_dev_register(cyrix_cpu_dev); static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, + .c_x86_vendor = X86_VENDOR_NSC, }; -cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); +cpu_dev_register(nsc_cpu_dev); diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c deleted file mode 100644 index e43ad4ad4cba..000000000000 --- a/arch/x86/kernel/cpu/feature_names.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Strings for the various x86 capability flags. - * - * This file must not contain any executable code. - */ - -#include <asm/cpufeature.h> - -/* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. 
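cpu_dev_register() above works by dropping a pointer into a dedicated init section that early boot code walks between __x86_cpu_dev_start and __x86_cpu_dev_end. A userspace model of the same trick; it relies on the __start_/__stop_ bound symbols that ELF toolchains (GCC/Clang with GNU ld or lld) generate for identifier-named sections, whereas the kernel gets its bounds from the linker script:

#include <stdio.h>

struct cpu_dev { const char *c_vendor; int c_x86_vendor; };

#define cpu_dev_register(dev)                                   \
        static const struct cpu_dev *__cpu_dev_##dev            \
        __attribute__((__section__("x86_cpu_dev_init"), __used__)) = &dev

static const struct cpu_dev demo_cpu_dev = { "DemoVendor", 42 };
cpu_dev_register(demo_cpu_dev);

/* Section bounds provided automatically by the toolchain. */
extern const struct cpu_dev *__start_x86_cpu_dev_init[];
extern const struct cpu_dev *__stop_x86_cpu_dev_init[];

int main(void)
{
        const struct cpu_dev **p;

        for (p = __start_x86_cpu_dev_init; p < __stop_x86_cpu_dev_init; p++)
                printf("registered vendor: %s (%d)\n",
                       (*p)->c_vendor, (*p)->c_x86_vendor);
        return 0;
}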
- */ -const char * const x86_cap_flags[NCAPINTS*32] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -}; - -const char *const x86_power_flags[32] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ -}; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 70609efdf1da..cce0b6118d55 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -15,6 +15,11 @@ #include <asm/ds.h> #include <asm/bugs.h> +#ifdef CONFIG_X86_64 +#include <asm/topology.h> +#include <asm/numa_64.h> +#endif + #include "cpu.h" #ifdef CONFIG_X86_LOCAL_APIC @@ -23,23 +28,22 @@ #include <mach_apic.h> #endif -#ifdef CONFIG_X86_INTEL_USERCOPY -/* - * Alignment at which movsl is preferred for bulk memory copies. 
- */ -struct movsl_mask movsl_mask __read_mostly; -#endif - static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { - /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ - if (c->x86 == 15 && c->x86_cache_alignment == 64) - c->x86_cache_alignment = 128; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#else + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +#endif } +#ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 * @@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void) return 0; } +#ifdef CONFIG_X86_F00F_BUG +static void __cpuinit trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); -/* - * P4 Xeon errata 037 workaround. - * Hardware prefetcher may cause stale data to be loaded into the cache. - */ -static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. + */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + load_idt(&idt_descr); +} +#endif + +static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; +#ifdef CONFIG_X86_F00F_BUG + /* + * All current models of Pentium and Pentium with MMX technology CPUs + * have the F0 0F bug, which lets nonprivileged users lock up the system. + * Note that the workaround only should be initialized once... + */ + c->f00f_bug = 0; + if (!paravirt_enabled() && c->x86 == 5) { + static int f00f_workaround_enabled; + + c->f00f_bug = 1; + if (!f00f_workaround_enabled) { + trap_init_f00f_bug(); + printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + f00f_workaround_enabled = 1; + } + } +#endif + + /* + * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until + * model 3 mask 3 + */ + if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + clear_cpu_cap(c, X86_FEATURE_SEP); + + /* + * P4 Xeon errata 037 workaround. + * Hardware prefetcher may cause stale data to be loaded into the cache. + */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); if ((lo & (1<<9)) == 0) { @@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); } } + + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). 
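The SEP and 11AP tests above both pack family/model/stepping into one integer so that a single compare expresses an "older than" cutoff; 0x633 is family 6, model 3, stepping 3. A worked example:

#include <stdio.h>

static unsigned int fms(unsigned int family, unsigned int model,
                        unsigned int stepping)
{
        return family << 8 | model << 4 | stepping;
}

int main(void)
{
        /* Pentium Pro (f6/m1) predates the cutoff; PII (f6/m3/s3) meets it. */
        printf("f6/m1/s9: %#x -> SEP %s\n", fms(6, 1, 9),
               fms(6, 1, 9) < 0x633 ? "cleared" : "kept");
        printf("f6/m3/s3: %#x -> SEP %s\n", fms(6, 3, 3),
               fms(6, 3, 3) < 0x633 ? "cleared" : "kept");
        return 0;
}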
+ */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); + + +#ifdef CONFIG_X86_INTEL_USERCOPY + /* + * Set up the preferred alignment for movsl bulk memory moves + */ + switch (c->x86) { + case 4: /* 486: untested */ + break; + case 5: /* Old Pentia: untested */ + break; + case 6: /* PII/PIII only like movsl with 8-byte alignment */ + movsl_mask.mask = 7; + break; + case 15: /* P4 is OK down to 8-byte alignment */ + movsl_mask.mask = 7; + break; + } +#endif + +#ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); +#endif } +#else +static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +{ +} +#endif +static void __cpuinit srat_detect_node(void) +{ +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + unsigned node; + int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE || !node_online(node)) + node = first_node(node_online_map); + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); +#endif +} /* * find out the number of processor cores on the die */ -static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; @@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -#ifdef CONFIG_X86_F00F_BUG -static void __cpuinit trap_init_f00f_bug(void) +static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. 
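intel_num_cpu_cores() relies on CPUID leaf 4: when the low five cache-type bits of EAX are non-zero, bits 31..26 hold "cores per package minus one" (the deleted intel_64.c copy further down shows the full decode). A sketch on a hypothetical raw EAX value:

#include <stdio.h>

static int cores_from_leaf4_eax(unsigned int eax)
{
        if (eax & 0x1f)                 /* cache type field != null */
                return (int)(eax >> 26) + 1;
        return 1;
}

int main(void)
{
        unsigned int eax = (3u << 26) | 0x01;   /* hypothetical: 4 cores */

        printf("cores on die: %d\n", cores_from_leaf4_eax(eax));
        return 0;
}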
- */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - load_idt(&idt_descr); + /* Intel VMX MSR indicated features */ +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + clear_cpu_cap(c, X86_FEATURE_VNMI); + clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + clear_cpu_cap(c, X86_FEATURE_EPT); + clear_cpu_cap(c, X86_FEATURE_VPID); + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } } -#endif static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; - char *p = NULL; early_init_intel(c); -#ifdef CONFIG_X86_F00F_BUG - /* - * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. - * Note that the workaround only should be initialized once... - */ - c->f00f_bug = 0; - if (!paravirt_enabled() && c->x86 == 5) { - static int f00f_workaround_enabled; - - c->f00f_bug = 1; - if (!f00f_workaround_enabled) { - trap_init_f00f_bug(); - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); - f00f_workaround_enabled = 1; - } - } -#endif + intel_workarounds(c); l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { @@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) - clear_cpu_cap(c, X86_FEATURE_SEP); + if (cpu_has_xmm2) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + if (cpu_has_ds) { + unsigned int l1; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & (1<<12))) + set_cpu_cap(c, X86_FEATURE_PEBS); + ds_init_intel(c); + } +#ifdef CONFIG_X86_64 + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else /* * Names for the Pentium II/Celeron processors * detectable only by also checking the cache size. * Dixon is NOT a Celeron. 
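detect_vmx_virtcap() below ORs the two halves of each VMX capability MSR and tests the resulting "allowed-1" feature bits; the secondary-controls MSR is only consulted when the primary one advertises it, and flexpriority requires both virtualize-APIC and TPR shadow. A sketch of the same bit tests on hypothetical MSR contents in place of real rdmsr calls:

#include <stdint.h>
#include <stdio.h>

#define PROC_CTLS_TPR_SHADOW  0x00200000u
#define PROC_CTLS_2ND_CTLS    0x80000000u
#define PROC_CTLS2_VIRT_APIC  0x00000001u
#define PROC_CTLS2_EPT        0x00000002u

int main(void)
{
        /* Hypothetical MSR halves, combined as the kernel code does. */
        uint32_t lo = 0x0401e172, hi = 0xfff9fffe;
        uint32_t msr_ctl = hi | lo;

        if (msr_ctl & PROC_CTLS_TPR_SHADOW)
                puts("tpr_shadow");
        if (msr_ctl & PROC_CTLS_2ND_CTLS) {
                uint32_t lo2 = 0, hi2 = PROC_CTLS2_VIRT_APIC | PROC_CTLS2_EPT;
                uint32_t msr_ctl2 = hi2 | lo2;

                if ((msr_ctl2 & PROC_CTLS2_VIRT_APIC) &&
                    (msr_ctl & PROC_CTLS_TPR_SHADOW))
                        puts("flexpriority");
                if (msr_ctl2 & PROC_CTLS2_EPT)
                        puts("ept");
        }
        return 0;
}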
*/ if (c->x86 == 6) { + char *p = NULL; + switch (c->x86_model) { case 5: if (c->x86_mask == 0) { @@ -178,60 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) p = "Celeron (Coppermine)"; break; } - } - if (p) - strcpy(c->x86_model_id, p); - - c->x86_max_cores = num_cpu_cores(c); - - detect_ht(c); - - /* Work around errata */ - Intel_errata_workarounds(c); - -#ifdef CONFIG_X86_INTEL_USERCOPY - /* - * Set up the preferred alignment for movsl bulk memory moves - */ - switch (c->x86) { - case 4: /* 486: untested */ - break; - case 5: /* Old Pentia: untested */ - break; - case 6: /* PII/PIII only like movsl with 8-byte alignment */ - movsl_mask.mask = 7; - break; - case 15: /* P4 is OK down to 8-byte alignment */ - movsl_mask.mask = 7; - break; + if (p) + strcpy(c->x86_model_id, p); } -#endif - if (cpu_has_xmm2) - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (c->x86 == 15) { + if (c->x86 == 15) set_cpu_cap(c, X86_FEATURE_P4); - } if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_P3); - if (cpu_has_ds) { - unsigned int l1; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - } if (cpu_has_bts) - ds_init_intel(c); + ptrace_bts_init_intel(c); -#ifdef CONFIG_X86_NUMAQ - numaq_tsc_disable(); #endif + + detect_extended_topology(c); + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + + /* Work around errata */ + srat_detect_node(); + + if (cpu_has(c, X86_FEATURE_VMX)) + detect_vmx_virtcap(c); } +#ifdef CONFIG_X86_32 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* @@ -244,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i size = 256; return size; } +#endif static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, +#ifdef CONFIG_X86_32 .c_models = { { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = { @@ -297,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { } }, }, + .c_size_cache = intel_size_cache, +#endif .c_early_init = early_init_intel, .c_init = init_intel, - .c_size_cache = intel_size_cache, + .c_x86_vendor = X86_VENDOR_INTEL, }; -cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. 
Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - -/* arch_initcall(intel_cpu_init); */ +cpu_dev_register(intel_cpu_dev); diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c deleted file mode 100644 index 1019c58d39f0..000000000000 --- a/arch/x86/kernel/cpu/intel_64.c +++ /dev/null @@ -1,95 +0,0 @@ -#include <linux/init.h> -#include <linux/smp.h> -#include <asm/processor.h> -#include <asm/ptrace.h> -#include <asm/topology.h> -#include <asm/numa_64.h> - -#include "cpu.h" - -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) -{ - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -} - -/* - * find out the number of processor cores on the die - */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, t; - - if (c->cpuid_level < 4) - return 1; - - cpuid_count(4, 0, &eax, &t, &t, &t); - - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; -} - -static void __cpuinit srat_detect_node(void) -{ -#ifdef CONFIG_NUMA - unsigned node; - int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); - - /* Don't do the funky fallback heuristics the AMD version employs - for now. */ - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) - node = first_node(node_online_map); - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -} - -static void __cpuinit init_intel(struct cpuinfo_x86 *c) -{ - init_intel_cacheinfo(c); - if (c->cpuid_level > 9) { - unsigned eax = cpuid_eax(10); - /* Check for version and the number of counters */ - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - } - - if (cpu_has_ds) { - unsigned int l1, l2; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - } - - - if (cpu_has_bts) - ds_init_intel(c); - - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - c->x86_max_cores = intel_num_cpu_cores(c); - - srat_detect_node(); -} - -static struct cpu_dev intel_cpu_dev __cpuinitdata = { - .c_vendor = "Intel", - .c_ident = { "GenuineIntel" }, - .c_early_init = early_init_intel, - .c_init = init_intel, -}; -cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 2c8afafa18e8..3f46afbb1cf1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1,8 +1,8 @@ /* - * Routines to indentify caches on Intel CPU. + * Routines to indentify caches on Intel CPU. 
* - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ @@ -13,6 +13,7 @@ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/pci.h> #include <asm/processor.h> #include <asm/smp.h> @@ -130,9 +131,18 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; + unsigned long can_disable; cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ }; +#ifdef CONFIG_PCI +static struct pci_device_id k8_nb_id[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} +}; +#endif + unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = { static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; -static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void __cpuinit +amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) { unsigned dummy; unsigned line_size, lines_per_tag, assoc, size_in_kb; @@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } -static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) +{ + if (index < 3) + return; + this_leaf->can_disable = 1; +} + +static int +__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned edx; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - else - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + if (boot_cpu_data.x86 >= 0x10) + amd_check_l3_disable(index, this_leaf); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } + if (eax.split.type == CACHE_TYPE_NULL) return -EIO; /* better error ? 
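
cpuid4_cache_lookup() above fills a leaf either from the native CPUID instruction (Intel) or from the amd_cpuid4() emulation, and the size computed just below is the product of four geometry fields, each encoded as its value minus one. A self-contained check of that arithmetic against the raw leaf-4 registers (again assuming GCC/Clang's <cpuid.h>; index 0 is usually the L1 data cache):

#include <stdio.h>
#include <cpuid.h>      /* GCC/Clang: __get_cpuid_max(), __cpuid_count() */

/* Recompute the size expression used by cpuid4_cache_lookup(): every
 * geometry field is stored as (value - 1), and the cache size is
 * simply their product. */
int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int ways, partitions, line_size, sets;

        if (__get_cpuid_max(0, NULL) < 4)
                return 1;
        __cpuid_count(4, 0, eax, ebx, ecx, edx);
        if ((eax & 0x1f) == 0)          /* CACHE_TYPE_NULL */
                return 1;

        ways       = ((ebx >> 22) & 0x3ff) + 1; /* ways_of_associativity */
        partitions = ((ebx >> 12) & 0x3ff) + 1; /* physical_line_partition */
        line_size  = (ebx & 0xfff) + 1;         /* coherency_line_size */
        sets       = ecx + 1;                   /* number_of_sets */

        printf("L%u cache: %u bytes = %u ways * %u partitions * %u B lines * %u sets\n",
               (eax >> 5) & 0x7, ways * partitions * line_size * sets,
               ways, partitions, line_size, sets);
        return 0;
}
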
*/ this_leaf->eax = eax; this_leaf->ebx = ebx; this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } @@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* pointer to _cpuid4_info array (for each cache leaf) */ static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) +#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) @@ -489,8 +513,8 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) int sibling; this_leaf = CPUID4_INFO_IDX(cpu, index); - for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { - sibling_leaf = CPUID4_INFO_IDX(sibling, index); + for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { + sibling_leaf = CPUID4_INFO_IDX(sibling, index); cpu_clear(cpu, sibling_leaf->shared_cpu_map); } } @@ -572,7 +596,7 @@ struct _index_kobject { /* pointer to array of kobjects for cpuX/cache/indexY */ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) +#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ static ssize_t show_##file_name \ @@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { } } +#define to_object(k) container_of(k, struct _index_kobject, kobj) +#define to_attr(a) container_of(a, struct _cache_attr, attr) + +#ifdef CONFIG_PCI +static struct pci_dev *get_k8_northbridge(int node) +{ + struct pci_dev *dev = NULL; + int i; + + for (i = 0; i <= node; i++) { + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_id[0], dev)); + if (!dev) + break; + } + return dev; +} +#else +static struct pci_dev *get_k8_northbridge(int node) +{ + return NULL; +} +#endif + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +{ + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + struct pci_dev *dev = NULL; + ssize_t ret = 0; + int i; + + if (!this_leaf->can_disable) + return sprintf(buf, "Feature not enabled\n"); + + dev = get_k8_northbridge(node); + if (!dev) { + printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + return -EINVAL; + } + + for (i = 0; i < 2; i++) { + unsigned int reg; + + pci_read_config_dword(dev, 0x1BC + i * 4, &reg); + + ret += sprintf(buf, "%sEntry: %d\n", buf, i); + ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", + buf, + reg & 0x80000000 ? "Disabled" : "Allowed", + reg & 0x40000000 ?
"Disabled" : "Allowed"); + ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", + buf, (reg & 0x30000) >> 16, reg & 0xfff); + } + return ret; +} + +static ssize_t +store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, + size_t count) +{ + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + struct pci_dev *dev = NULL; + unsigned int ret, index, val; + + if (!this_leaf->can_disable) + return 0; + + if (strlen(buf) > 15) + return -EINVAL; + + ret = sscanf(buf, "%x %x", &index, &val); + if (ret != 2) + return -EINVAL; + if (index > 1) + return -EINVAL; + + val |= 0xc0000000; + dev = get_k8_northbridge(node); + if (!dev) { + printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + return -EINVAL; + } + + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); + wbinvd(); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + + return 1; +} + struct _cache_attr { struct attribute attr; ssize_t (*show)(struct _cpuid4_info *, char *); @@ -657,6 +774,8 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); +static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); + static struct attribute * default_attrs[] = { &type.attr, &level.attr, @@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, + &cache_disable.attr, NULL }; -#define to_object(k) container_of(k, struct _index_kobject, kobj) -#define to_attr(a) container_of(a, struct _cache_attr, attr) - static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) { struct _cache_attr *fattr = to_attr(attr); @@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), buf) : - 0; + 0; return ret; } static ssize_t store(struct kobject * kobj, struct attribute * attr, const char * buf, size_t count) { - return 0; + struct _cache_attr *fattr = to_attr(attr); + struct _index_kobject *this_leaf = to_object(kobj); + ssize_t ret; + + ret = fattr->store ? 
+ fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), + buf, count) : + 0; + return ret; } static struct sysfs_ops sysfs_ops = { @@ -780,15 +905,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); - break; + return retval; } kobject_uevent(&(this_object->kobj), KOBJ_ADD); } - if (!retval) - cpu_set(cpu, cache_dev_map); + cpu_set(cpu, cache_dev_map); kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); - return retval; + return 0; } static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index f390c9f66351..dd3af6e7b39a 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -1,6 +1,6 @@ /* - * Athlon/Hammer specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> + * Athlon specific Machine Check Exception Reporting + * (C) Copyright 2002 Dave Jones <davej@redhat.com> */ #include <linux/init.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 774d87cfd8cd..0ebf3fc6a610 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -1,6 +1,6 @@ /* * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> + * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com> */ #include <linux/init.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index c4a7ec31394c..4b031a4ac856 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, char __user *buf = ubuf; int i, err; - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); if (!cpu_tsc) return -ENOMEM; @@ -759,13 +759,18 @@ static struct sysdev_class mce_sysclass = { }; DEFINE_PER_CPU(struct sys_device, device_mce); +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; /* Why are there no generic functions for this? 
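
The ACCESSOR() macro that follows this comment stamps out one show/set pair per writable MSR bank control. Hand-expanding ACCESSOR(bank0ctl, bank[0], mce_restart()) with the struct sysdev_attribute * parameter this patch threads through gives roughly the code below; the macro's tail (store the parsed value, run the start hook, return the consumed length) lies outside the quoted hunk and is reconstructed from context:

/* Approximate expansion of ACCESSOR(bank0ctl, bank[0], mce_restart())
 * after this patch; kernel-context sketch, not compilable on its own.
 * The attr argument is unused by the body; it exists only so these
 * handlers match the new sysdev_attribute-based prototypes. */
static ssize_t show_bank0ctl(struct sys_device *s,
                             struct sysdev_attribute *attr,
                             char *buf)
{
        return sprintf(buf, "%lx\n", (unsigned long)bank[0]);
}

static ssize_t set_bank0ctl(struct sys_device *s,
                            struct sysdev_attribute *attr,
                            const char *buf, size_t siz)
{
        char *end;
        unsigned long new = simple_strtoul(buf, &end, 0);

        if (end == buf)
                return -EINVAL;
        bank[0] = new;          /* the "var" argument */
        mce_restart();          /* the "start" argument */
        return end - buf;       /* reconstructed tail */
}

Judging by the &attr_bank0ctl references in mce_attributes[] below, the macro also emits a matching SYSDEV_ATTR() definition outside the quoted hunk.
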
*/ #define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + static ssize_t show_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + char *buf) { \ return sprintf(buf, "%lx\n", (unsigned long)var); \ } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + static ssize_t set_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ if (end == buf) return -EINVAL; \ @@ -786,14 +791,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static ssize_t show_trigger(struct sys_device *s, char *buf) +static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); return strlen(trigger) + 1; } -static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf,size_t siz) { char *p; int len; @@ -806,12 +813,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -ACCESSOR(tolerant,tolerant,) +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, - &attr_tolerant, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -853,7 +860,7 @@ error: return err; } -static void mce_remove_device(unsigned int cpu) +static __cpuinit void mce_remove_device(unsigned int cpu) { int i; @@ -877,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 7c9a813e1193..5eb390a4b2e9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out_free; - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; @@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #endif /* remove all sibling symlinks before unregistering */ - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; @@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: + kobject_del(b->kobj); kobject_put(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; @@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, + unsigned int cpu) { - /* cpu was unsigned int to begin with */ - unsigned int 
cpu = (unsigned long)hcpu; - if (cpu >= NR_CPUS) - goto out; + return; switch (action) { case CPU_ONLINE: @@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, default: break; } - out: - return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier __cpuinitdata = { - .notifier_call = threshold_cpu_callback, -}; - static __init int threshold_init_device(void) { unsigned lcpu = 0; @@ -684,7 +676,7 @@ static __init int threshold_init_device(void) if (err) return err; } - register_hotcpu_notifier(&threshold_cpu_notifier); + threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index cc1fccdd31e0..a74af128efc9 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -1,7 +1,7 @@ /* * Non Fatal Machine Check Exception Reporting * - * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> + * (C) Copyright 2002 Dave Jones. <davej@redhat.com> * * This file contains routines to check for non-fatal MCEs every 15s * diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index eef001ad3bde..9b60fce09f75 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* The temperature transition interrupt handler setup */ h = THERMAL_APIC_VECTOR; /* our delivery vector */ h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write_around(APIC_LVTTHMR, h); + apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); @@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); l = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1f4cc48c14c6..d5ae2243f0b9 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0); #define define_therm_throt_sysdev_show_func(name) \ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl new file mode 100644 index 000000000000..dfea390e1608 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -0,0 +1,32 @@ +#!/usr/bin/perl +# +# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h +# + +($in, $out) = @ARGV; + +open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; +open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; + +print OUT "#include <asm/cpufeature.h>\n\n"; +print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; + +while (defined($line = <IN>)) { + if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { + $macro = $1; + $feature = $2; + $tail = $3; + if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { + $feature = $1; + } + + if ($feature ne '') { + printf OUT "\t%-32s = \"%s\",\n", + "[$macro]", "\L$feature"; + } + } +} +print OUT "};\n"; + +close(IN); +close(OUT); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c 
b/arch/x86/kernel/cpu/mtrr/generic.c index 509bd3d9eacd..4e8d77f01eeb 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { unsigned int mask_lo, mask_hi, base_lo, base_hi; + unsigned int tmp, hi; rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { @@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | tmp; + /* Expand tmp with high bits to all 1s*/ + hi = fls(tmp); + if (hi > 0) { + tmp |= ~((1<<(hi - 1)) - 1); + + if (tmp != mask_lo) { + WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + mask_lo = tmp; + } + } /* This works correctly if size is a power of two, i.e. a contiguous range. */ diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 84c480bb3715..4c4214690dd1 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, - "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", + "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_attrib_to_str(type), mtrr_usage_table[i]); + mtrr_usage_table[i], mtrr_attrib_to_str(type)); } } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6f23969c8faf..c78c04821ea1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -729,7 +729,7 @@ struct var_mtrr_range_state { mtrr_type type; }; -struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; static int __init @@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, /* take out UC ranges */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; - if (type != MTRR_TYPE_UNCACHABLE) + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) continue; size = range_state[i].size_pfn; if (!size) @@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str) enable_mtrr_cleanup = 1; return 0; } -early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); + +static int __init mtrr_cleanup_debug_setup(char *str) +{ + debug_print = 1; + return 0; +} +early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); struct var_mtrr_state { unsigned long range_startk; @@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits) } } +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + char factor; + unsigned long base = sizek; + + if (base & ((1<<10) - 1)) { + /* not MB alignment */ + factor = 'K'; + } else if (base & ((1<<20) - 1)){ + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + static unsigned int __init range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long range_sizek, unsigned char type) @@ -919,13 +948,21 
@@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1 << align; - if (debug_print) + if (debug_print) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, &start_factor), + size_base = to_size_factor(sizek, &size_factor), + printk(KERN_DEBUG "Setting variable MTRR %d, " - "base: %ldMB, range: %ldMB, type %s\n", - reg, range_startk >> 10, sizek >> 10, + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE)?"UC": ((type == MTRR_TYPE_WRBACK)?"WB":"Other") ); + } save_var_mtrr(reg++, range_startk, sizek, type); range_startk += sizek; range_sizek -= sizek; @@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, /* try to append some small hole */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* no increase */ if (range0_sizek == state->range_sizek) { if (debug_print) printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", @@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, return 0; } - range0_sizek -= chunk_sizek; - if (range0_sizek && sizek) { - while (range0_basek + range0_sizek > (basek + sizek)) { - range0_sizek -= chunk_sizek; - if (!range0_sizek) - break; - } + /* only cut back, when it is not the last */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + + if (!range0_sizek) + break; + } + } + +second_try: + range_basek = range0_basek + range0_sizek; + + /* one hole in the middle */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; + + if (range0_sizek > state->range_sizek) { + + /* one hole in middle or at end */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + + /* hole size should be less than half of range0 size */ + if (hole_sizek >= (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; + } } if (range0_sizek) { @@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); - - } - - range_basek = range0_basek + range0_sizek; - range_sizek = chunk_sizek; - - if (range_basek + range_sizek > basek && - range_basek + range_sizek <= (basek + sizek)) { - /* one hole */ - second_basek = basek; - second_sizek = range_basek + range_sizek - basek; } - /* if last piece, only could one hole near end */ - if ((second_basek || !basek) && - range_sizek - (state->range_sizek - range0_sizek) - second_sizek < - (chunk_sizek >> 1)) { - /* - * one hole in middle (second_sizek is 0) or at end - * (second_sizek is 0 ) - */ - hole_sizek = range_sizek - (state->range_sizek - range0_sizek) - - second_sizek; - hole_basek = range_basek + range_sizek - hole_sizek - - second_sizek; - } else { - /* fallback for big hole, or several holes */ + if (range0_sizek < state->range_sizek) { + /* need to handle left over */ range_sizek = state->range_sizek - range0_sizek; - second_basek = 0; - second_sizek = 0; + + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, 
range_basek, + range_sizek, MTRR_TYPE_WRBACK); } - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, - (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, - MTRR_TYPE_WRBACK); if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; if (debug_print) printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, - MTRR_TYPE_UNCACHABLE); - + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); } return second_sizek; @@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result { }; /* - * gran_size: 1M, 2M, ..., 2G - * chunk size: gran_size, ..., 4G - * so we need (2+13)*6 + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 2G + * so we need (1+16)*8 */ -#define NUM_RESULT 90 +#define NUM_RESULT 136 #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; @@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static int __init mtrr_cleanup(unsigned address_bits) { unsigned long extra_remove_base, extra_remove_size; - unsigned long i, base, size, def, dummy; + unsigned long base, size, def, dummy; mtrr_type type; int nr_range, nr_range_new; u64 chunk_size, gran_size; unsigned long range_sums, range_sums_new; int index_good; int num_reg_good; + int i; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits) continue; if (!size) type = MTRR_NUM_TYPES; + if (type == MTRR_TYPE_WRPROT) + type = MTRR_TYPE_UNCACHABLE; num[type]++; } @@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits) num_var_ranges - num[MTRR_NUM_TYPES]) return 0; + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + for (i = 0; i < num_var_ranges; i++) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? 
"WB" : "Other")) + ); + } + memset(range, 0, sizeof(range)); extra_remove_size = 0; - if (mtrr_tom2) { - extra_remove_base = 1 << (32 - PAGE_SHIFT); + extra_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) extra_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - } nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); + /* + * [0, 1M) should always be coverred by var mtrr with WB + * and fixed mtrrs should take effective before var mtrr for it + */ + nr_range = add_range_with_merge(range, nr_range, 0, + (1ULL<<(20 - PAGE_SHIFT)) - 1); + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM coverred: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { int num_reg; + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; - debug_print = 1; + debug_print++; /* convert ranges to var ranges state */ num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, mtrr_gran_size); @@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits) result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad?"*BAD*":" ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", result[i].num_reg, result[i].bad?"-":"", - result[i].lose_cover_sizek >> 10); + lose_base, lose_factor); if (!result[i].bad) { set_var_mtrr_all(address_bits); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); - debug_print = 0; + debug_print--; memset(result, 0, sizeof(result[0])); } i = 0; memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); - for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { - for (chunk_size = gran_size; chunk_size < (1ULL<<33); + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { + char gran_factor; + unsigned long gran_base; + + if (debug_print) + gran_base = to_size_factor(gran_size >> 10, &gran_factor); + + for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { int num_reg; - if (debug_print) - printk(KERN_INFO - "\ngran_size: %lldM chunk_size_size: %lldM\n", - gran_size >> 20, chunk_size >> 20); + if (debug_print) { + char chunk_factor; + unsigned long chunk_base; + + chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), + printk(KERN_INFO "\n"); + printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n", + gran_base, gran_factor, chunk_base, chunk_factor); + } if (i >= NUM_RESULT) continue; @@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits) /* print out all */ for (i = 0; i < NUM_RESULT; i++) { - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", - result[i].num_reg, 
result[i].bad?"-":"", - result[i].lose_cover_sizek >> 10); + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad?"*BAD*":" ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad?"-":"", + lose_base, lose_factor); } /* try to find the optimal index */ @@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits) nr_mtrr_spare_reg = num_var_ranges - 1; num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) { + if (!min_loss_pfn[i]) num_reg_good = i; - break; - } } index_good = -1; @@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits) } if (index_good != -1) { + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", - result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", - result[i].num_reg, - result[i].lose_cover_sizek >> 10); + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", + result[i].num_reg, lose_base, lose_factor); /* convert ranges to var ranges state */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; - debug_print = 1; + debug_print++; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + debug_print--; set_var_mtrr_all(address_bits); return 1; } @@ -1496,11 +1600,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - if (!kvm_para_available()) { - printk(KERN_WARNING + WARN(!kvm_para_available(), KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); - } return 0; } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6d4bdc02388a..9abd48b22674 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -17,6 +17,8 @@ #include <linux/bitops.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <linux/kprobes.h> + #include <asm/apic.h> #include <asm/intel_arch_perfmon.h> @@ -250,7 +252,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -261,7 +263,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } @@ -295,13 +297,19 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup 
the timer */ wrmsr(evntsel_msr, evntsel, 0); write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); + /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + return 1; } @@ -330,7 +338,8 @@ static void single_msr_unreserve(void) release_perfctr_nmi(wd_ops->perfctr); } -static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes +single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* start the cycle over again */ write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); @@ -379,17 +388,23 @@ static int setup_p6_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); + /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= P6_EVNTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + return 1; } -static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* * P6 based Pentium M need to re-unmask @@ -432,6 +447,27 @@ static const struct wd_ops p6_wd_ops = { #define P4_CCCR_ENABLE (1 << 12) #define P4_CCCR_OVF (1 << 31) +#define P4_CONTROLS 18 +static unsigned int p4_controls[18] = { + MSR_P4_BPU_CCCR0, + MSR_P4_BPU_CCCR1, + MSR_P4_BPU_CCCR2, + MSR_P4_BPU_CCCR3, + MSR_P4_MS_CCCR0, + MSR_P4_MS_CCCR1, + MSR_P4_MS_CCCR2, + MSR_P4_MS_CCCR3, + MSR_P4_FLAME_CCCR0, + MSR_P4_FLAME_CCCR1, + MSR_P4_FLAME_CCCR2, + MSR_P4_FLAME_CCCR3, + MSR_P4_IQ_CCCR0, + MSR_P4_IQ_CCCR1, + MSR_P4_IQ_CCCR2, + MSR_P4_IQ_CCCR3, + MSR_P4_IQ_CCCR4, + MSR_P4_IQ_CCCR5, +}; /* * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter * CRU_ESCR0 (with any non-null event selector) through a complemented @@ -473,12 +509,38 @@ static int setup_p4_watchdog(unsigned nmi_hz) evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR0; cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + + /* + * If we're on the kdump kernel or other situation, we may + * still have other performance counter registers set to + * interrupt and they'll keep interrupting forever because + * of the P4_CCCR_OVF quirk. So we need to ACK all the + * pending interrupts and disable all the registers here, + * before reenabling the NMI delivery. Refer to p4_rearm() + * about the P4_CCCR_OVF quirk. 
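
Besides the kdump cleanup the comment above describes (its reset_devices loop follows in the next hunk), the change repeated across setup_k7_watchdog(), setup_p6_watchdog() and setup_intel_arch_watchdog() in this file is an ordering fix: the per-cpu wd state and the cpu_nmi_set_wd_enabled() announcement must be published before APIC_LVTPC is unmasked, because an NMI may fire as soon as that write lands. A hypothetical helper condensing the sequence (the P4 path enables via its CCCR register instead, but in the same order):

/* Hypothetical helper showing the order every setup path now follows;
 * kernel-context sketch, not actual code from this patch. */
static void wd_publish_then_arm(struct nmi_watchdog_ctlblk *wd,
                                unsigned int perfctr_msr,
                                unsigned int evntsel_msr,
                                unsigned int evntsel,
                                unsigned int enable_bit)
{
        /* 1: state the NMI handler reads must be complete first */
        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;

        /* 2: announce the watchdog before interrupts can arrive */
        cpu_nmi_set_wd_enabled();

        /* 3: only now unmask the LVT entry and start the counter */
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        wrmsr(evntsel_msr, evntsel | enable_bit, 0);
}

In the patch itself the three steps are simply reordered inline in each setup function; no such helper is introduced.
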
+ */ + if (reset_devices) { + unsigned int low, high; + int i; + + for (i = 0; i < P4_CONTROLS; i++) { + rdmsr(p4_controls[i], low, high); + low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); + wrmsr(p4_controls[i], low, high); + } + } } else { /* logical cpu 1 */ perfctr_msr = MSR_P4_IQ_PERFCTR1; evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + + /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ + if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) + cccr_val = P4_CCCR_OVF_PMI0; + else + cccr_val = P4_CCCR_OVF_PMI1; + cccr_val |= P4_CCCR_ESCR_SELECT(4); } evntsel = P4_ESCR_EVENT_SELECT(0x3F) @@ -493,12 +555,17 @@ static int setup_p4_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); wrmsr(cccr_msr, cccr_val, 0); write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = cccr_msr; + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); return 1; } @@ -541,7 +608,7 @@ static void p4_unreserve(void) release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); } -static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* @@ -614,13 +681,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; } @@ -716,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz) return hz; } -int lapic_wd_event(unsigned nmi_hz) +int __kprobes lapic_wd_event(unsigned nmi_hz) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 ctr; diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c new file mode 100644 index 000000000000..5abbea297e0c --- /dev/null +++ b/arch/x86/kernel/cpu/powerflags.c @@ -0,0 +1,20 @@ +/* + * Strings for the various x86 power flags + * + * This file must not contain any executable code. 
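
The x86_power_flags[] table defined just below is indexed by bit position within the power-management word reported in EDX of CPUID leaf 0x80000007. A user-space sketch of the kind of consumer the table exists for, modeled loosely on the "power management" line of /proc/cpuinfo (private copy of the strings, hypothetical sample value):

#include <stdio.h>

/* Private copy of the table below, for illustration only. */
static const char *const power_flags[32] = {
        "ts", "fid", "vid", "ttp", "tm", "stc", "100mhzsteps", "hwpstate",
        "",     /* tsc invariant mapped to constant_tsc */
};

/* Print each set bit of a CPUID 0x80000007 EDX value by name; bits
 * the table leaves unnamed fall back to their index. */
static void print_power_flags(unsigned int edx)
{
        int i;

        for (i = 0; i < 32; i++) {
                if (!(edx & (1U << i)))
                        continue;
                if (power_flags[i] && power_flags[i][0])
                        printf(" %s", power_flags[i]);
                else
                        printf(" [%d]", i);
        }
        putchar('\n');
}

int main(void)
{
        print_power_flags(0x1f);        /* -> " ts fid vid ttp tm" */
        return 0;
}
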
+ */ + +#include <asm/cpufeature.h> + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc", + "100mhzsteps", + "hwpstate", + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ +}; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 0d0d9057e7c0..01b1244ef1c0 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = first_cpu(cpu_online_map); - if ((*pos) < NR_CPUS && cpu_online(*pos)) + else + *pos = next_cpu_nr(*pos - 1, cpu_online_map); + if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - *pos = next_cpu(*pos, cpu_online_map); + (*pos)++; return c_start(m, pos); } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index b911a2c61b8f..52b3fefbd5af 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -5,6 +5,18 @@ #include <asm/msr.h> #include "cpu.h" +static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) +{ + u32 xlvl; + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } +} + static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; @@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; char cpu_info[65]; - get_model_name(c); /* Same as AMD/Cyrix */ + early_init_transmeta(c); + display_cacheinfo(c); /* Print CMS and CPU revision */ @@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c) -{ - u32 xlvl; - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } -} - static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_early_init = early_init_transmeta, .c_init = init_transmeta, - .c_identify = transmeta_identify, + .c_x86_vendor = X86_VENDOR_TRANSMETA, }; -cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); +cpu_dev_register(transmeta_cpu_dev); diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index b1fc90989d75..e777f79e0960 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = { } }, }, + .c_x86_vendor = X86_VENDOR_UMC, }; -cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); +cpu_dev_register(umc_cpu_dev); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2de5fa2bbf77..72cefd1e649b 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -36,7 +36,6 @@ #include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include <linux/device.h> #include <linux/cpu.h> #include <linux/notifier.h> @@ -89,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, struct cpuid_regs cmd; int cpu = 
iminor(file->f_path.dentry->d_inode); u64 pos = *ppos; + ssize_t bytes = 0; + int err = 0; if (count % 16) return -EINVAL; /* Invalid chunk size */ @@ -96,14 +97,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); - if (copy_to_user(tmp, &cmd, 16)) - return -EFAULT; + err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); + if (err) + break; + if (copy_to_user(tmp, &cmd, 16)) { + err = -EFAULT; + break; + } tmp += 16; + bytes += 16; *ppos = ++pos; } - return tmp - buf; + return bytes ? bytes : err; } static int cpuid_open(struct inode *inode, struct file *file) @@ -141,7 +147,7 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 72d0c56c1b48..f7cdb3b457aa 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,6 +13,9 @@ static void *kdump_buf_page; +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 15e6c6bc4a46..045b36cada65 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -7,9 +7,11 @@ #include <linux/errno.h> #include <linux/crash_dump.h> +#include <linux/uaccess.h> +#include <linux/io.h> -#include <asm/uaccess.h> -#include <asm/io.h> +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; /** * copy_oldmem_page - copy one page from "oldmem" @@ -25,7 +27,7 @@ * in the current kernel. We stitch up a pte, similar to kmap_atomic. */ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) + size_t csize, unsigned long offset, int userbuf) { void *vaddr; @@ -33,14 +35,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return 0; vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!vaddr) + return -ENOMEM; if (userbuf) { - if (copy_to_user(buf, (vaddr + offset), csize)) { + if (copy_to_user(buf, vaddr + offset, csize)) { iounmap(vaddr); return -EFAULT; } } else - memcpy(buf, (vaddr + offset), csize); + memcpy(buf, vaddr + offset, csize); iounmap(vaddr); return csize; diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index a47798b59f07..b4f14c6c09d9 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { .ds = __USER_DS, .fs = __KERNEL_PERCPU, - .__cr3 = __pa(swapper_pg_dir) + .__cr3 = __pa_nodebug(swapper_pg_dir), } }; diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 11c11b8ec48d..2b69994fd3a8 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -2,26 +2,49 @@ * Debug Store support * * This provides a low-level interface to the hardware's Debug Store - * feature that is used for last branch recording (LBR) and + * feature that is used for branch trace store (BTS) and * precise-event based sampling (PEBS). 
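
The cpuid_read() rework a few hunks up is a reusable pattern for chunked device reads: accumulate whole records, stop at the first failure, and let partial progress win over a late error. A self-contained sketch, with a hypothetical fetch_chunk() standing in for the smp_call_function_single()/copy_to_user() pair:

#include <errno.h>
#include <string.h>
#include <stdio.h>

#define CHUNK 16        /* cpuid_read() also works in 16-byte records */

/* Hypothetical record source; fails from the 5th record onward. */
static int fetch_chunk(char *dst, unsigned long pos)
{
        if (pos >= 4)
                return -EIO;
        memset(dst, 'a' + (int)pos, CHUNK);
        return 0;
}

static long chunked_read(char *buf, size_t count, unsigned long *ppos)
{
        long bytes = 0;
        int err = 0;

        if (count % CHUNK)
                return -EINVAL;         /* whole records only */

        for (; count; count -= CHUNK) {
                err = fetch_chunk(buf + bytes, *ppos);
                if (err)
                        break;
                bytes += CHUNK;
                (*ppos)++;
        }
        return bytes ? bytes : err;     /* partial success wins */
}

int main(void)
{
        char buf[6 * CHUNK];
        unsigned long pos = 0;

        printf("read 1: %ld\n", chunked_read(buf, sizeof(buf), &pos)); /* 64 */
        printf("read 2: %ld\n", chunked_read(buf, sizeof(buf), &pos)); /* -EIO, i.e. -5 on Linux */
        return 0;
}
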
* - * Different architectures use a different DS layout/pointer size. - * The below functions therefore work on a void*. + * It manages: + * - per-thread and per-cpu allocation of BTS and PEBS + * - buffer memory allocation (optional) + * - buffer overflow handling + * - buffer access * + * It assumes: + * - get_task_struct on all parameter tasks + * - current is allowed to trace parameter tasks * - * Since there is no user for PEBS, yet, only LBR (or branch - * trace store, BTS) is supported. * - * - * Copyright (C) 2007 Intel Corporation. - * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + * Copyright (C) 2007-2008 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 */ + +#ifdef CONFIG_X86_DS + #include <asm/ds.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mm.h> + + +/* + * The configuration for a particular DS hardware implementation. + */ +struct ds_configuration { + /* the size of the DS structure in bytes */ + unsigned char sizeof_ds; + /* the size of one pointer-typed field in the DS structure in bytes; + this covers the first 8 fields related to buffer management. */ + unsigned char sizeof_field; + /* the size of a BTS/PEBS record in bytes */ + unsigned char sizeof_rec[2]; +}; +static struct ds_configuration ds_cfg; /* @@ -44,378 +67,747 @@ * (interrupt occurs when write pointer passes interrupt pointer) * - value to which counter is reset following counter overflow * - * On later architectures, the last branch recording hardware uses - * 64bit pointers even in 32bit mode. - * - * - * Branch Trace Store (BTS) records store information about control - * flow changes. They at least provide the following information: - * - source linear address - * - destination linear address + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. * - * Netburst supported a predicated bit that had been dropped in later - * architectures. We do not suppor it. * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * - an offset giving the start of the respective region * - * In order to abstract from the actual DS and BTS layout, we describe - * the access to the relevant fields. - * Thanks to Andi Kleen for proposing this design. + * This offset is further used to index various arrays holding + * information for BTS and PEBS at the respective index. * - * The implementation, however, is not as general as it might seem. In - * order to stay somewhat simple and efficient, we assume an - * underlying unsigned type (mostly a pointer type) and we expect the - * field to be at least as big as that type. + * On later 32bit processors, we only access the lower 32bit of the + * 64bit pointer fields. The upper halves will be zeroed out. */ -/* - * A special from_ip address to indicate that the BTS record is an - * info record that needs to be interpreted or skipped. 
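
The ds_get()/ds_set() helpers introduced above replace the long list of per-field accessors that the rest of this hunk deletes: the DS area holds four pointer-sized buffer-management fields for BTS followed by four for PEBS, so a (qualifier, field) pair reduces to a single multiply. A quick user-space check of the offsets, assuming the 64-bit layout where sizeof_field is 8 (the 32-bit layout would use 4):

#include <stdio.h>

/* Mirror of the new ds_get()/ds_set() offset computation: field
 * 'field' of region 'qual' (0 = BTS, 1 = PEBS) lives at
 * sizeof_field * (field + 4 * qual) bytes from the DS base. */
enum ds_field { ds_buffer_base, ds_index, ds_absolute_maximum,
                ds_interrupt_threshold };
enum ds_qualifier { ds_bts, ds_pebs };

static unsigned int ds_offset(unsigned int sizeof_field,
                              enum ds_qualifier qual, enum ds_field field)
{
        return sizeof_field * (field + 4 * qual);
}

int main(void)
{
        /* BTS fields at 0, 8, 16, 24; PEBS fields at 32, 40, 48, 56 */
        printf("bts index:   %u\n", ds_offset(8, ds_bts, ds_index));            /* 8 */
        printf("pebs base:   %u\n", ds_offset(8, ds_pebs, ds_buffer_base));     /* 32 */
        printf("pebs thresh: %u\n", ds_offset(8, ds_pebs, ds_interrupt_threshold)); /* 56 */
        return 0;
}
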
- */ -#define BTS_ESCAPE_ADDRESS (-1) +enum ds_field { + ds_buffer_base = 0, + ds_index, + ds_absolute_maximum, + ds_interrupt_threshold, +}; -/* - * A field access descriptor - */ -struct access_desc { - unsigned char offset; - unsigned char size; +enum ds_qualifier { + ds_bts = 0, + ds_pebs }; +static inline unsigned long ds_get(const unsigned char *base, + enum ds_qualifier qual, enum ds_field field) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + return *(unsigned long *)base; +} + +static inline void ds_set(unsigned char *base, enum ds_qualifier qual, + enum ds_field field, unsigned long value) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + (*(unsigned long *)base) = value; +} + + /* - * The configuration for a particular DS/BTS hardware implementation. + * Locking is done only for allocating BTS or PEBS resources and for + * guarding context and buffer memory allocation. + * + * Most functions require the current task to own the ds context part + * they are going to access. All the locking is done when validating + * access to the context. */ -struct ds_configuration { - /* the DS configuration */ - unsigned char sizeof_ds; - struct access_desc bts_buffer_base; - struct access_desc bts_index; - struct access_desc bts_absolute_maximum; - struct access_desc bts_interrupt_threshold; - /* the BTS configuration */ - unsigned char sizeof_bts; - struct access_desc from_ip; - struct access_desc to_ip; - /* BTS variants used to store additional information like - timestamps */ - struct access_desc info_type; - struct access_desc info_data; - unsigned long debugctl_mask; -}; +static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); /* - * The global configuration used by the below accessor functions + * Validate that the current task is allowed to access the BTS/PEBS + * buffer of the parameter task. + * + * Returns 0, if access is granted; -Eerrno, otherwise. */ -static struct ds_configuration ds_cfg; +static inline int ds_validate_access(struct ds_context *context, + enum ds_qualifier qual) +{ + if (!context) + return -EPERM; + + if (context->owner[qual] == current) + return 0; + + return -EPERM; +} + /* - * Accessor functions for some DS and BTS fields using the above - * global ptrace_bts_cfg. + * We either support (system-wide) per-cpu or per-thread allocation. + * We distinguish the two based on the task_struct pointer, where a + * NULL pointer indicates per-cpu allocation for the current cpu. + * + * Allocations are use-counted. As soon as resources are allocated, + * further allocations must be of the same type (per-cpu or + * per-thread). We model this by counting allocations (i.e. the number + * of tracers of a certain type) for one type negatively: + * =0 no tracers + * >0 number of per-thread tracers + * <0 number of per-cpu tracers + * + * The below functions to get and put tracers and to check the + * allocation type require the ds_lock to be held by the caller. + * + * Tracers essentially gives the number of ds contexts for a certain + * type of allocation. */ -static inline unsigned long get_bts_buffer_base(char *base) +static long tracers; + +static inline void get_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); + tracers += (task ? 1 : -1); } -static inline void set_bts_buffer_base(char *base, unsigned long value) + +static inline void put_tracer(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; + tracers -= (task ? 
1 : -1); } -static inline unsigned long get_bts_index(char *base) + +static inline int check_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_index.offset); + return (task ? (tracers >= 0) : (tracers <= 0)); } -static inline void set_bts_index(char *base, unsigned long value) + + +/* + * The DS context is either attached to a thread or to a cpu: + * - in the former case, the thread_struct contains a pointer to the + * attached context. + * - in the latter case, we use a static array of per-cpu context + * pointers. + * + * Contexts are use-counted. They are allocated on first access and + * deallocated when the last user puts the context. + * + * We distinguish between an allocating and a non-allocating get of a + * context: + * - the allocating get is used for requesting BTS/PEBS resources. It + * requires the caller to hold the global ds_lock. + * - the non-allocating get is used for all other cases. A + * non-existing context indicates an error. It acquires and releases + * the ds_lock itself for obtaining the context. + * + * A context and its DS configuration are allocated and deallocated + * together. A context always has a DS configuration of the + * appropriate size. + */ +static DEFINE_PER_CPU(struct ds_context *, system_context); + +#define this_system_context per_cpu(system_context, smp_processor_id()) + +/* + * Returns the pointer to the parameter task's context or to the + * system-wide context, if task is NULL. + * + * Increases the use count of the returned context, if not NULL. + */ +static inline struct ds_context *ds_get_context(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; + struct ds_context *context; + + spin_lock(&ds_lock); + + context = (task ? task->thread.ds_ctx : this_system_context); + if (context) + context->count++; + + spin_unlock(&ds_lock); + + return context; } -static inline unsigned long get_bts_absolute_maximum(char *base) + +/* + * Same as ds_get_context, but allocates the context and it's DS + * structure, if necessary; returns NULL; if out of memory. + * + * pre: requires ds_lock to be held + */ +static inline struct ds_context *ds_alloc_context(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); + struct ds_context **p_context = + (task ? &task->thread.ds_ctx : &this_system_context); + struct ds_context *context = *p_context; + + if (!context) { + context = kzalloc(sizeof(*context), GFP_KERNEL); + + if (!context) + return NULL; + + context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + if (!context->ds) { + kfree(context); + return NULL; + } + + *p_context = context; + + context->this = p_context; + context->task = task; + + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + + if (!task || (task == current)) + wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0); + + get_tracer(task); + } + + context->count++; + + return context; } -static inline void set_bts_absolute_maximum(char *base, unsigned long value) + +/* + * Decreases the use count of the parameter context, if not NULL. + * Deallocates the context, if the use count reaches zero. 
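
Nearly every operation later in this file (ds_release(), ds_get_index(), ds_get_end(), ds_access(), ds_write()) wraps its body in the same reference bracket around the context. A condensed sketch of the convention; ds_do_something() is hypothetical, the helpers are the real ones:

/* Kernel-context sketch of the get/validate/put bracket; not
 * compilable on its own. */
static int ds_do_something(struct task_struct *task, enum ds_qualifier qual)
{
        struct ds_context *context;
        int error;

        context = ds_get_context(task);         /* takes a reference */
        error = ds_validate_access(context, qual);
        if (error < 0)
                goto out;

        /* ... read or update context->ds here ... */
        error = 0;
 out:
        ds_put_context(context);                /* drops it; frees on last put */
        return error;
}

ds_release() is the one exception in shape: it puts the context twice, once for its own lookup and once to drop the reference the matching ds_request() left behind.
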
+ */ +static inline void ds_put_context(struct ds_context *context) { - (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; + if (!context) + return; + + spin_lock(&ds_lock); + + if (--context->count) + goto out; + + *(context->this) = NULL; + + if (context->task) + clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + + if (!context->task || (context->task == current)) + wrmsrl(MSR_IA32_DS_AREA, 0); + + put_tracer(context->task); + + /* free any leftover buffers from tracers that did not + * deallocate them properly. */ + kfree(context->buffer[ds_bts]); + kfree(context->buffer[ds_pebs]); + kfree(context->ds); + kfree(context); + out: + spin_unlock(&ds_lock); } -static inline unsigned long get_bts_interrupt_threshold(char *base) + + +/* + * Handle a buffer overflow + * + * task: the task whose buffers are overflowing; + * NULL for a buffer overflow on the current cpu + * context: the ds context + * qual: the buffer type + */ +static void ds_overflow(struct task_struct *task, struct ds_context *context, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); + if (!context) + return; + + if (context->callback[qual]) + (*context->callback[qual])(task); + + /* todo: do some more overflow handling */ } -static inline void set_bts_interrupt_threshold(char *base, unsigned long value) + + +/* + * Allocate a non-pageable buffer of the parameter size. + * Checks the memory and the locked memory rlimit. + * + * Returns the buffer, if successful; + * NULL, if out of memory or rlimit exceeded. + * + * size: the requested buffer size in bytes + * pages (out): if not NULL, contains the number of pages reserved + */ +static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) { - (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; + unsigned long rlim, vm, pgsz; + void *buffer; + + pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + pgsz; + if (rlim < vm) + return NULL; + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + pgsz; + if (rlim < vm) + return NULL; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + return NULL; + + current->mm->total_vm += pgsz; + current->mm->locked_vm += pgsz; + + if (pages) + *pages = pgsz; + + return buffer; } -static inline unsigned long get_from_ip(char *base) + +static int ds_request(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl, enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.from_ip.offset); + struct ds_context *context; + unsigned long buffer, adj; + const unsigned long alignment = (1 << 3); + int error = 0; + + if (!ds_cfg.sizeof_ds) + return -EOPNOTSUPP; + + /* we require some space to do alignment adjustments below */ + if (size < (alignment + ds_cfg.sizeof_rec[qual])) + return -EINVAL; + + /* buffer overflow notification is not yet implemented */ + if (ovfl) + return -EOPNOTSUPP; + + + spin_lock(&ds_lock); + + if (!check_tracer(task)) + return -EPERM; + + error = -ENOMEM; + context = ds_alloc_context(task); + if (!context) + goto out_unlock; + + error = -EALREADY; + if (context->owner[qual] == current) + goto out_unlock; + error = -EPERM; + if (context->owner[qual] != NULL) + goto out_unlock; + context->owner[qual] = current; + + spin_unlock(&ds_lock); + + + error = -ENOMEM; + if (!base) { + base = ds_allocate_buffer(size, &context->pages[qual]); + if (!base) + goto 
out_release; + + context->buffer[qual] = base; + } + error = 0; + + context->callback[qual] = ovfl; + + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; + + adj = ALIGN(buffer, alignment) - buffer; + buffer += adj; + size -= adj; + + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; + + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + + if (ovfl) { + /* todo: select a suitable interrupt threshold */ + } else + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size + 1); + + /* we keep the context until ds_release */ + return error; + + out_release: + context->owner[qual] = NULL; + ds_put_context(context); + return error; + + out_unlock: + spin_unlock(&ds_lock); + ds_put_context(context); + return error; } -static inline void set_from_ip(char *base, unsigned long value) + +int ds_request_bts(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; + return ds_request(task, base, size, ovfl, ds_bts); } -static inline unsigned long get_to_ip(char *base) + +int ds_request_pebs(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - return *(unsigned long *)(base + ds_cfg.to_ip.offset); + return ds_request(task, base, size, ovfl, ds_pebs); } -static inline void set_to_ip(char *base, unsigned long value) + +static int ds_release(struct task_struct *task, enum ds_qualifier qual) { - (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; + struct ds_context *context; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + kfree(context->buffer[qual]); + context->buffer[qual] = NULL; + + current->mm->total_vm -= context->pages[qual]; + current->mm->locked_vm -= context->pages[qual]; + context->pages[qual] = 0; + context->owner[qual] = NULL; + + /* + * we put the context twice: + * once for the ds_get_context + * once for the corresponding ds_request + */ + ds_put_context(context); + out: + ds_put_context(context); + return error; } -static inline unsigned char get_info_type(char *base) + +int ds_release_bts(struct task_struct *task) { - return *(unsigned char *)(base + ds_cfg.info_type.offset); + return ds_release(task, ds_bts); } -static inline void set_info_type(char *base, unsigned char value) + +int ds_release_pebs(struct task_struct *task) { - (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; + return ds_release(task, ds_pebs); } -static inline unsigned long get_info_data(char *base) + +static int ds_get_index(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.info_data.offset); + struct ds_context *context; + unsigned long base, index; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + + error = ((index - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; } -static inline void set_info_data(char *base, unsigned long value) + +int 
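The alignment fix-up in ds_request() above is compact enough to deserve a worked example. This standalone sketch reproduces the arithmetic: move the base up to the next double-word boundary, give up the skipped bytes, then round the remaining size down to a whole number of records (rec_size stands in for ds_cfg.sizeof_rec[qual]):

#include <stdio.h>

#define DS_ALIGNMENT (1UL << 3)	/* double-word, as in ds_request() */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* returns the usable byte count and stores the aligned base */
unsigned long ds_adjust(unsigned long base, unsigned long size,
			unsigned long rec_size, unsigned long *out_base)
{
	unsigned long adj = ALIGN_UP(base, DS_ALIGNMENT) - base;

	base += adj;			/* move base up to the boundary */
	size -= adj;			/* the skipped bytes are lost */
	size = (size / rec_size) * rec_size;	/* whole records only */

	*out_base = base;
	return size;
}

int main(void)
{
	unsigned long base, size;

	/* base 0x1003, 100 bytes, 24-byte BTS records
	 * -> base 0x1008, 3 records = 72 usable bytes */
	size = ds_adjust(0x1003, 100, 24, &base);
	printf("%lu usable bytes at %#lx\n", size, base);
	return 0;
}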
ds_get_bts_index(struct task_struct *task, size_t *pos) { - (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; + return ds_get_index(task, pos, ds_bts); } +int ds_get_pebs_index(struct task_struct *task, size_t *pos) +{ + return ds_get_index(task, pos, ds_pebs); +} -int ds_allocate(void **dsp, size_t bts_size_in_bytes) +static int ds_get_end(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - size_t bts_size_in_records; - unsigned long bts; - void *ds; + struct ds_context *context; + unsigned long base, end; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); + + error = ((end - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +int ds_get_bts_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_bts); +} - if (bts_size_in_bytes < 0) - return -EINVAL; +int ds_get_pebs_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_pebs); +} - bts_size_in_records = - bts_size_in_bytes / ds_cfg.sizeof_bts; - bts_size_in_bytes = - bts_size_in_records * ds_cfg.sizeof_bts; +static int ds_access(struct task_struct *task, size_t index, + const void **record, enum ds_qualifier qual) +{ + struct ds_context *context; + unsigned long base, idx; + int error; - if (bts_size_in_bytes <= 0) + if (!record) return -EINVAL; - bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); - - if (!bts) - return -ENOMEM; + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + base = ds_get(context->ds, qual, ds_buffer_base); + idx = base + (index * ds_cfg.sizeof_rec[qual]); - if (!ds) { - kfree((void *)bts); - return -ENOMEM; - } - - set_bts_buffer_base(ds, bts); - set_bts_index(ds, bts); - set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); - set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); + error = -EINVAL; + if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) + goto out; - *dsp = ds; - return 0; + *record = (const void *)idx; + error = ds_cfg.sizeof_rec[qual]; + out: + ds_put_context(context); + return error; } -int ds_free(void **dsp) +int ds_access_bts(struct task_struct *task, size_t index, const void **record) { - if (*dsp) { - kfree((void *)get_bts_buffer_base(*dsp)); - kfree(*dsp); - *dsp = NULL; - } - return 0; + return ds_access(task, index, record, ds_bts); } -int ds_get_bts_size(void *ds) +int ds_access_pebs(struct task_struct *task, size_t index, const void **record) { - int size_in_bytes; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (!ds) - return 0; - - size_in_bytes = - get_bts_absolute_maximum(ds) - - get_bts_buffer_base(ds); - return size_in_bytes; + return ds_access(task, index, record, ds_pebs); } -int ds_get_bts_end(void *ds) +static int ds_write(struct task_struct *task, const void *record, size_t size, + enum ds_qualifier qual, int force) { - int size_in_bytes = ds_get_bts_size(ds); - - if (size_in_bytes <= 0) - return size_in_bytes; + struct ds_context *context; + int error; - return size_in_bytes / ds_cfg.sizeof_bts; -} + if (!record) + return -EINVAL; -int ds_get_bts_index(void *ds) -{ - int index_offset_in_bytes; + error = -EPERM; + context = 
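ds_access() above is essentially bounds-checked array indexing into the record buffer. A minimal sketch of the same lookup with plain pointers, the DS-area fields becoming ordinary parameters; note that this sketch takes the stricter view that a record must lie entirely inside [base, max), whereas the kernel code compares the record start against ds_absolute_maximum:

#include <stddef.h>

/* translate a record index into a pointer, refusing anything that
 * would reach past the end of the buffer */
const void *record_at(const unsigned char *base, const unsigned char *max,
		      size_t rec_size, size_t index)
{
	const unsigned char *p = base + index * rec_size;

	if (p + rec_size > max)
		return NULL;
	return p;
}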
ds_get_context(task); + if (!context) + goto out; - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; + if (!force) { + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + } - index_offset_in_bytes = - get_bts_index(ds) - - get_bts_buffer_base(ds); + error = 0; + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + goto out; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + error += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(task, context, qual); + } - return index_offset_in_bytes / ds_cfg.sizeof_bts; + out: + ds_put_context(context); + return error; } -int ds_set_overflow(void *ds, int method) +int ds_write_bts(struct task_struct *task, const void *record, size_t size) { - switch (method) { - case DS_O_SIGNAL: - return -EOPNOTSUPP; - case DS_O_WRAP: - return 0; - default: - return -EINVAL; - } + return ds_write(task, record, size, ds_bts, /* force = */ 0); } -int ds_get_overflow(void *ds) +int ds_write_pebs(struct task_struct *task, const void *record, size_t size) { - return DS_O_WRAP; + return ds_write(task, record, size, ds_pebs, /* force = */ 0); } -int ds_clear(void *ds) +int ds_unchecked_write_bts(struct task_struct *task, + const void *record, size_t size) { - int bts_size = ds_get_bts_size(ds); - unsigned long bts_base; - - if (bts_size <= 0) - return bts_size; - - bts_base = get_bts_buffer_base(ds); - memset((void *)bts_base, 0, bts_size); - - set_bts_index(ds, bts_base); - return 0; + return ds_write(task, record, size, ds_bts, /* force = */ 1); } -int ds_read_bts(void *ds, int index, struct bts_struct *out) +int ds_unchecked_write_pebs(struct task_struct *task, + const void *record, size_t size) { - void *bts; + return ds_write(task, record, size, ds_pebs, /* force = */ 1); +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +static int ds_reset_or_clear(struct task_struct *task, + enum ds_qualifier qual, int clear) +{ + struct ds_context *context; + unsigned long base, end; + int error; - if (index < 0) - return -EINVAL; + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - if (index >= ds_get_bts_size(ds)) - return -EINVAL; + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); - bts = (void *)(get_bts_buffer_base(ds) + (index 
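Stripped of the DS-area bookkeeping, the ds_write() loop above is a wrapping ring-buffer copy. A minimal sketch under that reading — the overflow callback, the interrupt threshold, and the zero-padding to record granularity are all omitted, and the buffer is assumed non-empty:

#include <string.h>
#include <stddef.h>

struct ring {
	char *base, *end;	/* buffer is [base, end), end > base */
	char *index;		/* next write position */
};

size_t ring_write(struct ring *r, const void *rec, size_t size)
{
	size_t written = 0;

	while (size) {
		size_t room = (size_t)(r->end - r->index);
		size_t n = size < room ? size : room;

		memcpy(r->index, (const char *)rec + written, n);
		r->index += n;
		written += n;
		size -= n;

		if (r->index >= r->end)	/* wrap, as ds_write() does */
			r->index = r->base;
	}
	return written;
}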
* ds_cfg.sizeof_bts)); + if (clear) + memset((void *)base, 0, end - base); - memset(out, 0, sizeof(*out)); - if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { - out->qualifier = get_info_type(bts); - out->variant.jiffies = get_info_data(bts); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = get_from_ip(bts); - out->variant.lbr.to_ip = get_to_ip(bts); - } + ds_set(context->ds, qual, ds_index, base); - return sizeof(*out);; + error = 0; + out: + ds_put_context(context); + return error; } -int ds_write_bts(void *ds, const struct bts_struct *in) +int ds_reset_bts(struct task_struct *task) { - unsigned long bts; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (ds_get_bts_size(ds) <= 0) - return -ENXIO; + return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); +} - bts = get_bts_index(ds); +int ds_reset_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); +} - memset((void *)bts, 0, ds_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; +int ds_clear_bts(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); +} - case BTS_BRANCH: - set_from_ip((void *)bts, in->variant.lbr.from_ip); - set_to_ip((void *)bts, in->variant.lbr.to_ip); - break; +int ds_clear_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); +} - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); - set_info_type((void *)bts, in->qualifier); - set_info_data((void *)bts, in->variant.jiffies); - break; +int ds_get_pebs_reset(struct task_struct *task, u64 *value) +{ + struct ds_context *context; + int error; - default: + if (!value) return -EINVAL; - } - bts = bts + ds_cfg.sizeof_bts; - if (bts >= get_bts_absolute_maximum(ds)) - bts = get_bts_buffer_base(ds); - set_bts_index(ds, bts); + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; - return ds_cfg.sizeof_bts; + *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + + error = 0; + out: + ds_put_context(context); + return error; } -unsigned long ds_debugctl_mask(void) +int ds_set_pebs_reset(struct task_struct *task, u64 value) { - return ds_cfg.debugctl_mask; -} + struct ds_context *context; + int error; -#ifdef __i386__ -static const struct ds_configuration ds_cfg_netburst = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<2)|(1<<3) -}; + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; -static const struct ds_configuration ds_cfg_pentium_m = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<6)|(1<<7) + *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + + error = 0; + out: + ds_put_context(context); + return error; +} + +static const struct ds_configuration ds_cfg_var = { + .sizeof_ds = sizeof(long) * 12, + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, + .sizeof_rec[ds_pebs] = sizeof(long) * 10 }; -#endif /* _i386_ 
*/ - -static const struct ds_configuration ds_cfg_core2 = { - .sizeof_ds = 9 * 8, - .bts_buffer_base = { 0, 8 }, - .bts_index = { 8, 8 }, - .bts_absolute_maximum = { 16, 8 }, - .bts_interrupt_threshold = { 24, 8 }, - .sizeof_bts = 3 * 8, - .from_ip = { 0, 8 }, - .to_ip = { 8, 8 }, - .info_type = { 8, 1 }, - .info_data = { 16, 8 }, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +static const struct ds_configuration ds_cfg_64 = { + .sizeof_ds = 8 * 12, + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 10 }; static inline void @@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { -#ifdef __i386__ case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ case 0xF: /* Core2 */ - ds_configure(&ds_cfg_core2); + case 0x1C: /* Atom */ + ds_configure(&ds_cfg_64); break; default: /* sorry, don't know about them */ @@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xF: switch (c->x86_model) { -#ifdef __i386__ case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ default: /* sorry, don't know about them */ break; @@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; } } + +void ds_free(struct ds_context *context) +{ + /* This is called when the task owning the parameter context + * is dying. There should not be any user of that context left + * to disturb us, anymore. */ + unsigned long leftovers = context->count; + while (leftovers--) + ds_put_context(context); +} +#endif /* CONFIG_X86_DS */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 000000000000..b3614752197b --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c @@ -0,0 +1,449 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? 
", (void *) address); +} + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + get_bp(bp); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + for (;;) { + struct thread_info *context; + + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + bp = print_context_stack(context, stack, bp, ops, data, NULL); + + stack = (unsigned long *)context->previous_esp; + if (!stack) + break; + if (ops->stack(data, "IRQ") < 0) + break; + touch_nmi_watchdog(); + } +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. 
+ */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (kstack_end(stack)) + break; + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + printk("\n%s", log_lvl); + printk(" %08lx", *stack++); + touch_nmi_watchdog(); + } + printk("\n"); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} + +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + + print_modules(); + __show_regs(regs, 0); + + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. 
+ */ + if (!user_mode_vm(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk(KERN_EMERG "Stack:\n"); + show_stack_log_lvl(NULL, regs, &regs->sp, + 0, KERN_EMERG); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad EIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (ip < PAGE_OFFSET) + return 0; + if (probe_kernel_address((unsigned short *)ip, ud2)) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + unsigned short ss; + unsigned long sp; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (&regs->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (die_nest_count < 3) { + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + } else { + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } + + oops_end(flags, regs, SIGSEGV); +} + +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, let's at least try + * to get a message out: + */ + bust_spinlocks(1); +
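For reference, the core of the unwinder in print_context_stack() earlier in this file, reduced to the pure frame-pointer walk: each frame begins with the caller's saved frame pointer followed by the return address, so following next_frame yields the entries printed as reliable. This sketch replaces the kernel's thread_info bounds test with an explicit [lo, hi) range and caps the depth, since a corrupt chain can loop; the word-by-word scan that produces the "?" entries is omitted:

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static int plausible(const struct stack_frame *f,
		     const void *lo, const void *hi)
{
	return (const void *)f >= lo && (const void *)(f + 1) <= hi;
}

void walk_frames(struct stack_frame *frame, const void *lo, const void *hi)
{
	int depth = 64;		/* guard against looping chains */

	while (depth-- && plausible(frame, lo, hi)) {
		printf("  [<%p>]\n", (void *)frame->return_address);
		frame = frame->next_frame;	/* caller's saved frame pointer */
	}
}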
printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might as well get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + bust_spinlocks(0); + do_exit(SIGSEGV); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 000000000000..96a5db7da8a7 --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c @@ -0,0 +1,575 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ + static char ids[][8] = { + [DEBUG_STACK - 1] = "#DB", + [NMI_STACK - 1] = "NMI", + [DOUBLEFAULT_STACK - 1] = "#DF", + [STACKFAULT_STACK - 1] = "#SS", + [MCE_STACK - 1] = "#MC", +#if DEBUG_STKSZ > EXCEPTION_STKSZ + [N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" +#endif + }; + unsigned k; + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. 
If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; + *idp = ids[k]; + return (unsigned long *)end; + } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { + unsigned j = N_EXCEPTION_STACKS - 1; + + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Don't look: + */ + do { + ++j; + end -= EXCEPTION_STKSZ; + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + } while (stack < end - EXCEPTION_STKSZ); + if (*usedp & (1U << j)) + break; + *usedp |= 1U << j; + *idp = ids[j]; + return (unsigned long *)end; + } +#endif + } + return NULL; +} + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + const unsigned cpu = get_cpu(); + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + unsigned used = 0; + struct thread_info *tinfo; + + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + get_bp(bp); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + /* + * Print function call entries in all stacks, starting at the + * current stack address. 
If the stacks consist of nested + * exceptions + */ + tinfo = task_thread_info(task); + for (;;) { + char *id; + unsigned long *estack_end; + estack_end = in_exception_stack(cpu, (unsigned long)stack, + &used, &id); + + if (estack_end) { + if (ops->stack(data, id) < 0) + break; + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); + ops->stack(data, "<EOE>"); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ + stack = (unsigned long *) estack_end[-2]; + continue; + } + if (irqstack_end) { + unsigned long *irqstack; + irqstack = irqstack_end - + (IRQSTACKSIZE - 64) / sizeof(*irqstack); + + if (stack >= irqstack && stack < irqstack_end) { + if (ops->stack(data, "IRQ") < 0) + break; + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (irqstack_end[-1]); + irqstack_end = NULL; + ops->stack(data, "EOI"); + continue; + } + } + break; + } + + /* + * This handles the process stack: + */ + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + put_cpu(); +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + const int cpu = smp_processor_id(); + unsigned long *irqstack_end = + (unsigned long *) (cpu_pda(cpu)->irqstackptr); + unsigned long *irqstack = + (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); + + /* + * debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu. 
+ */ + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (stack >= irqstack && stack <= irqstack_end) { + if (stack == irqstack_end) { + stack = (unsigned long *) (irqstack_end[-1]); + printk(" <EOI> "); + } + } else { + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + printk("\n%s", log_lvl); + printk(" %016lx", *stack++); + touch_nmi_watchdog(); + } + printk("\n"); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + unsigned long sp; + const int cpu = smp_processor_id(); + struct task_struct *cur = cpu_pda(cpu)->pcurrent; + + sp = regs->sp; + printk("CPU %d ", cpu); + __show_regs(regs, 1); + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk(KERN_EMERG "Stack:\n"); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, KERN_EMERG); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad RIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + die_owner = -1; + bust_spinlocks(0); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. 
*/ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + add_taint(TAINT_DIE); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); + if (kexec_should_crash(current)) + crash_kexec(regs); + return 0; +} + +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (!user_mode(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); +} + +notrace __kprobes void +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + flags = oops_begin(); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + oops_end(flags, NULL, SIGBUS); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 28c29180b380..ce97bf3bed12 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -148,6 +148,9 @@ void __init e820_print_map(char *who) case E820_NVS: printk(KERN_CONT "(ACPI NVS)\n"); break; + case E820_UNUSABLE: + printk("(unusable)\n"); + break; default: printk(KERN_CONT "type %u\n", e820.map[i].type); break; @@ -877,7 +880,8 @@ void __init early_res_to_bootmem(u64 start, u64 end) for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) count++; - printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count, start, end); for (i = 0; i < count; i++) { struct early_res *r = &early_res[i]; printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, @@ -1202,7 +1206,7 @@ static int __init parse_memmap_opt(char *p) if (!p) return -EINVAL; - if (!strcmp(p, "exactmap")) { + if (!strncmp(p, "exactmap", 8)) { #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know @@ -1259,6 +1263,7 @@ static inline const 
char *e820_type_to_string(int e820_type) case E820_RAM: return "System RAM"; case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; default: return "reserved"; } } @@ -1266,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type) /* * Mark e820 reserved areas as busy for the resource manager. */ +static struct resource __initdata *e820_res; void __init e820_reserve_resources(void) { int i; @@ -1273,20 +1279,26 @@ void __init e820_reserve_resources(void) u64 end; res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; -#ifndef CONFIG_RESOURCES_64BIT - if (end > 0x100000000ULL) { + if (end != (resource_size_t)end) { res++; continue; } -#endif res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) + insert_resource(&iomem_resource, res); res++; } @@ -1298,10 +1310,18 @@ void __init e820_reserve_resources(void) } } -/* - * Non-standard memory setup can be specified via this quirk: - */ -char * (*arch_memory_setup_quirk)(void); +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (!res->parent && res->end) + reserve_region_with_split(&iomem_resource, res->start, res->end, res->name); + res++; + } +} char *__init default_machine_specific_memory_setup(void) { @@ -1343,8 +1363,8 @@ char *__init default_machine_specific_memory_setup(void) char *__init __attribute__((weak)) machine_specific_memory_setup(void) { - if (arch_memory_setup_quirk) { - char *who = arch_memory_setup_quirk(); + if (x86_quirks->arch_memory_setup) { + char *who = x86_quirks->arch_memory_setup(); if (who) return who; @@ -1367,24 +1387,3 @@ void __init setup_memory_map(void) printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); } - -#ifdef CONFIG_X86_64 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) -{ - int i; - - if (slot < 0 || slot >= e820.nr_map) - return -1; - for (i = slot; i < e820.nr_map; i++) { - if (e820.map[i].type != E820_RAM) - continue; - break; - } - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) - return -1; - *addr = e820.map[i].addr; - *size = min_t(u64, e820.map[i].size + e820.map[i].addr, - max_pfn << PAGE_SHIFT) - *addr; - return i + 1; -} -#endif diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a0e11c0cc872..3ce029ffaa55 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -16,10 +16,7 @@ #include <asm/dma.h> #include <asm/io_apic.h> #include <asm/apic.h> - -#ifdef CONFIG_GART_IOMMU -#include <asm/gart.h> -#endif +#include <asm/iommu.h> static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -98,6 +95,113 @@ static void __init nvidia_bugs(int num, int slot, int func) } +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) +{ + u32 d; + u8 b; + + b = read_pci_config_byte(num, slot, func, 0xac); + b &= ~(1<<5); + write_pci_config_byte(num, slot, func, 0xac, 
b); + + d = read_pci_config(num, slot, func, 0x70); + d |= 1<<8; + write_pci_config(num, slot, func, 0x70, d); + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + return d; +} + +static void __init ati_bugs(int num, int slot, int func) +{ + u32 d; + u8 b; + + if (acpi_use_timer_override) + return; + + d = ati_ixp4x0_rev(num, slot, func); + if (d < 0x82) + acpi_skip_timer_override = 1; + else { + /* check for IRQ0 interrupt swap */ + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x2)) + acpi_skip_timer_override = 1; + } + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB4X0 revision 0x%x\n", d); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} + +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 old, d; + + d = read_pci_config(num, slot, func, 0x70); + old = d; + d &= ~(1<<8); + write_pci_config(num, slot, func, 0x70, d); + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + write_pci_config(num, slot, func, 0x70, old); + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + if (acpi_use_timer_override) + return; + + rev = ati_sbx00_rev(num, slot, func); + if (rev > 0x13) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + +#ifdef CONFIG_DMAR +static void __init intel_g33_dmar(int num, int slot, int func) +{ + struct acpi_table_header *dmar_tbl; + acpi_status status; + + status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl); + if (ACPI_SUCCESS(status)) { + printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n"); + dmar_disabled = 1; + } +} +#endif + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -117,6 +221,14 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, +#ifdef CONFIG_DMAR + { PCI_VENDOR_ID_INTEL, 0x29c0, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, +#endif {} }; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index ff9e7350da54..34ad997d3834 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -3,11 +3,19 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/screen_info.h> +#include <linux/usb/ch9.h> +#include <linux/pci_regs.h> +#include <linux/pci_ids.h> +#include <linux/errno.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> #include <asm/setup.h> #include <xen/hvc-console.h> +#include <asm/pci-direct.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <linux/usb/ehci_def.h> /* Simple VGA output */ #define VGABASE 
(__ISA_IO_base + 0xb8000) @@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); outb(ch, early_serial_base + TXR); @@ -111,7 +120,7 @@ static __init void early_serial_init(char *s) if (!strncmp(s, "0x", 2)) { early_serial_base = simple_strtoul(s, &e, 16); } else { - static int bases[] = { 0x3f8, 0x2f8 }; + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; @@ -151,6 +160,721 @@ static struct console early_serial_console = { .index = -1, }; +#ifdef CONFIG_EARLY_PRINTK_DBGP + +static struct ehci_caps __iomem *ehci_caps; +static struct ehci_regs __iomem *ehci_regs; +static struct ehci_dbg_port __iomem *ehci_debug; +static unsigned int dbgp_endpoint_out; + +struct ehci_dev { + u32 bus; + u32 slot; + u32 func; +}; + +static struct ehci_dev ehci_dev; + +#define USB_DEBUG_DEVNUM 127 + +#define DBGP_DATA_TOGGLE 0x8800 + +static inline u32 dbgp_pid_update(u32 x, u32 tok) +{ + return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); +} + +static inline u32 dbgp_len_update(u32 x, u32 len) +{ + return (x & ~0x0f) | (len & 0x0f); +} + +/* + * USB Packet IDs (PIDs) + */ + +/* token */ +#define USB_PID_OUT 0xe1 +#define USB_PID_IN 0x69 +#define USB_PID_SOF 0xa5 +#define USB_PID_SETUP 0x2d +/* handshake */ +#define USB_PID_ACK 0xd2 +#define USB_PID_NAK 0x5a +#define USB_PID_STALL 0x1e +#define USB_PID_NYET 0x96 +/* data */ +#define USB_PID_DATA0 0xc3 +#define USB_PID_DATA1 0x4b +#define USB_PID_DATA2 0x87 +#define USB_PID_MDATA 0x0f +/* Special */ +#define USB_PID_PREAMBLE 0x3c +#define USB_PID_ERR 0x3c +#define USB_PID_SPLIT 0x78 +#define USB_PID_PING 0xb4 +#define USB_PID_UNDEF_0 0xf0 + +#define USB_PID_DATA_TOGGLE 0x88 +#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) + +#define PCI_CAP_ID_EHCI_DEBUG 0xa + +#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ +#define HUB_SHORT_RESET_TIME 10 +#define HUB_LONG_RESET_TIME 200 +#define HUB_RESET_TIMEOUT 500 + +#define DBGP_MAX_PACKET 8 + +static int dbgp_wait_until_complete(void) +{ + u32 ctrl; + int loop = 0x100000; + + do { + ctrl = readl(&ehci_debug->control); + /* Stop when the transaction is finished */ + if (ctrl & DBGP_DONE) + break; + } while (--loop > 0); + + if (!loop) + return -1; + + /* + * Now that we have observed the completed transaction, + * clear the done bit. + */ + writel(ctrl | DBGP_DONE, &ehci_debug->control); + return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); +} + +static void dbgp_mdelay(int ms) +{ + int i; + + while (ms--) { + for (i = 0; i < 1000; i++) + outb(0x1, 0x80); + } +} + +static void dbgp_breath(void) +{ + /* Sleep to give the debug port a chance to breathe */ +} + +static int dbgp_wait_until_done(unsigned ctrl) +{ + u32 pids, lpid; + int ret; + int loop = 3; + +retry: + writel(ctrl | DBGP_GO, &ehci_debug->control); + ret = dbgp_wait_until_complete(); + pids = readl(&ehci_debug->pids); + lpid = DBGP_PID_GET(pids); + + if (ret < 0) + return ret; + + /* + * If the port is getting full or it has dropped data + * start pacing ourselves, not necessary but it's friendly. 
+ */ + if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) + dbgp_breath(); + + /* If I get a NACK reissue the transmission */ + if (lpid == USB_PID_NAK) { + if (--loop > 0) + goto retry; + } + + return ret; +} + +static void dbgp_set_data(const void *buf, int size) +{ + const unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = hi = 0; + for (i = 0; i < 4 && i < size; i++) + lo |= bytes[i] << (8*i); + for (; i < 8 && i < size; i++) + hi |= bytes[i] << (8*(i - 4)); + writel(lo, &ehci_debug->data03); + writel(hi, &ehci_debug->data47); +} + +static void dbgp_get_data(void *buf, int size) +{ + unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = readl(&ehci_debug->data03); + hi = readl(&ehci_debug->data47); + for (i = 0; i < 4 && i < size; i++) + bytes[i] = (lo >> (8*i)) & 0xff; + for (; i < 8 && i < size; i++) + bytes[i] = (hi >> (8*(i - 4))) & 0xff; +} + +static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, + const char *bytes, int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_OUT); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + dbgp_set_data(bytes, size); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + return ret; +} + +static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, + int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_IN); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl &= ~DBGP_OUT; + ctrl |= DBGP_GO; + + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + if (size > ret) + size = ret; + dbgp_get_data(data, size); + return ret; +} + +static int dbgp_control_msg(unsigned devnum, int requesttype, int request, + int value, int index, void *data, int size) +{ + u32 pids, addr, ctrl; + struct usb_ctrlrequest req; + int read; + int ret; + + read = (requesttype & USB_DIR_IN) != 0; + if (size > (read ? 
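dbgp_set_data()/dbgp_get_data() above pack the debug port's 8-byte FIFO into the two 32-bit data03/data47 registers, least-significant byte first. The same packing as a standalone pair of helpers, with the MMIO reads and writes factored out:

#include <stdint.h>

/* pack up to 8 bytes into the two 32-bit words the debug port exposes */
static void pack8(const unsigned char *bytes, int size,
		  uint32_t *lo, uint32_t *hi)
{
	int i;

	*lo = *hi = 0;
	for (i = 0; i < 4 && i < size; i++)
		*lo |= (uint32_t)bytes[i] << (8 * i);
	for (; i < 8 && i < size; i++)
		*hi |= (uint32_t)bytes[i] << (8 * (i - 4));
}

/* the inverse: unpack the two words back into a byte buffer */
static void unpack8(unsigned char *bytes, int size, uint32_t lo, uint32_t hi)
{
	int i;

	for (i = 0; i < 4 && i < size; i++)
		bytes[i] = (lo >> (8 * i)) & 0xff;
	for (; i < 8 && i < size; i++)
		bytes[i] = (hi >> (8 * (i - 4))) & 0xff;
}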
DBGP_MAX_PACKET:0)) + return -1; + + /* Compute the control message */ + req.bRequestType = requesttype; + req.bRequest = request; + req.wValue = cpu_to_le16(value); + req.wIndex = cpu_to_le16(index); + req.wLength = cpu_to_le16(size); + + pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); + addr = DBGP_EPADDR(devnum, 0); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, sizeof(req)); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + /* Send the setup message */ + dbgp_set_data(&req, sizeof(req)); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + /* Read the result */ + return dbgp_bulk_read(devnum, 0, data, size); +} + + +/* Find a PCI capability */ +static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) +{ + u8 pos; + int bytes; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) +{ + u32 class; + + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) + return 0; + + return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); +} + +static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) +{ + u32 bus, slot, func; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + unsigned cap; + + cap = __find_dbgp(bus, slot, func); + + if (!cap) + continue; + if (ehci_num-- != 0) + continue; + *rbus = bus; + *rslot = slot; + *rfunc = func; + return cap; + } + } + } + return 0; +} + +static int ehci_reset_port(int port) +{ + u32 portsc; + u32 delay_time, delay; + int loop; + + /* Reset the usb debug port */ + portsc = readl(&ehci_regs->port_status[port - 1]); + portsc &= ~PORT_PE; + portsc |= PORT_RESET; + writel(portsc, &ehci_regs->port_status[port - 1]); + + delay = HUB_ROOT_RESET_TIME; + for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; + delay_time += delay) { + dbgp_mdelay(delay); + + portsc = readl(&ehci_regs->port_status[port - 1]); + if (portsc & PORT_RESET) { + /* force reset to complete */ + loop = 2; + writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), + &ehci_regs->port_status[port - 1]); + do { + portsc = readl(&ehci_regs->port_status[port-1]); + } while ((portsc & PORT_RESET) && (--loop > 0)); + } + + /* Device went away? */ + if (!(portsc & PORT_CONNECT)) + return -ENOTCONN; + + /* bomb out completely if something weird happened */ + if ((portsc & PORT_CSC)) + return -EINVAL; + + /* If we've finished resetting, then break out of the loop */ + if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) + return 0; + } + return -EBUSY; +} + +static int ehci_wait_for_port(int port) +{ + u32 status; + int ret, reps; + + for (reps = 0; reps < 3; reps++) { + dbgp_mdelay(100); + status = readl(&ehci_regs->status); + if (status & STS_PCD) { + ret = ehci_reset_port(port); + if (ret == 0) + return 0; + } + } + return -ENOTCONN; +} + +#ifdef DBGP_DEBUG +# define dbgp_printk early_printk +#else +static inline void dbgp_printk(const char *fmt, ...) 
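find_cap() above is the standard PCI capability-list walk: start at the offset named by PCI_CAPABILITY_LIST, follow the next pointers, and bail after 48 hops or on an all-ones read. A self-contained sketch with the config-space reader supplied as a callback in place of read_pci_config_byte(); the real code also checks the PCI_STATUS_CAP_LIST status bit first, which is omitted here:

#include <stdint.h>

#define PCI_CAPABILITY_LIST 0x34
#define PCI_CAP_LIST_ID     0
#define PCI_CAP_LIST_NEXT   1

/* caller-supplied config-space byte reader */
typedef uint8_t (*cfg_read8_t)(unsigned bus, unsigned slot, unsigned func,
			       unsigned off);

uint8_t find_capability(cfg_read8_t cfg_read8,
			unsigned bus, unsigned slot, unsigned func,
			uint8_t cap)
{
	uint8_t pos = cfg_read8(bus, slot, func, PCI_CAPABILITY_LIST);
	int hops;

	/* valid entries live above the 64-byte header; cap the hops */
	for (hops = 0; hops < 48 && pos >= 0x40; hops++) {
		uint8_t id;

		pos &= ~3;		/* entries are dword aligned */
		id = cfg_read8(bus, slot, func, pos + PCI_CAP_LIST_ID);
		if (id == 0xff)
			break;		/* device returned all-ones */
		if (id == cap)
			return pos;
		pos = cfg_read8(bus, slot, func, pos + PCI_CAP_LIST_NEXT);
	}
	return 0;			/* not found */
}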
+#ifdef DBGP_DEBUG
+# define dbgp_printk early_printk
+#else
+static inline void dbgp_printk(const char *fmt, ...) { }
+#endif
+
+typedef void (*set_debug_port_t)(int port);
+
+static void default_set_debug_port(int port)
+{
+}
+
+static set_debug_port_t set_debug_port = default_set_debug_port;
+
+static void nvidia_set_debug_port(int port)
+{
+	u32 dword;
+	dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+				 0x74);
+	dword &= ~(0x0f<<12);
+	dword |= ((port & 0x0f)<<12);
+	write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
+				 dword);
+	dbgp_printk("set debug port to %d\n", port);
+}
+
+static void __init detect_set_debug_port(void)
+{
+	u32 vendorid;
+
+	vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+		 0x00);
+
+	if ((vendorid & 0xffff) == 0x10de) {
+		dbgp_printk("using nvidia set_debug_port\n");
+		set_debug_port = nvidia_set_debug_port;
+	}
+}
+
+static int __init ehci_setup(void)
+{
+	struct usb_debug_descriptor dbgp_desc;
+	u32 cmd, ctrl, status, portsc, hcs_params;
+	u32 debug_port, new_debug_port = 0, n_ports;
+	u32 devnum;
+	int ret, i;
+	int loop;
+	int port_map_tried;
+	int playtimes = 3;
+
+try_next_time:
+	port_map_tried = 0;
+
+try_next_port:
+
+	hcs_params = readl(&ehci_caps->hcs_params);
+	debug_port = HCS_DEBUG_PORT(hcs_params);
+	n_ports    = HCS_N_PORTS(hcs_params);
+
+	dbgp_printk("debug_port: %d\n", debug_port);
+	dbgp_printk("n_ports:    %d\n", n_ports);
+
+	for (i = 1; i <= n_ports; i++) {
+		portsc = readl(&ehci_regs->port_status[i-1]);
+		dbgp_printk("portstatus%d: %08x\n", i, portsc);
+	}
+
+	if (port_map_tried && (new_debug_port != debug_port)) {
+		if (--playtimes) {
+			set_debug_port(new_debug_port);
+			goto try_next_time;
+		}
+		return -1;
+	}
+
+	loop = 10;
+	/* Reset the EHCI controller */
+	cmd = readl(&ehci_regs->command);
+	cmd |= CMD_RESET;
+	writel(cmd, &ehci_regs->command);
+	do {
+		cmd = readl(&ehci_regs->command);
+	} while ((cmd & CMD_RESET) && (--loop > 0));
+
+	if (!loop) {
+		dbgp_printk("can not reset ehci\n");
+		return -1;
+	}
+	dbgp_printk("ehci reset done\n");
+
+	/* Claim ownership, but do not enable yet */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_OWNER;
+	ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
+	writel(ctrl, &ehci_debug->control);
+
+	/* Start the ehci running */
+	cmd = readl(&ehci_regs->command);
+	cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
+	cmd |= CMD_RUN;
+	writel(cmd, &ehci_regs->command);
+
+	/* Ensure everything is routed to the EHCI */
+	writel(FLAG_CF, &ehci_regs->configured_flag);
+
+	/* Wait until the controller is no longer halted */
+	loop = 10;
+	do {
+		status = readl(&ehci_regs->status);
+	} while ((status & STS_HALT) && (--loop > 0));
+
+	if (!loop) {
+		dbgp_printk("can not start ehci\n");
+		return -1;
+	}
+	dbgp_printk("ehci started\n");
+
+	/* Wait for a device to show up in the debug port */
+	ret = ehci_wait_for_port(debug_port);
+	if (ret < 0) {
+		dbgp_printk("No device found in debug port\n");
+		goto next_debug_port;
+	}
+	dbgp_printk("ehci wait for port done\n");
+
+	/* Enable the debug port */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_CLAIM;
+	writel(ctrl, &ehci_debug->control);
+	ctrl = readl(&ehci_debug->control);
+	if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
+		dbgp_printk("No device in debug port\n");
+		writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
+		goto err;
+	}
+	dbgp_printk("debug port enabled\n");
+
+	/* Completely transfer the debug device to the debug controller */
+	portsc = readl(&ehci_regs->port_status[debug_port - 1]);
+	portsc &= ~PORT_PE;
+	writel(portsc, &ehci_regs->port_status[debug_port - 1]);
+
+	dbgp_mdelay(100);
+
+	/* Find the debug device and make it device number 127 */
+	for (devnum = 0; devnum <= 127; devnum++) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
+			&dbgp_desc, sizeof(dbgp_desc));
+		if (ret > 0)
+			break;
+	}
+	if (devnum > 127) {
+		dbgp_printk("Could not find attached debug device\n");
+		goto err;
+	}
+	if (ret < 0) {
+		dbgp_printk("Attached device is not a debug device\n");
+		goto err;
+	}
+	dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
+
+	/* Move the device to 127 if it isn't already there */
+	if (devnum != USB_DEBUG_DEVNUM) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
+		if (ret < 0) {
+			dbgp_printk("Could not move attached device to %d\n",
+				USB_DEBUG_DEVNUM);
+			goto err;
+		}
+		devnum = USB_DEBUG_DEVNUM;
+		dbgp_printk("debug device renamed to 127\n");
+	}
+
+	/* Enable the debug interface */
+	ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
+		USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+		USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
+	if (ret < 0) {
+		dbgp_printk("Could not enable the debug device\n");
+		goto err;
+	}
+	dbgp_printk("debug interface enabled\n");
+
+	/* Perform a small write to get the even/odd data state in sync */
+	ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
+	if (ret < 0) {
+		dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
+		goto err;
+	}
+	dbgp_printk("small write done\n");
+
+	return 0;
+err:
+	/* Things didn't work so remove my claim */
+	ctrl = readl(&ehci_debug->control);
+	ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
+	writel(ctrl, &ehci_debug->control);
+	return -1;
+
+next_debug_port:
+	port_map_tried |= (1<<(debug_port - 1));
+	new_debug_port = (debug_port % n_ports) + 1;
+	if (port_map_tried != ((1<<n_ports) - 1)) {
+		set_debug_port(new_debug_port);
+		goto try_next_port;
+	}
+	if (--playtimes) {
+		set_debug_port(new_debug_port);
+		goto try_next_time;
+	}
+
+	return -1;
+}
+
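early_dbgp_init() below starts by decoding the EHCI Debug-Port PCI capability word: bits 31:29 select which BAR holds the register window (1-based) and bits 27:16 give the byte offset inside it. A standalone sketch of that decode, using a synthetic register value:

    #include <stdio.h>
    #include <stdint.h>

    /* Decode an EHCI Debug-Port capability register the way
     * early_dbgp_init() does; the value here is made up. */
    int main(void)
    {
    	uint32_t debug_port = 0x20a0200a;
    	uint32_t bar    = (debug_port >> 29) & 0x7;	/* BAR number, 1-based */
    	uint32_t offset = (debug_port >> 16) & 0xfff;	/* window offset */

    	/* BAR #1 maps to config-space offset 0x10 (PCI_BASE_ADDRESS_0) */
    	printf("BAR #%u (cfg offset 0x%02x), window offset 0x%03x\n",
    	       bar, (bar * 4) + 0xc, offset);
    	return 0;
    }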
+static int __init early_dbgp_init(char *s)
+{
+	u32 debug_port, bar, offset;
+	u32 bus, slot, func, cap;
+	void __iomem *ehci_bar;
+	u32 dbgp_num;
+	u32 bar_val;
+	char *e;
+	int ret;
+	u8 byte;
+
+	if (!early_pci_allowed())
+		return -1;
+
+	dbgp_num = 0;
+	if (*s)
+		dbgp_num = simple_strtoul(s, &e, 10);
+	dbgp_printk("dbgp_num: %d\n", dbgp_num);
+
+	cap = find_dbgp(dbgp_num, &bus, &slot, &func);
+	if (!cap)
+		return -1;
+
+	dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
+		 func);
+
+	debug_port = read_pci_config(bus, slot, func, cap);
+	bar = (debug_port >> 29) & 0x7;
+	bar = (bar * 4) + 0xc;
+	offset = (debug_port >> 16) & 0xfff;
+	dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
+	if (bar != PCI_BASE_ADDRESS_0) {
+		dbgp_printk("only debug ports on bar 1 handled.\n");
+
+		return -1;
+	}
+
+	bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+	dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
+	if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
+		dbgp_printk("only simple 32bit mmio bars supported\n");
+
+		return -1;
+	}
+
+	/* double check if the mem space is enabled */
+	byte = read_pci_config_byte(bus, slot, func, 0x04);
+	if (!(byte & 0x2)) {
+		byte |= 0x02;
+		write_pci_config_byte(bus, slot, func, 0x04, byte);
+		dbgp_printk("mmio for ehci enabled\n");
+	}
+
+	/*
+	 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
+	 * than enough.  1K is the biggest I have seen.
+	 */
+	set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
+	ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
+	ehci_bar += bar_val & ~PAGE_MASK;
+	dbgp_printk("ehci_bar: %p\n", ehci_bar);
+
+	ehci_caps  = ehci_bar;
+	ehci_regs  = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
+	ehci_debug = ehci_bar + offset;
+	ehci_dev.bus = bus;
+	ehci_dev.slot = slot;
+	ehci_dev.func = func;
+
+	detect_set_debug_port();
+
+	ret = ehci_setup();
+	if (ret < 0) {
+		dbgp_printk("ehci_setup failed\n");
+		ehci_debug = NULL;
+
+		return -1;
+	}
+
+	return 0;
+}
+
+static void early_dbgp_write(struct console *con, const char *str, u32 n)
+{
+	int chunk, ret;
+
+	if (!ehci_debug)
+		return;
+	while (n > 0) {
+		chunk = n;
+		if (chunk > DBGP_MAX_PACKET)
+			chunk = DBGP_MAX_PACKET;
+		ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
+				      dbgp_endpoint_out, str, chunk);
+		str += chunk;
+		n -= chunk;
+	}
+}
+
+static struct console early_dbgp_console = {
+	.name =		"earlydbg",
+	.write =	early_dbgp_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+#endif
+
 /* Console interface to a host file on AMD's SimNow! */
 
 static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
 static noinline long simnow(long cmd, long a, long b, long c)
 {
 	long ret;
+
 	asm volatile("cpuid" :
 		     "=a" (ret) :
 		     "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
 static void __init simnow_init(char *str)
 {
 	char *fn = "klog";
+
 	if (*str == '=')
 		fn = ++str;
 	/* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
 /* Direct interface for emergencies */
 static struct console *early_console = &early_vga_console;
-static int early_console_initialized;
+static int __initdata early_console_initialized;
 
 asmlinkage void early_printk(const char *fmt, ...)
 {
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
va_end(ap); } -static int __initdata keep_early; static int __init setup_early_printk(char *buf) { + int keep_early; + if (!buf) return 0; @@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - if (strstr(buf, "keep")) - keep_early = 1; + keep_early = (strstr(buf, "keep") != NULL); if (!strncmp(buf, "serial", 6)) { early_serial_init(buf + 6); @@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf) simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_EARLY_PRINTK_DBGP + } else if (!strncmp(buf, "dbgp", 4)) { + if (early_dbgp_init(buf+4) < 0) + return 0; + early_console = &early_dbgp_console; + /* + * usb subsys will reset ehci controller, so don't keep + * that early console + */ + keep_early = 0; +#endif #ifdef CONFIG_HVC_XEN } else if (!strncmp(buf, "xen", 3)) { early_console = &xenboot_console; @@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf) register_console(early_console); return 0; } + early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 06cc8d4254b1..1119d247fe11 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -367,6 +367,10 @@ void __init efi_init(void) efi.smbios = config_tables[i].table; printk(" SMBIOS=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, + UV_SYSTEM_TABLE_GUID)) { + efi.uv_systab = config_tables[i].table; + printk(" UVsystab=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID)) { efi.hcdp = config_tables[i].table; printk(" HCDP=0x%lx ", config_tables[i].table); @@ -414,9 +418,11 @@ void __init efi_init(void) if (memmap.map == NULL) printk(KERN_ERR "Could not map the EFI memory map!\n"); memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING "Kernel-defined memdesc" - "doesn't match the one from EFI!\n"); + printk(KERN_WARNING + "Kernel-defined memdesc doesn't match the one from EFI!\n"); + if (add_efi_memmap) do_add_efi_memmap(); diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index 4b63c8e1f13b..5cab48ee61a4 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -53,7 +53,7 @@ void efi_call_phys_prelog(void) * directory. If I have PAE, I just need to duplicate one entry in * page directory. */ - cr4 = read_cr4(); + cr4 = read_cr4_safe(); if (cr4 & X86_CR4_PAE) { efi_bak_pg_dir_pointer[0].pgd = @@ -91,7 +91,7 @@ void efi_call_phys_epilog(void) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); - cr4 = read_cr4(); + cr4 = read_cr4_safe(); if (cr4 & X86_CR4_PAE) { swapper_pg_dir[pgd_index(0)].pgd = diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6bc07f0f1202..dd65143941a8 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,16 @@ #include <asm/ftrace.h> #include <asm/irq_vectors.h> +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + /* * We use macros for low-level operations which need to be overridden * for paravirtualization. 
The following will never clobber any registers: @@ -332,8 +342,9 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) @@ -343,7 +354,8 @@ sysenter_past_esp: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx - jne syscall_exit_work + jne sysexit_audit +sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -351,6 +363,45 @@ sysenter_past_esp: TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call audit_syscall_entry + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -370,7 +421,7 @@ ENTRY(system_call) GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -383,10 +434,6 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit - jz no_singlestep - orl $_TIF_SINGLESTEP,TI_flags(%ebp) -no_singlestep: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work @@ -514,12 +561,8 @@ END(work_pending) syscall_trace_entry: movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax - xorl %edx,%edx - call do_syscall_trace - cmpl $0, %eax - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, - # so must skip actual syscall - movl PT_ORIG_EAX(%esp), %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. 
*/ cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -528,14 +571,13 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + testb $_TIF_WORK_SYSCALL_EXIT, %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call # schedule() instead movl %esp, %eax - movl $1, %edx - call do_syscall_trace + call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) CFI_ENDPROC @@ -587,7 +629,7 @@ ENTRY(interrupt) ENTRY(irq_entries_start) RING0_INT_FRAME vector=0 -.rept NR_IRQS +.rept NR_VECTORS ALIGN .if vector CFI_ADJUST_CFA_OFFSET -4 @@ -688,6 +730,7 @@ error_code: movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es + TRACE_IRQS_OFF movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception @@ -718,20 +761,9 @@ ENTRY(device_not_available) RING0_INT_FRAME pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - GET_CR0_INTO_EAX - testl $0x4, %eax # EM (math emulation bit) - jne device_not_available_emulate - preempt_stop(CLBR_ANY) - call math_state_restore - jmp ret_from_exception -device_not_available_emulate: - pushl $0 # temporary storage for ORIG_EIP + pushl $do_device_not_available CFI_ADJUST_CFA_OFFSET 4 - call math_emulate - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 - jmp ret_from_exception + jmp error_code CFI_ENDPROC END(device_not_available) @@ -772,6 +804,7 @@ debug_stack_correct: pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug @@ -816,6 +849,7 @@ nmi_stack_correct: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi @@ -856,6 +890,7 @@ nmi_espfix_stack: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi @@ -886,6 +921,7 @@ KPROBE_ENTRY(int3) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 @@ -1024,6 +1060,7 @@ ENDPROC(kernel_thread_helper) ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ + CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp CFI_ENDPROC @@ -1116,20 +1153,6 @@ ENDPROC(xen_failsafe_callback) #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - subl $MCOUNT_INSN_SIZE, %eax - -.globl mcount_call -mcount_call: - call ftrace_stub - - popl %edx - popl %ecx - popl %eax - ret END(mcount) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c340..09e7145484c5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -53,37 +53,17 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. 
*/ +#include <linux/elf-em.h> +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + .code64 #ifdef CONFIG_FTRACE #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - - movq 0x38(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - -.globl mcount_call -mcount_call: - call ftrace_stub - - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp - retq END(mcount) @@ -269,9 +249,9 @@ ENTRY(native_usergs_sysret64) ENTRY(ret_from_fork) CFI_DEFAULT_STACK push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 4 + CFI_ADJUST_CFA_OFFSET 8 popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -4 + CFI_ADJUST_CFA_OFFSET -8 call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) @@ -349,9 +329,9 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys +system_call_fastpath: cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx @@ -403,16 +383,16 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - - /* Really a signal */ +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) @@ -423,14 +403,56 @@ badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq %rax,%rsi /* second arg, syscall return value */ + cmpq $0,%rax /* is it < 0? 
*/ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + /* Do syscall tracing */ tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + jz auditsys +#endif SAVE_REST movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ FIXUP_TOP_OF_STACK %rdi movq %rsp,%rdi call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 RESTORE_REST cmpq $__NR_syscall_max,%rax ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ @@ -444,6 +466,7 @@ tracesys: * Has correct top of stack, but partial stack frame. */ .globl int_ret_from_sys_call + .globl int_with_check int_ret_from_sys_call: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -483,7 +506,7 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -491,7 +514,7 @@ int_very_careful: call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest int_signal: @@ -618,6 +641,13 @@ END(stub_rt_sigreturn) SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler pushq %rbp + /* + * Save rbp twice: One is for marking the stack frame, as usual, and the + * other, to fill pt_regs properly. This is because bx comes right + * before the last saved register in that structure, and not bp. If the + * base pointer were in the place bx is today, this would not be needed. 
+	 */
+	movq %rbp, -8(%rsp)
 	CFI_ADJUST_CFA_OFFSET 8
 	CFI_REL_OFFSET rbp, 0
 	movq %rsp,%rbp
@@ -883,6 +913,9 @@ END(spurious_interrupt)
 	.if \ist
 	movq	%gs:pda_data_offset, %rbp
 	.endif
+	.if \irqtrace
+	TRACE_IRQS_OFF
+	.endif
 	movq %rsp,%rdi
 	movq ORIG_RAX(%rsp),%rsi
 	movq $-1,ORIG_RAX(%rsp)
@@ -1009,7 +1042,8 @@ KPROBE_ENTRY(error_entry)
 	je  error_kernelspace
 error_swapgs:
 	SWAPGS
-error_sti:
+error_sti:
+	TRACE_IRQS_OFF
 	movq %rdi,RDI(%rsp)
 	CFI_REL_OFFSET	rdi,RDI
 	movq %rsp,%rdi
@@ -1183,12 +1217,13 @@ ENTRY(simd_coprocessor_error)
 END(simd_coprocessor_error)
 
 ENTRY(device_not_available)
-	zeroentry math_state_restore
+	zeroentry do_device_not_available
 END(device_not_available)
 
 	/* runs on exception stack */
 KPROBE_ENTRY(debug)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1233,7 @@ KPROBE_END(debug)
 	/* runs on exception stack */
 KPROBE_ENTRY(nmi)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $-1
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_nmi, 0, 0
@@ -1211,6 +1247,7 @@ KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1274,7 @@ END(coprocessor_segment_overrun)
 	/* runs on exception stack */
 ENTRY(double_fault)
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_double_fault
 	jmp paranoid_exit1
 	CFI_ENDPROC
@@ -1253,6 +1291,7 @@ END(segment_not_present)
 	/* runs on exception stack */
 ENTRY(stack_segment)
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_stack_segment
 	jmp paranoid_exit1
 	CFI_ENDPROC
@@ -1278,6 +1317,7 @@ END(spurious_interrupt_bug)
 	/* runs on exception stack */
 ENTRY(machine_check)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_machine_check
@@ -1312,3 +1352,103 @@ KPROBE_ENTRY(ignore_sysret)
 	sysret
 	CFI_ENDPROC
 ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
+	CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
+   see the correct pointer to the pt_regs */
+	movq %rdi, %rsp            # we don't return, adjust the stack frame
+	CFI_ENDPROC
+	CFI_DEFAULT_STACK
+11:	incl %gs:pda_irqcount
+	movq %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
+	cmovzq %gs:pda_irqstackptr,%rsp
+	pushq %rbp			# backlink for old unwinder
+	call xen_evtchn_do_upcall
+	popq %rsp
+	CFI_DEF_CFA_REGISTER rsp
+	decl %gs:pda_irqcount
+	jmp  error_exit
+	CFI_ENDPROC
+END(xen_do_hypervisor_callback)
+
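The incl/cmovzq sequence in xen_do_hypervisor_callback() is a nesting guard: only the outermost activation switches %rsp to the per-CPU irq stack, while nested activations keep the stack they arrived on. The same idea in portable C (irq_nest_count/irq_stack are stand-ins for the pda fields; the kernel biases its counter to -1 so the flags set by incl can drive cmovzq directly, whereas this model just uses a zero-based count):

    #include <stdio.h>

    static int irq_nest_count;	/* mirrors %gs:pda_irqcount, zero-based here */
    static char irq_stack[8192];	/* mirrors the per-CPU irq stack */

    static char *enter_irq(char *cur_sp)
    {
    	/* Only the first entry moves to the irq stack; re-entries must
    	 * stay put or they would clobber the live frames below them. */
    	if (irq_nest_count++ == 0)
    		return irq_stack + sizeof(irq_stack);
    	return cur_sp;
    }

    static void leave_irq(void)
    {
    	irq_nest_count--;
    }

    int main(void)
    {
    	char dummy;
    	char *outer = enter_irq(&dummy);	/* switches stacks */
    	char *inner = enter_irq(outer);		/* nested: unchanged */

    	printf("switched=%d nested_same=%d\n",
    	       outer == irq_stack + sizeof(irq_stack), inner == outer);
    	leave_irq();
    	leave_irq();
    	return 0;
    }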
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we are in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+	framesz = (RIP-0x30)	/* workaround buggy gas */
+	_frame framesz
+	CFI_REL_OFFSET rcx, 0
+	CFI_REL_OFFSET r11, 8
+	movw %ds,%cx
+	cmpw %cx,0x10(%rsp)
+	CFI_REMEMBER_STATE
+	jne 1f
+	movw %es,%cx
+	cmpw %cx,0x18(%rsp)
+	jne 1f
+	movw %fs,%cx
+	cmpw %cx,0x20(%rsp)
+	jne 1f
+	movw %gs,%cx
+	cmpw %cx,0x28(%rsp)
+	jne 1f
+	/* All segments match their saved values => Category 2 (Bad IRET). */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %r11
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %rcx
+	CFI_ADJUST_CFA_OFFSET 8
+	jmp general_protection
+	CFI_RESTORE_STATE
+1:	/* Segment mismatch => Category 1 (Bad segment).  Retry the IRET. */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	SAVE_ALL
+	jmp error_exit
+	CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
index 4354ce804889..f454c78fcef6 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,94 @@
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include <asm/apicdef.h>
-#include "es7000.h"
 #include <mach_mpparse.h>
 
 /*
+ * ES7000 chipsets
+ */
+
+#define NON_UNISYS		0
+#define ES7000_CLASSIC		1
+#define ES7000_ZORRO		2
+
+
+#define	MIP_REG			1
+#define	MIP_PSAI_REG		4
+
+#define	MIP_BUSY		1
+#define	MIP_SPIN		0xf0000
+#define	MIP_VALID		0x0100000000000000ULL
+#define	MIP_PORT(VALUE)		((VALUE >> 32) & 0xffff)
+
+#define	MIP_RD_LO(VALUE)	(VALUE & 0xffffffff)
+
+struct mip_reg_info {
+	unsigned long long mip_info;
+	unsigned long long delivery_info;
+	unsigned long long host_reg;
+	unsigned long long mip_reg;
+};
+
+struct part_info {
+	unsigned char type;
+	unsigned char length;
+	unsigned char part_id;
+	unsigned char apic_mode;
+	unsigned long snum;
+	char ptype[16];
+	char sname[64];
+	char pname[64];
+};
+
+struct psai {
+	unsigned long long entry_type;
+	unsigned long long addr;
+	unsigned long long bep_addr;
+};
+
+struct es7000_mem_info {
+	unsigned char type;
+	unsigned char length;
+	unsigned char resv[6];
+	unsigned long long start;
+	unsigned long long size;
+};
+
+struct es7000_oem_table {
+	unsigned long long hdr;
+	struct mip_reg_info mip;
+	struct part_info pif;
+	struct es7000_mem_info shm;
+	struct psai psai;
+};
+
+#ifdef CONFIG_ACPI
+
+struct oem_table {
+	struct acpi_table_header Header;
+	u32 OEMTableAddr;
+	u32 OEMTableSize;
+};
+
+extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
+extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
+#endif
+
+struct mip_reg {
+	unsigned long long off_0;
+	unsigned long long off_8;
+	unsigned long long off_10;
+	unsigned long long off_18;
+	unsigned long long off_20;
+	unsigned long long off_28;
+	unsigned long long off_30;
+	unsigned long long off_38;
+};
+
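The MIP_* accessors above slice fields out of a single 64-bit MIP value: a valid bit at bit 56, the port in bits 47:32, and the read payload in the low dword. A small sketch exercising those masks on a synthetic value (only the masks come from the code above; the value itself is made up):

    #include <stdio.h>
    #include <stdint.h>

    #define MIP_VALID	0x0100000000000000ULL
    #define MIP_PORT(v)	(((v) >> 32) & 0xffff)
    #define MIP_RD_LO(v)	((v) & 0xffffffff)

    int main(void)
    {
    	uint64_t v = MIP_VALID | (0x0fa2ULL << 32) | 0x12345678ULL;

    	printf("valid=%d port=0x%04x lo=0x%08x\n",
    	       !!(v & MIP_VALID),
    	       (unsigned)MIP_PORT(v),
    	       (unsigned)MIP_RD_LO(v));
    	return 0;
    }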
+#define MIP_SW_APIC 0x1020b +#define MIP_FUNC(VALUE) (VALUE & 0xff) + +/* * ES7000 Globals */ @@ -72,7 +156,7 @@ es7000_rename_gsi(int ioapic, int gsi) base += nr_ioapic_registers[i]; } - if (!ioapic && (gsi < 16)) + if (!ioapic && (gsi < 16)) gsi += base; return gsi; } @@ -130,10 +214,10 @@ parse_unisys_oem (char *oemptr) mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); - Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", - (unsigned long)host_reg); - Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", - (unsigned long)mip_reg); + pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + (unsigned long)host_reg); + pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + (unsigned long)mip_reg); success++; break; case MIP_PSAI_REG: @@ -160,21 +244,38 @@ parse_unisys_oem (char *oemptr) } #ifdef CONFIG_ACPI -int __init -find_unisys_acpi_oem_table(unsigned long *oem_addr) +static unsigned long oem_addrX; +static unsigned long oem_size; +int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) { struct acpi_table_header *header = NULL; int i = 0; - while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { + acpi_size tbl_size; + + while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) { if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { struct oem_table *t = (struct oem_table *)header; - *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, - t->OEMTableSize); + + oem_addrX = t->OEMTableAddr; + oem_size = t->OEMTableSize; + early_acpi_os_unmap_memory(header, tbl_size); + + *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, + oem_size); return 0; } + early_acpi_os_unmap_memory(header, tbl_size); } return -1; } + +void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) +{ + if (!oem_addr) + return; + + __acpi_unmap_table((char *)oem_addr, oem_size); +} #endif static void diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ab115cd15fdf..d073d981a730 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -11,17 +11,18 @@ #include <linux/spinlock.h> #include <linux/hardirq.h> +#include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/list.h> -#include <asm/alternative.h> #include <asm/ftrace.h> +#include <asm/nops.h> /* Long is fine, even if it is only 4 bytes ;-) */ -static long *ftrace_nop; +static unsigned long *ftrace_nop; union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -60,11 +61,7 @@ notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned replaced; - unsigned old = *(unsigned *)old_code; /* 4 bytes */ - unsigned new = *(unsigned *)new_code; /* 4 bytes */ - unsigned char newch = new_code[4]; - int faulted = 0; + unsigned char replaced[MCOUNT_INSN_SIZE]; /* * Note: Due to modules and __init, code can @@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, * as well as code changing. * * No real locking needed, this code is run through - * kstop_machine. + * kstop_machine, or before SMP starts. 
 */
-	asm volatile (
-		"1: lock\n"
-		"   cmpxchg %3, (%2)\n"
-		"   jnz 2f\n"
-		"   movb %b4, 4(%2)\n"
-		"2:\n"
-		".section .fixup, \"ax\"\n"
-		"3:	movl $1, %0\n"
-		"	jmp 2b\n"
-		".previous\n"
-		_ASM_EXTABLE(1b, 3b)
-		: "=r"(faulted), "=a"(replaced)
-		: "r"(ip), "r"(new), "c"(newch),
-		  "0"(faulted), "a"(old)
-		: "memory");
-	sync_core();
+	if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
+		return 1;
+
+	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+		return 2;
 
-	if (replaced != old && replaced != new)
-		faulted = 2;
+	WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
+					     MCOUNT_INSN_SIZE));
 
-	return faulted;
+	sync_core();
+
+	return 0;
 }
 
 notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 
 notrace int ftrace_mcount_set(unsigned long *data)
 {
-	unsigned long ip = (long)(&mcount_call);
-	unsigned long *addr = data;
-	unsigned char old[MCOUNT_INSN_SIZE], *new;
-
-	/*
-	 * Replace the mcount stub with a pointer to the
-	 * ip recorder function.
-	 */
-	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
-	new = ftrace_call_replace(ip, *addr);
-	*addr = ftrace_modify_code(ip, old, new);
-
+	/* mcount is initialized as a nop */
+	*data = 0;
 	return 0;
 }
 
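ftrace_modify_code() above replaces the old cmpxchg dance with a plain verify-then-patch contract: read the live bytes back, refuse to touch the site unless they match the expected old code, and only then install the new instruction. A user-space model of that contract (plain memcpy stands in for the __copy_*_inatomic() pair; the kernel additionally relies on kstop_machine for exclusion):

    #include <stdio.h>
    #include <string.h>

    #define INSN_SIZE 5	/* stands in for MCOUNT_INSN_SIZE */

    /* Return codes mirror ftrace_modify_code(): 2 = unexpected bytes,
     * 0 = patched (the unreadable case, 1, is not modeled here). */
    static int patch_site(unsigned char *site, const unsigned char *old,
    		      const unsigned char *new)
    {
    	unsigned char cur[INSN_SIZE];

    	memcpy(cur, site, INSN_SIZE);	/* read back what is really live */
    	if (memcmp(cur, old, INSN_SIZE) != 0)
    		return 2;		/* not what we expected: bail out */
    	memcpy(site, new, INSN_SIZE);	/* safe to install the new insn */
    	return 0;
    }

    int main(void)
    {
    	unsigned char text[INSN_SIZE] = { 0xe8, 1, 2, 3, 4 };	/* a "call" */
    	const unsigned char old[INSN_SIZE] = { 0xe8, 1, 2, 3, 4 };
    	const unsigned char nop[INSN_SIZE] = { 0x0f, 0x1f, 0x44, 0, 0 };

    	printf("first:  %d\n", patch_site(text, old, nop));	/* 0 */
    	printf("second: %d\n", patch_site(text, old, nop));	/* 2 */
    	return 0;
    }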
 int __init ftrace_dyn_arch_init(void *data)
 {
-	const unsigned char *const *noptable = find_nop_table();
-
-	/* This is running in kstop_machine */
-
-	ftrace_mcount_set(data);
+	extern const unsigned char ftrace_test_p6nop[];
+	extern const unsigned char ftrace_test_nop5[];
+	extern const unsigned char ftrace_test_jmp[];
+	int faulted = 0;
 
-	ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+	/*
+	 * There is no good nop for all x86 archs.
+	 * We will default to using the P6_NOP5, but first we
+	 * will test to make sure that the nop will actually
+	 * work on this CPU. If it faults, we will then
+	 * go to a less efficient 5 byte nop. If that fails
+	 * we then just use a jmp as our nop. This isn't the most
+	 * efficient nop, but we cannot use a multi-part nop
+	 * since we would then risk being preempted in the middle
+	 * of that nop, and if we enabled tracing then, it might
+	 * cause a system crash.
+	 *
+	 * TODO: check the cpuid to determine the best nop.
+	 */
+	asm volatile (
+		"jmp ftrace_test_jmp\n"
+		/* This code needs to stay around */
+		".section .text, \"ax\"\n"
+		"ftrace_test_jmp:"
+		"jmp ftrace_test_p6nop\n"
+		"nop\n"
+		"nop\n"
+		"nop\n"  /* 2 byte jmp + 3 bytes */
+		"ftrace_test_p6nop:"
+		P6_NOP5
+		"jmp 1f\n"
+		"ftrace_test_nop5:"
+		".byte 0x66,0x66,0x66,0x66,0x90\n"
+		"jmp 1f\n"
+		".previous\n"
+		"1:"
+		".section .fixup, \"ax\"\n"
+		"2:	movl $1, %0\n"
+		"	jmp ftrace_test_nop5\n"
+		"3:	movl $2, %0\n"
+		"	jmp 1b\n"
+		".previous\n"
+		_ASM_EXTABLE(ftrace_test_p6nop, 2b)
+		_ASM_EXTABLE(ftrace_test_nop5, 3b)
+		: "=r"(faulted) : "0" (faulted));
+
+	switch (faulted) {
+	case 0:
+		pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+		ftrace_nop = (unsigned long *)ftrace_test_p6nop;
+		break;
+	case 1:
+		pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+		ftrace_nop = (unsigned long *)ftrace_test_nop5;
+		break;
+	case 2:
+		pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+		ftrace_nop = (unsigned long *)ftrace_test_jmp;
+		break;
+	}
+
+	/* The return code is returned via data */
+	*(unsigned long *)data = 0;
 	return 0;
 }
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd217..6c9bfc9e1e95 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,86 +16,63 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/hardirq.h>
+#include <linux/dmar.h>
 
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
 
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+extern struct genapic apic_x2apic_uv_x;
+extern struct genapic apic_x2apic_phys;
+extern struct genapic apic_x2apic_cluster;
 
 struct genapic __read_mostly *genapic = &apic_flat;
 
-static enum uv_system_type uv_system_type;
+static struct genapic *apic_probe[] __initdata = {
+	&apic_x2apic_uv_x,
+	&apic_x2apic_phys,
+	&apic_x2apic_cluster,
+	&apic_physflat,
+	NULL,
+};
 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
 void __init setup_apic_routing(void)
 {
-	if (uv_system_type == UV_NON_UNIQUE_APIC)
-		genapic = &apic_x2apic_uv_x;
-	else
-#ifdef CONFIG_ACPI
-	/*
-	 * Quirk: some x86_64 machines can only use physical APIC mode
-	 * regardless of how many processors are present (x86_64 ES7000
-	 * is an example).
-	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
-			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
-		genapic = &apic_physflat;
-	else
-#endif
-
-	if (max_physical_apicid < 8)
-		genapic = &apic_flat;
-	else
-		genapic = &apic_physflat;
+	if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
+		if (!intr_remapping_enabled)
+			genapic = &apic_flat;
+	}
 
-	printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+	if (genapic == &apic_flat) {
+		if (max_physical_apicid >= 8)
+			genapic = &apic_physflat;
+		printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+	}
 }
 
 /* Same for both flat and physical.
*/ -void send_IPI_self(int vector) +void apic_send_IPI_self(int vector) { __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strcmp(oem_id, "SGI")) { - if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - else if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - else if (!strcmp(oem_table_id, "UVH")) - uv_system_type = UV_NON_UNIQUE_APIC; + int i; + + for (i = 0; apic_probe[i]; ++i) { + if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { + genapic = apic_probe[i]; + printk(KERN_INFO "Setting APIC routing to %s.\n", + genapic->name); + return 1; + } } return 0; } - -unsigned int read_apic_id(void) -{ - unsigned int id; - - WARN_ON(preemptible() && num_online_cpus() > 1); - id = apic_read(APIC_ID); - if (uv_system_type >= UV_X2APIC) - id |= __get_cpu_var(x2apic_extra_bits); - return id; -} - -enum uv_system_type get_uv_system_type(void) -{ - return uv_system_type; -} - -int is_uv_system(void) -{ - return uv_system_type != UV_NONE; -} diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 1a9c68845ee8..c0262791bda4 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -15,9 +15,20 @@ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/init.h> +#include <linux/hardirq.h> #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <mach_apicdef.h> + +#ifdef CONFIG_ACPI +#include <acpi/acpi_bus.h> +#endif + +static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 1; +} static cpumask_t flat_target_cpus(void) { @@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector) __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = (((x)>>24) & 0xFFu); + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xFFu)<<24); + return x; +} + +static unsigned int read_xapic_id(void) +{ + unsigned int id; + + id = get_apic_id(apic_read(APIC_ID)); + return id; +} + static int flat_apic_id_registered(void) { - return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); + return physid_isset(read_xapic_id(), phys_cpu_present_map); } static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) @@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb) struct genapic apic_flat = { .name = "flat", + .acpi_madt_oem_check = flat_acpi_madt_oem_check, .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), .target_cpus = flat_target_cpus, @@ -121,8 +157,12 @@ struct genapic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_allbutself = flat_send_IPI_allbutself, .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFu<<24), }; /* @@ -130,6 +170,23 @@ struct genapic apic_flat = { * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ +static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +#ifdef CONFIG_ACPI + /* + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). 
+ */ + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "system APIC only can use physical flat"); + return 1; + } +#endif + + return 0; +} static cpumask_t physflat_target_cpus(void) { @@ -168,7 +225,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; @@ -176,6 +233,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) struct genapic apic_physflat = { .name = "physical flat", + .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = physflat_target_cpus, @@ -185,6 +243,10 @@ struct genapic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_mask = physflat_send_IPI_mask, + .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFu<<24), }; diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c new file mode 100644 index 000000000000..f6a2c8eb48a6 --- /dev/null +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -0,0 +1,159 @@ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/dmar.h> + +#include <asm/smp.h> +#include <asm/ipi.h> +#include <asm/genapic.h> + +DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (cpu_has_x2apic) + return 1; + + return 0; +} + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ + +static cpumask_t x2apic_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +/* + * for now each logical cpu is in its own vector allocation domain. + */ +static cpumask_t x2apic_vector_allocation_domain(int cpu) +{ + cpumask_t domain = CPU_MASK_NONE; + cpu_set(cpu, domain); + return domain; +} + +static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, + unsigned int dest) +{ + unsigned long cfg; + + cfg = __prepare_ICR(0, vector, dest); + + /* + * send the IPI. + */ + x2apic_icr_write(cfg, apicid); +} + +/* + * for now, we send the IPI's one by one in the cpumask. + * TBD: Based on the cpu mask, we can send the IPI's to the cluster group + * at once. We have 16 cpu's in a cluster. This will minimize IPI register + * writes. 
+ */ +static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned long query_cpu; + + local_irq_save(flags); + for_each_cpu_mask(query_cpu, mask) { + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + x2apic_send_IPI_mask(mask, vector); +} + +static void x2apic_send_IPI_all(int vector) +{ + x2apic_send_IPI_mask(cpu_online_map, vector); +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return per_cpu(x86_cpu_to_logical_apicid, cpu); + else + return BAD_APICID; +} + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = x; + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = id; + return x; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + return current_cpu_data.initial_apicid >> index_msb; +} + +static void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static void init_x2apic_ldr(void) +{ + int cpu = smp_processor_id(); + + per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); + return; +} + +struct genapic apic_x2apic_cluster = { + .name = "cluster x2apic", + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .target_cpus = x2apic_target_cpus, + .vector_allocation_domain = x2apic_vector_allocation_domain, + .apic_id_registered = x2apic_apic_id_registered, + .init_apic_ldr = init_x2apic_ldr, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_self = x2apic_send_IPI_self, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), +}; diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c new file mode 100644 index 000000000000..d042211768b7 --- /dev/null +++ b/arch/x86/kernel/genx2apic_phys.c @@ -0,0 +1,154 @@ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/dmar.h> + +#include <asm/smp.h> +#include <asm/ipi.h> +#include <asm/genapic.h> + +static int x2apic_phys; + +static int set_x2apic_phys_mode(char *arg) +{ + x2apic_phys = 1; + return 0; +} +early_param("x2apic_phys", set_x2apic_phys_mode); + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (cpu_has_x2apic && x2apic_phys) + return 1; + + return 0; +} + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. 
*/ + +static cpumask_t x2apic_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static cpumask_t x2apic_vector_allocation_domain(int cpu) +{ + cpumask_t domain = CPU_MASK_NONE; + cpu_set(cpu, domain); + return domain; +} + +static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, + unsigned int dest) +{ + unsigned long cfg; + + cfg = __prepare_ICR(0, vector, dest); + + /* + * send the IPI. + */ + x2apic_icr_write(cfg, apicid); +} + +static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned long query_cpu; + + local_irq_save(flags); + for_each_cpu_mask(query_cpu, mask) { + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + x2apic_send_IPI_mask(mask, vector); +} + +static void x2apic_send_IPI_all(int vector) +{ + x2apic_send_IPI_mask(cpu_online_map, vector); +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return per_cpu(x86_cpu_to_apicid, cpu); + else + return BAD_APICID; +} + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = x; + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = id; + return x; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + return current_cpu_data.initial_apicid >> index_msb; +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +void init_x2apic_ldr(void) +{ + return; +} + +struct genapic apic_x2apic_phys = { + .name = "physical x2apic", + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .int_delivery_mode = dest_Fixed, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .target_cpus = x2apic_target_cpus, + .vector_allocation_domain = x2apic_vector_allocation_domain, + .apic_id_registered = x2apic_apic_id_registered, + .init_apic_ldr = init_x2apic_ldr, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_self = x2apic_send_IPI_self, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), +}; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b06..680a06557c5e 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -12,18 +12,49 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> -#include <linux/kernel.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/bootmem.h> #include <linux/module.h> +#include <linux/hardirq.h> #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> #include <asm/pgtable.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> +#include <asm/uv/bios.h> + +DEFINE_PER_CPU(int, x2apic_extra_bits); + +static enum uv_system_type uv_system_type; + +static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strcmp(oem_id, "SGI")) { + if (!strcmp(oem_table_id, "UVL")) + uv_system_type = 
UV_LEGACY_APIC; + else if (!strcmp(oem_table_id, "UVX")) + uv_system_type = UV_X2APIC; + else if (!strcmp(oem_table_id, "UVH")) { + uv_system_type = UV_NON_UNIQUE_APIC; + return 1; + } + } + return 0; +} + +enum uv_system_type get_uv_system_type(void) +{ + return uv_system_type; +} + +int is_uv_system(void) +{ + return uv_system_type != UV_NONE; +} +EXPORT_SYMBOL_GPL(is_uv_system); DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); @@ -40,6 +71,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ static cpumask_t uv_target_cpus(void) @@ -80,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector) unsigned long val, apicid, lapicid; int pnode; - apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ + apicid = per_cpu(x86_cpu_to_apicid, cpu); lapicid = apicid & 0x3f; /* ZZZ macro needed */ pnode = uv_apicid_to_pnode(apicid); val = @@ -94,7 +128,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector) { unsigned int cpu; - for (cpu = 0; cpu < NR_CPUS; ++cpu) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, mask)) uv_send_IPI_one(cpu, vector); } @@ -119,6 +153,10 @@ static int uv_apic_id_registered(void) return 1; } +static void uv_init_apic_ldr(void) +{ +} + static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) { int cpu; @@ -128,37 +166,65 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + WARN_ON(preemptible() && num_online_cpus() > 1); + id = x | __get_cpu_var(x2apic_extra_bits); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + /* maskout x2apic_extra_bits ? 
*/ + x = id; + return x; +} + +static unsigned int uv_read_apic_id(void) +{ + + return get_apic_id(apic_read(APIC_ID)); +} + static unsigned int phys_pkg_id(int index_msb) { - return GET_APIC_ID(read_apic_id()) >> index_msb; + return uv_read_apic_id() >> index_msb; } -#ifdef ZZZ /* Needs x2apic patch */ static void uv_send_IPI_self(int vector) { apic_write(APIC_SELF_IPI, vector); } -#endif struct genapic apic_x2apic_uv_x = { .name = "UV large system", + .acpi_madt_oem_check = uv_acpi_madt_oem_check, .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = uv_target_cpus, - .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ + .vector_allocation_domain = uv_vector_allocation_domain, .apic_id_registered = uv_apic_id_registered, + .init_apic_ldr = uv_init_apic_ldr, .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, - /* ZZZ.send_IPI_self = uv_send_IPI_self, */ + .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ + .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), }; static __cpuinit void set_x2apic_extra_bits(int pnode) @@ -218,12 +284,13 @@ static __init void map_low_mmrs(void) enum map_type {map_wb, map_uc}; -static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int shift, + int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; paddr = base << shift; - bytes = (1UL << shift); + bytes = (1UL << shift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -239,7 +306,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) - map_high("GRU", gru.s.base, shift, map_wb); + map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } static __init void map_config_high(int max_pnode) @@ -249,7 +316,7 @@ static __init void map_config_high(int max_pnode) cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, map_uc); + map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); } static __init void map_mmr_high(int max_pnode) @@ -259,7 +326,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, map_uc); + map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -269,10 +336,44 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, map_uc); + map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); +} + +static __init void uv_rtc_init(void) +{ + long status; + u64 ticks_per_sec; + + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, + &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { + printk(KERN_WARNING + "unable to determine platform RTC clock frequency, " + "guessing.\n"); + /* BIOS gives wrong value for clock freq. 
so guess */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else + sn_rtc_cycles_per_second = ticks_per_sec; } -static __init void uv_system_init(void) +/* + * Called on each cpu to initialize the per_cpu UV data area. + * ZZZ hotplug not supported yet + */ +void __cpuinit uv_cpu_init(void) +{ + /* CPU 0 initialization will be done via uv_system_init. */ + if (!uv_blade_info) + return; + + uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + + if (get_uv_system_type() == UV_NON_UNIQUE_APIC) + set_x2apic_extra_bits(uv_hub_info->pnode); +} + + +void __init uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; union uvh_node_id_u node_id; @@ -326,6 +427,11 @@ static __init void uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_bios_init(); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, + &uv_coherency_id, &uv_region_size); + uv_rtc_init(); + for_each_present_cpu(cpu) { nid = cpu_to_node(cpu); pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); @@ -345,7 +451,7 @@ static __init void uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ + uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@ -360,19 +466,6 @@ static __init void uv_system_init(void) map_mmr_high(max_pnode); map_config_high(max_pnode); map_mmioh_high(max_pnode); -} -/* - * Called on each cpu to initialize the per_cpu UV data area. - * ZZZ hotplug not supported yet - */ -void __cpuinit uv_cpu_init(void) -{ - if (!uv_node_to_blade) - uv_system_init(); - - uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; - - if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->pnode); + uv_cpu_init(); } diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9d..1dcb0f13897e 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,6 +35,7 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); + printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem.
*/ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c97819829146..d16084f90649 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif +void __init x86_64_init_pda(void) +{ + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; + pda_init(0); +} + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -81,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); @@ -100,13 +108,10 @@ void __init x86_64_start_kernel(char * real_mode_data) } load_idt((const struct desc_ptr *)&idt_descr); - early_printk("Kernel alive\n"); - - _cpu_pda = __cpu_pda; - cpu_pda(0) = &_boot_cpu_pda; - pda_init(0); + if (console_loglevel == 10) + early_printk("Kernel alive\n"); - early_printk("Kernel really alive\n"); + x86_64_init_pda(); x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f67e93441caf..e835b4eea70b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -#define PTE_ATTR 0x007 /* PRESENT+RW+USER */ -#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ -#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ - default_entry: #ifdef CONFIG_X86_PAE @@ -196,9 +192,9 @@ default_entry: movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_pmd), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ movl %ecx,(%edx) /* Store PMD entry */ /* Upper half already zero */ addl $8,%edx @@ -215,7 +211,7 @@ default_entry: * End condition: we must map up to and including INIT_MAP_BEYOND_END * bytes beyond the end of our own page tables. 
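(for reference: the three constants renamed throughout this file encode, per the removed definitions earlier in the hunk, 0x007 = PRESENT+RW+USER for PTEs, 0x067 = the same plus DIRTY+ACCESSED for PDEs, and 0x001 = PRESENT alone for PGDs. The new PTE_IDENT_ATTR family presumably keeps those values in a shared header that is not part of this hunk)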
*/ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b 1: @@ -224,7 +220,7 @@ default_entry: movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) #else /* Not PAE */ @@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_dir), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ movl %ecx,(%edx) /* Store identity PDE entry */ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ addl $4,%edx @@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * bytes beyond the end of our own page tables; the +0x007 is * the attribute bits */ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b movl %edi,pa(init_pg_tables_end) @@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_dir+0xffc) #endif jmp 3f @@ -456,9 +452,6 @@ is386: movl $2,%ecx # set MP 1: #endif /* CONFIG_SMP */ jmp *(initial_code) -.align 4 -ENTRY(initial_code) - .long i386_start_kernel /* * We depend on ET to be correct. This checks for 287/387. @@ -601,6 +594,11 @@ ignore_int: #endif iret +.section .cpuinit.data,"wa" +.align 4 +ENTRY(initial_code) + .long i386_start_kernel + .section .text /* * Real beginning of normal "text" segment @@ -632,19 +630,19 @@ ENTRY(empty_zero_page) /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) - .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 # elif KPMDS == 2 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 # elif KPMDS == 1 .long 0,0 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 # else # error "Kernel PMDs should be 1, 2 or 3" # endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b07ac7b217cb..26cfdc1d7c7f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -110,7 +110,7 @@ startup_64: movq %rdi, %rax shrq $PMD_SHIFT, %rax andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx leaq level2_spare_pgt(%rip), %rbx movq %rdx, 0(%rbx, %rax, 8) ident_complete: @@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. 
*/ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* @@ -407,6 +407,7 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#include "../../x86/xen/xen-head.S" .section .bss, "aw", @nobits .align L1_CACHE_BYTES diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0ea6a19bfdfe..77017e834cf7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,29 +1,49 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/sysdev.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/hpet.h> #include <linux/init.h> -#include <linux/sysdev.h> +#include <linux/cpu.h> #include <linux/pm.h> +#include <linux/io.h> #include <asm/fixmap.h> -#include <asm/hpet.h> #include <asm/i8253.h> -#include <asm/io.h> +#include <asm/hpet.h> -#define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000L +#define FSEC_PER_NSEC 1000000L + +#define HPET_DEV_USED_BIT 2 +#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) +#define HPET_DEV_VALID 0x8 +#define HPET_DEV_FSB_CAP 0x1000 +#define HPET_DEV_PERI_CAP 0x2000 + +#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ -unsigned long hpet_address; -static void __iomem *hpet_virt_address; +unsigned long hpet_address; +unsigned long hpet_num_timers; +static void __iomem *hpet_virt_address; + +struct hpet_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int flags; + char name[10]; +}; unsigned long hpet_readl(unsigned long a) { @@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void) static int boot_hpet_disable; int hpet_force_user; -static int __init hpet_setup(char* str) +static int __init hpet_setup(char *str) { if (str) { if (!strncmp("disable", str, 7)) @@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet); static inline int is_hpet_capable(void) { - return (!boot_hpet_disable && hpet_address); + return !boot_hpet_disable && hpet_address; } /* @@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET + +static void hpet_reserve_msi_timers(struct hpet_data *hd); + static void hpet_reserve_platform_timers(unsigned long id) { struct hpet __iomem *hpet = hpet_virt_address; @@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id) nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - memset(&hd, 0, sizeof (hd)); - hd.hd_phys_address = hpet_address; - hd.hd_address = hpet; - hd.hd_nirqs = nrtimers; - hd.hd_flags = HPET_DATA_PLATFORM; + memset(&hd, 0, sizeof(hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet; + hd.hd_nirqs = nrtimers; hpet_reserve_timer(&hd, 0); #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif + /* + * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 + * is wrong for i8259!) not the output IRQ. Many BIOS writers + * don't bother configuring *any* comparator interrupts. 
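(so below, hd_irq[0] and hd_irq[1] get fixed legacy pin numbers while every further slot is read back from its comparator's Tn_INT_ROUTE_CNF field, and the MSI-capable comparators found later are layered on top via hpet_reserve_msi_timers())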
+ */ hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; for (i = 2; i < nrtimers; timer++, i++) { - hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; + hd.hd_irq[i] = (readl(&timer->hpet_config) & + Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } + hpet_reserve_msi_timers(&hd); + hpet_alloc(&hd); } @@ -210,8 +239,8 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); + /* 5 usec minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = 5000; /* * Start hpet with the boot cpu mask and make it @@ -223,63 +252,421 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_setup_msi_irq(unsigned int irq); + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt, int timer) { unsigned long cfg, cmp, now; uint64_t delta; - switch(mode) { + switch (mode) { case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; - delta >>= hpet_clockevent.shift; + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; + delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned long) delta; - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); /* * The first write after writing TN_SETVAL to the * config register sets the counter value, the second * write sets the period. */ - hpet_writel(cmp, HPET_T0_CMP); + hpet_writel(cmp, HPET_Tn_CMP(timer)); udelay(1); - hpet_writel((unsigned long) delta, HPET_T0_CMP); + hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); break; case CLOCK_EVT_MODE_ONESHOT: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_RESUME: - hpet_enable_legacy_int(); + if (timer == 0) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_setup_msi_irq(hdev->irq); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + enable_irq(hdev->irq); + } break; } } -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt, int timer) { - unsigned long cnt; + u32 cnt; cnt = hpet_readl(HPET_COUNTER); - cnt += delta; - hpet_writel(cnt, HPET_T0_CMP); + cnt += (u32) delta; + hpet_writel(cnt, HPET_Tn_CMP(timer)); + + /* + * We need to read back the CMP register to make sure that + * what we wrote hit the chip before we compare it to the + * counter. + */ + WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt); + + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? 
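/*
 * Editorial sketch of the sequence the comparison below relies on,
 * assuming (per the comment above) that the MMIO write may be posted:
 *
 *     hpet_writel(cnt, HPET_Tn_CMP(timer));
 *     (void)hpet_readl(HPET_Tn_CMP(timer));   // force the write out
 *     if ((s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0)
 *             return -ETIME;                  // counter already passed cnt
 *
 * The signed-difference test keeps the check wrap-safe for the 32-bit
 * counter. Note that the WARN_ON above reads HPET_T0_CMP, which only
 * matches HPET_Tn_CMP(timer) for timer 0.
 */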
-ETIME : 0; +} + +static void hpet_legacy_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + hpet_set_mode(mode, evt, 0); +} + +static int hpet_legacy_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + return hpet_next_event(delta, evt, 0); +} + +/* + * HPET MSI Support + */ +#ifdef CONFIG_PCI_MSI + +static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); +static struct hpet_dev *hpet_devs; + +void hpet_msi_unmask(unsigned int irq) +{ + struct hpet_dev *hdev = get_irq_data(irq); + unsigned long cfg; + + /* unmask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg |= HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_mask(unsigned int irq) +{ + unsigned long cfg; + struct hpet_dev *hdev = get_irq_data(irq); + + /* mask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg &= ~HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_write(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); +} + +void hpet_msi_read(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); + msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); + msg->address_hi = 0; +} + +static void hpet_msi_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_set_mode(mode, evt, hdev->num); +} + +static int hpet_msi_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + return hpet_next_event(delta, evt, hdev->num); +} + +static int hpet_setup_msi_irq(unsigned int irq) +{ + if (arch_setup_hpet_msi(irq)) { + destroy_irq(irq); + return -EINVAL; + } + return 0; +} + +static int hpet_assign_irq(struct hpet_dev *dev) +{ + unsigned int irq; + + irq = create_irq(); + if (!irq) + return -EINVAL; + + set_irq_data(irq, dev); + + if (hpet_setup_msi_irq(irq)) + return -EINVAL; + + dev->irq = irq; + return 0; +} + +static irqreturn_t hpet_interrupt_handler(int irq, void *data) +{ + struct hpet_dev *dev = (struct hpet_dev *)data; + struct clock_event_device *hevt = &dev->evt; + + if (!hevt->event_handler) { + printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n", + dev->num); + return IRQ_HANDLED; + } + + hevt->event_handler(hevt); + return IRQ_HANDLED; +} + +static int hpet_setup_irq(struct hpet_dev *dev) +{ + + if (request_irq(dev->irq, hpet_interrupt_handler, + IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev)) + return -1; + + disable_irq(dev->irq); + irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + enable_irq(dev->irq); + + printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", + dev->name, dev->irq); + + return 0; +} + +/* This should be called in specific @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) +{ + struct clock_event_device *evt = &hdev->evt; + uint64_t hpet_freq; + + WARN_ON(cpu != smp_processor_id()); + if (!(hdev->flags & HPET_DEV_VALID)) + return; - return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? 
-ETIME : 0; + if (hpet_setup_msi_irq(hdev->irq)) + return; + + hdev->cpu = cpu; + per_cpu(cpu_hpet_dev, cpu) = hdev; + evt->name = hdev->name; + hpet_setup_irq(hdev); + evt->irq = hdev->irq; + + evt->rating = 110; + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hdev->flags & HPET_DEV_PERI_CAP) + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + + evt->set_mode = hpet_msi_set_mode; + evt->set_next_event = hpet_msi_next_event; + evt->shift = 32; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + evt->mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, evt->shift); + /* Calculate the max delta */ + evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); + /* 5 usec minimum reprogramming delta. */ + evt->min_delta_ns = 5000; + + evt->cpumask = cpumask_of_cpu(hdev->cpu); + clockevents_register_device(evt); +} + +#ifdef CONFIG_HPET +/* Reserve at least one timer for userspace (/dev/hpet) */ +#define RESERVE_TIMERS 1 +#else +#define RESERVE_TIMERS 0 +#endif + +static void hpet_msi_capability_lookup(unsigned int start_timer) +{ + unsigned int id; + unsigned int num_timers; + unsigned int num_timers_used = 0; + int i; + + id = hpet_readl(HPET_ID); + + num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + num_timers++; /* Value read out starts from 0 */ + + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); + if (!hpet_devs) + return; + + hpet_num_timers = num_timers; + + for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { + struct hpet_dev *hdev = &hpet_devs[num_timers_used]; + unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + + /* Only consider HPET timer with MSI support */ + if (!(cfg & HPET_TN_FSB_CAP)) + continue; + + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + hdev->num = i; + + sprintf(hdev->name, "hpet%d", i); + if (hpet_assign_irq(hdev)) + continue; + + hdev->flags |= HPET_DEV_FSB_CAP; + hdev->flags |= HPET_DEV_VALID; + num_timers_used++; + if (num_timers_used == num_possible_cpus()) + break; + } + + printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", + num_timers, num_timers_used); +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + int i; + + if (!hpet_devs) + return; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd->hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(hd, hdev->num); + } +} +#endif + +static struct hpet_dev *hpet_get_unused_timer(void) +{ + int i; + + if (!hpet_devs) + return NULL; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + if (test_and_set_bit(HPET_DEV_USED_BIT, + (unsigned long *)&hdev->flags)) + continue; + return hdev; + } + return NULL; +} + +struct hpet_work_struct { + struct delayed_work work; + struct completion complete; +}; + +static void hpet_work(struct work_struct *w) +{ + struct hpet_dev *hdev; + int cpu = smp_processor_id(); + struct hpet_work_struct *hpet_work; + + hpet_work = container_of(w, struct hpet_work_struct, work.work); + + hdev = hpet_get_unused_timer(); + if (hdev) + init_one_hpet_msi_clockevent(hdev, cpu); + + complete(&hpet_work->complete); +} + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long 
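/*
 * Worked example for the femtosecond conversion performed in
 * init_one_hpet_msi_clockevent() above, assuming the common
 * 14.31818 MHz HPET, i.e. hpet_period == 69841279 fs:
 *
 *     hpet_freq = 10^15 / 69841279 ~= 14318180 ticks per second
 *     evt->mult = div_sc(14318180, NSEC_PER_SEC, 32)
 *               = (14318180 << 32) / 10^9, roughly 61.5 million
 *
 * so set_next_event's delta_ns * mult >> 32 yields HPET ticks.
 */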
action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct hpet_work_struct work; + struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); + + switch (action & 0xf) { + case CPU_ONLINE: + INIT_DELAYED_WORK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + break; + case CPU_DEAD: + if (hdev) { + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + } + break; + } + return NOTIFY_OK; +} +#else + +static int hpet_setup_msi_irq(unsigned int irq) +{ + return 0; +} +static void hpet_msi_capability_lookup(unsigned int start_timer) +{ + return; +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + return; } +#endif + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + return NOTIFY_OK; +} + +#endif /* * Clock source related code @@ -359,6 +746,7 @@ static int hpet_clocksource_register(void) int __init hpet_enable(void) { unsigned long id; + int i; if (!is_hpet_capable()) return 0; @@ -369,6 +757,29 @@ int __init hpet_enable(void) * Read the period and check for a sane value: */ hpet_period = hpet_readl(HPET_PERIOD); + + /* + * AMD SB700 based systems with spread spectrum enabled use a + * SMM based HPET emulation to provide proper frequency + * setting. The SMM code is initialized with the first HPET + * register access and takes some time to complete. During + * this time the config register reads 0xffffffff. We check + * for max. 1000 loops whether the config register reads a non + * 0xffffffff value to make sure that HPET is up and running + * before we go further. A counting loop is safe, as the HPET + * access takes thousands of CPU cycles. On non SB700 based + * machines this check is only done once and has no side + * effects. + */ + for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) { + if (i == 1000) { + printk(KERN_WARNING + "HPET config register value = 0xFFFFFFFF. 
" + "Disabling HPET\n"); + goto out_nohpet; + } + } + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) goto out_nohpet; @@ -392,8 +803,10 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); + hpet_msi_capability_lookup(2); return 1; } + hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -410,6 +823,8 @@ out_nohpet: */ static __init int hpet_late_init(void) { + int cpu; + if (boot_hpet_disable) return -ENODEV; @@ -425,6 +840,13 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + for_each_online_cpu(cpu) { + hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); + } + + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(hpet_cpuhp_notify, -20); + return 0; } fs_initcall(hpet_late_init); @@ -468,7 +890,7 @@ void hpet_disable(void) #define RTC_NUM_INTS 1 static unsigned long hpet_rtc_flags; -static unsigned long hpet_prev_update_sec; +static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static unsigned long hpet_t1_cmp; @@ -575,6 +997,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags |= bit_mask; + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + if (!oldbits) hpet_rtc_timer_init(); @@ -652,7 +1077,7 @@ static void hpet_rtc_timer_reinit(void) if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost %d interrupts\n", + printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", lost_ints); } } @@ -670,7 +1095,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { - rtc_int_flag = RTC_UF; + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; hpet_prev_update_sec = curr_time.tm_sec; } diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index eb9ddd8efb82..1f20608d4ca8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -21,9 +21,12 @@ # include <asm/sigcontext32.h> # include <asm/user32.h> #else -# define save_i387_ia32 save_i387 -# define restore_i387_ia32 restore_i387 +# define save_i387_xstate_ia32 save_i387_xstate +# define restore_i387_xstate_ia32 restore_i387_xstate # define _fpstate_ia32 _fpstate +# define _xstate_ia32 _xstate +# define sig_xstate_ia32_size sig_xstate_size +# define fx_sw_reserved_ia32 fx_sw_reserved # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct #endif @@ -36,6 +39,7 @@ static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; +unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; void __cpuinit mxcsr_feature_mask_init(void) @@ -61,6 +65,11 @@ void __init init_thread_xstate(void) return; } + if (cpu_has_xsave) { + xsave_cntxt_init(); + return; + } + if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -83,9 +92,19 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ + /* + * Boot processor to setup the FP and extended state context info. 
+ */ + if (!smp_processor_id()) + init_thread_xstate(); + xsave_init(); + mxcsr_feature_mask_init(); /* clean state in init */ - current_thread_info()->status = 0; + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; clear_used_math(); } #endif /* CONFIG_X86_64 */ @@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, */ target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (cpu_has_xsave) + target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + return ret; } @@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!ret) convert_to_fxsr(target, &env); + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; return ret; } @@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) struct task_struct *tsk = current; struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; - unlazy_fpu(tsk); fp->status = fp->swd; if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) return -1; @@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) struct user_i387_ia32_struct env; int err = 0; - unlazy_fpu(tsk); - convert_from_fxsr(&env, tsk); if (__copy_to_user(buf, &env, sizeof(env))) return -1; @@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) if (err) return -1; - if (__copy_to_user(&buf->_fxsr_env[0], fx, - sizeof(struct i387_fxsave_struct))) + if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size)) + return -1; + return 1; +} + +static int save_i387_xsave(void __user *buf) +{ + struct task_struct *tsk = current; + struct _fpstate_ia32 __user *fx = buf; + int err = 0; + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. + * This will enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware applications can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + + if (save_i387_fxsave(fx) < 0) + return -1; + + err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32, + sizeof(struct _fpx_sw_bytes)); + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *) (buf + sig_xstate_ia32_size + - FP_XSTATE_MAGIC2_SIZE)); + if (err) return -1; + return 1; } -int save_i387_ia32(struct _fpstate_ia32 __user *buf) +int save_i387_xstate_ia32(void __user *buf) { + struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; + struct task_struct *tsk = current; + if (!used_math()) return 0; + + if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size)) + return -EACCES; /* * This will cause a "finit" to be triggered by the next * attempted FPU operation by the 'current' process. @@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf) if (!HAVE_HWFP) { return fpregs_soft_get(current, NULL, 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) ? -1 : 1; + NULL, fp) ? 
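/*
 * Schematic of the ia32 signal frame save_i387_xsave() above lays
 * out (offsets illustrative, not exact):
 *
 *     fp                    legacy fxsave image plus user env
 *     fp->sw_reserved       fx_sw_reserved_ia32 (_fpx_sw_bytes)
 *     ...                   extended xsave area
 *     buf + sig_xstate_ia32_size - FP_XSTATE_MAGIC2_SIZE
 *                           FP_XSTATE_MAGIC2 sentinel
 *
 * the restore side (below) trusts the extended area only after
 * check_for_xstate() has validated these sw_reserved bytes.
 */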
-1 : 1; } + unlazy_fpu(tsk); + + if (cpu_has_xsave) + return save_i387_xsave(fp); if (cpu_has_fxsr) - return save_i387_fxsave(buf); + return save_i387_fxsave(fp); else - return save_i387_fsave(buf); + return save_i387_fsave(fp); } static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) @@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) sizeof(struct i387_fsave_struct)); } -static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) +static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, + unsigned int size) { struct task_struct *tsk = current; struct user_i387_ia32_struct env; int err; err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct)); + size); /* mxcsr reserved bits must be masked to zero for security reasons */ tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; if (err || __copy_from_user(&env, buf, sizeof(env))) @@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) return 0; } -int restore_i387_ia32(struct _fpstate_ia32 __user *buf) +static int restore_i387_xsave(void __user *buf) +{ + struct _fpx_sw_bytes fx_sw_user; + struct _fpstate_ia32 __user *fx_user = + ((struct _fpstate_ia32 __user *) buf); + struct i387_fxsave_struct __user *fx = + (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; + struct xsave_hdr_struct *xsave_hdr = + ¤t->thread.xstate->xsave.xsave_hdr; + u64 mask; + int err; + + if (check_for_xstate(fx, buf, &fx_sw_user)) + goto fx_only; + + mask = fx_sw_user.xstate_bv; + + err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + /* + * Init the state that is not present in the memory layout + * and enabled by the OS. + */ + mask = ~(pcntxt_mask & ~mask); + xsave_hdr->xstate_bv &= mask; + + return err; +fx_only: + /* + * Couldn't find the extended state information in the memory + * layout. Restore the FP/SSE and init the other extended state + * enabled by the OS. 
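A worked instance of the masking a few lines up, assuming the OS enables three feature bits (pcntxt_mask == 0x7) while the frame's fx_sw_user.xstate_bv carries only FP and SSE (0x3):

    mask = ~(pcntxt_mask & ~mask) = ~(0x7 & ~0x3) = ~0x4
    xsave_hdr->xstate_bv &= mask;           // clears bit 2

so state the frame did not supply is re-initialized on restore rather than inherited from whatever the register file last held.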
+ */ + xsave_hdr->xstate_bv = XSTATE_FPSSE; + return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct)); +} + +int restore_i387_xstate_ia32(void __user *buf) { int err; struct task_struct *tsk = current; + struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; if (HAVE_HWFP) clear_fpu(tsk); + if (!buf) { + if (used_math()) { + clear_fpu(tsk); + clear_used_math(); + } + + return 0; + } else + if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size)) + return -EACCES; + if (!used_math()) { err = init_fpu(tsk); if (err) @@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf) } if (HAVE_HWFP) { - if (cpu_has_fxsr) - err = restore_i387_fxsave(buf); + if (cpu_has_xsave) + err = restore_i387_xsave(buf); + else if (cpu_has_fxsr) + err = restore_i387_fxsave(fp, sizeof(struct + i387_fxsave_struct)); else - err = restore_i387_fsave(buf); + err = restore_i387_fsave(fp); } else { err = fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + NULL, fp) != 0; } set_used_math(); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index dc92b49d9204..4b8a53d841f7 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); +void mask_8259A(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void unmask_8259A(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + void init_8259A(int auto_eoi) { unsigned long flags; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c index 6510cde36b35..b764d7429c61 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic.c @@ -27,16 +27,21 @@ #include <linux/sched.h> #include <linux/pci.h> #include <linux/mc146818rtc.h> +#include <linux/compiler.h> #include <linux/acpi.h> +#include <linux/module.h> #include <linux/sysdev.h> #include <linux/msi.h> #include <linux/htirq.h> -#include <linux/dmar.h> -#include <linux/jiffies.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/jiffies.h> /* time_after() */ #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif #include <linux/bootmem.h> +#include <linux/dmar.h> +#include <linux/hpet.h> #include <asm/idle.h> #include <asm/io.h> @@ -45,62 +50,31 @@ #include <asm/proto.h> #include <asm/acpi.h> #include <asm/dma.h> +#include <asm/timer.h> +#include <asm/i8259.h> #include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> +#include <asm/setup.h> +#include <asm/irq_remapping.h> +#include <asm/hpet.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/uv_irq.h> #include <mach_ipi.h> #include <mach_apic.h> +#include <mach_apicdef.h> -struct irq_cfg { - cpumask_t domain; - cpumask_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
*/ -static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, -}; - -static int assign_irq_vector(int irq, cpumask_t mask); +#define __apicdebuginit(type) static type __init -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -#define __apicdebuginit __init - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static int no_timer_check; - -static int disable_timer_pin_1 __initdata; - -int timer_through_8259 __initdata; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -117,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); +int skip_ioapic_setup; + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + +struct irq_pin_list; +struct irq_cfg { + unsigned int irq; + struct irq_pin_list *irq_2_pin; + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; + u8 vector; + u8 move_in_progress : 1; +}; + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
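An illustrative, debug-style use of the iterator and accessors defined just below (the printk is a sketch, not code from this commit):

    struct irq_cfg *cfg;
    unsigned int irq;

    for_each_irq_cfg(irq, cfg)
            if (cfg->vector)
                    printk(KERN_DEBUG "irq %u -> vector %u\n", irq, cfg->vector);

irq_cfg() bounds-checks against nr_irqs and irq_cfg_alloc() is still a pass-through; keeping both names would let a later rework allocate entries on demand without touching the call sites.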
*/ +static struct irq_cfg irq_cfgx[NR_IRQS] = { + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +}; + +#define for_each_irq_cfg(irq, cfg) \ + for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + return irq_cfg(irq); +} + /* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. + * Rough estimation of how many shared IRQs there are, can be changed + * anytime. */ #define MAX_PLUS_SHARED_IRQS NR_IRQS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) @@ -133,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); * between pins and IRQs. */ -static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; +static struct irq_pin_list *irq_2_pin_ptr; + +static void __init irq_2_pin_init(void) +{ + struct irq_pin_list *pin = irq_2_pin_head; + int i; + + for (i = 1; i < PIN_MAP_SIZE; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin = irq_2_pin_ptr; + + if (!pin) + panic("can not get more irq_2_pin\n"); + + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; +} struct io_apic { unsigned int index; @@ -166,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i /* * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. 
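The irq_2_pin allocator above is a simple free list carved from a static pool; schematically, right after irq_2_pin_init():

    irq_2_pin_head: [0] -> [1] -> [2] -> ... -> [PIN_MAP_SIZE-1] -> NULL
    irq_2_pin_ptr --^

each get_one_free_irq_2_pin() call pops the head, NULL-terminates it and hands it out, panicking only if the pool is exhausted.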
+ * + * Older SiS APIC requires we rewrite the index register */ -static inline void io_apic_modify(unsigned int apic, unsigned int value) +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -177,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); - entry = irq_2_pin + irq; + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; int pin; - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { @@ -195,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq) } if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); return false; } -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - BUG_ON(irq >= NR_IRQS); \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -293,54 +326,71 @@ static void ioapic_mask_entry(int apic, int pin) static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) { int apic, pin; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; - BUG_ON(irq >= NR_IRQS); + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; + + if (!entry) + break; + apic = entry->apic; pin = entry->pin; - if (pin == -1) - break; +#ifdef CONFIG_INTR_REMAP + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); +#else io_apic_write(apic, 0x11 + pin*2, dest); +#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; - io_apic_modify(apic, reg); + io_apic_modify(apic, 0x10 + pin*2, reg); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } } +static int assign_irq_vector(int irq, cpumask_t mask); + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned long flags; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) return; + cfg = irq_cfg(irq); if (assign_irq_vector(irq, mask)) return; cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); - /* * Only the high 8 bits are valid. 
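(flow of the affinity change above: intersect the requested mask with cpu_online_map, reassign the vector for that mask, fold the resulting domain into an APIC id via cpu_mask_to_apicid(), then rewrite every RTE for the irq behind ioapic_lock via __target_IO_APIC_irq() before desc->affinity is updated)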
*/ dest = SET_APIC_LOGICAL_ID(dest); + desc = irq_to_desc(irq); spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } -#endif +#endif /* CONFIG_SMP */ /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are @@ -349,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) */ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + return; + } - BUG_ON(irq >= NR_IRQS); - while (entry->next) - entry = irq_2_pin + entry->next; + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: ran out of irq_2_pin entries!"); + entry = entry->next; } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; entry->apic = apic; entry->pin = pin; } @@ -373,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; - while (1) { + while (entry) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - } - if (!entry->next) + replaced = 1; + /* every one is different, right? */ break; - entry = irq_2_pin + entry->next; + } + entry = entry->next; } + + /* why? call replace before add? 
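(one reading of the question above: now that add_pin_to_irq() refuses duplicates, letting a failed replace fall through to an add is harmless, so a caller moving a pin need not know whether it was registered first; this is an inference, the hunk itself gives no rationale)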
*/ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); } +static inline void io_apic_modify_irq(unsigned int irq, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + int pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) + cfg = irq_cfg(irq); + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + unsigned int reg; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); + } +} -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); +} -/* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) +#ifdef CONFIG_X86_64 +void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} + +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +} +#else /* CONFIG_X86_32 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); +} + +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); +} +#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -439,24 +556,145 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } -int skip_ioapic_setup; -int ioapic_force; +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +void send_IPI_self(int vector) +{ + unsigned int cfg; -static int __init parse_noapic(char *str) + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; + +static int __init ioapic_pirq_setup(char *str) { - disable_ioapic_setup(); + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. 
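Concretely, with an assumed command line of pirq=10,11,12 the loop below fills pirq_entries[7] = 10, [6] = 11 and [5] = 12, which pin_2_irq() further down maps to IO-APIC pins 23, 22 and 21 respectively, matching the upside-down note above.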
+ */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + goto nomem; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + return 0; + +nomem: + while (apic >= 0) + kfree(early_ioapic_entries[apic--]); + memset(early_ioapic_entries, 0, + ARRAY_SIZE(early_ioapic_entries)); + + return -ENOMEM; } -early_param("noapic", parse_noapic); -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) +void restore_IO_APIC_setup(void) { - disable_timer_pin_1 = 1; - return 1; + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!early_ioapic_entries[apic]) + break; + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); + kfree(early_ioapic_entries[apic]); + early_ioapic_entries[apic] = NULL; + } } -__setup("disable_timer_pin_1", disable_timer_pin_setup); +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} +#endif /* * Find the IRQ entry number of a certain pin. @@ -560,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } - BUG_ON(best_guess >= NR_IRQS); return best_guess; } +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ #define default_ISA_trigger(idx) (0) #define default_ISA_polarity(idx) (0) +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. 
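(as a worked case of EISA_ELCR() above: irq 10 probes port 0x4d0 + (10 >> 3) = 0x4d1 and tests bit 10 & 7 = 2, so (inb(0x4d1) >> 2) & 1 yields its level/edge flag.)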
If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + /* PCI interrupts are always polarity one level triggered, * when listed as conforming in the MP table. */ #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + static int MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mp_srcbus; @@ -633,6 +903,36 @@ static int MPBIOS_trigger(int idx) trigger = default_ISA_trigger(idx); else trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif break; case 1: /* edge */ { @@ -670,6 +970,7 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +int (*ioapic_renumber_irq)(int ioapic, int irq); static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -691,11 +992,48 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } - BUG_ON(irq >= NR_IRQS); + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + return irq; } +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + spin_unlock(&vector_lock); +} + static int __assign_irq_vector(int irq, cpumask_t mask) { /* @@ -714,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) int cpu; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= NR_IRQS); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); /* Only try and allocate irqs on cpus that are present */ cpus_and(mask, mask, cpu_online_map); @@ -731,7 +1068,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) return 0; } - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { cpumask_t domain, new_mask; int new_cpu; int vector, offset; @@ -750,9 +1087,14 @@ next: } if (unlikely(current_vector == vector)) continue; +#ifdef CONFIG_X86_64 if (vector == IA32_SYSCALL_VECTOR) goto next; - for_each_cpu_mask(new_cpu, new_mask) +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif + for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! 
*/ @@ -762,7 +1104,7 @@ next: cfg->move_in_progress = 1; cfg->old_domain = cfg->domain; } - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; cfg->domain = domain; @@ -788,30 +1130,30 @@ static void __clear_irq_vector(int irq) cpumask_t mask; int cpu, vector; - BUG_ON((unsigned)irq >= NR_IRQS); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; cpus_clear(cfg->domain); } -static void __setup_vector_irq(int cpu) +void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ int irq, vector; + struct irq_cfg *cfg; /* Mark the inuse vectors */ - for (irq = 0; irq < NR_IRQS; ++irq) { - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + for_each_irq_cfg(irq, cfg) { + if (!cpu_isset(cpu, cfg->domain)) continue; - vector = irq_cfg[irq].vector; + vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -819,44 +1161,154 @@ static void __setup_vector_irq(int cpu) irq = per_cpu(vector_irq, cpu)[vector]; if (irq < 0) continue; - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } -void setup_vector_irq(int cpu) -{ - spin_lock(&vector_lock); - __setup_vector_irq(smp_processor_id()); - spin_unlock(&vector_lock); -} +static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 -static struct irq_chip ioapic_chip; +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif static void ioapic_register_intr(int irq, unsigned long trigger) { - if (trigger) { - irq_desc[irq].status |= IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + desc->status |= IRQ_LEVEL; + else + desc->status &= ~IRQ_LEVEL; + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + desc->status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { - irq_desc[irq].status &= ~IRQ_LEVEL; + handle_fasteoi_irq, + "fasteoi"); + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); +} + +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) +{ + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + +#ifdef 
CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; } + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + return 0; } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, int trigger, int polarity) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct IO_APIC_route_entry entry; cpumask_t mask; if (!IO_APIC_IRQ(irq)) return; + cfg = irq_cfg(irq); + mask = TARGET_CPUS; if (assign_irq_vector(irq, mask)) return; @@ -869,24 +1321,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, irq, trigger, polarity); - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest = cpu_mask_to_apicid(mask); - entry.mask = 0; /* enable IRQ */ - entry.trigger = trigger; - entry.polarity = polarity; - entry.vector = cfg->vector; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
- */ - if (trigger) - entry.mask = 1; + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } ioapic_register_intr(irq, trigger); if (irq < 16) @@ -897,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, static void __init setup_IO_APIC_irqs(void) { - int apic, pin, idx, irq, first_notcon = 1; + int apic, pin, idx, irq; + int notcon = 0; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + idx = find_irq_entry(apic, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - irq = pin_2_irq(idx, apic, pin); - add_pin_to_irq(irq, apic, pin); + irq = pin_2_irq(idx, apic, pin); +#ifdef CONFIG_X86_32 + if (multi_timer_check(apic, irq)) + continue; +#endif + add_pin_to_irq(irq, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } } - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); + if (notcon) + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); } /* @@ -938,6 +1393,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + return; +#endif + memset(&entry, 0, sizeof(entry)); /* @@ -964,13 +1424,17 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, ioapic_write_entry(apic, pin, entry); } -void __apicdebuginit print_IO_APIC(void) + +__apicdebuginit(void) print_IO_APIC(void) { int apic, i; union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; unsigned long flags; + struct irq_cfg *cfg; + unsigned int irq; if (apic_verbosity == APIC_QUIET) return; @@ -993,12 +1457,16 @@ void __apicdebuginit print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); printk(KERN_DEBUG ".... 
register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); @@ -1006,11 +1474,27 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if (reg_01.bits.version >= 0x10) { + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); } + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" @@ -1039,16 +1523,16 @@ void __apicdebuginit print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) + for_each_irq_cfg(irq, cfg) { + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } printk("\n"); } @@ -1058,9 +1542,7 @@ void __apicdebuginit print_IO_APIC(void) return; } -#if 0 - -static __apicdebuginit void print_APIC_bitfield (int base) +__apicdebuginit(void) print_APIC_bitfield(int base) { unsigned int v; int i, j; @@ -1081,9 +1563,10 @@ static __apicdebuginit void print_APIC_bitfield (int base) } } -void __apicdebuginit print_local_APIC(void * dummy) +__apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; + u64 icr; if (apic_verbosity == APIC_QUIET) return; @@ -1091,7 +1574,7 @@ void __apicdebuginit print_local_APIC(void * dummy) printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1100,20 +1583,31 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... 
APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + } - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); v = apic_read(APIC_LDR); printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + } v = apic_read(APIC_SPIV); printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); @@ -1124,13 +1618,17 @@ void __apicdebuginit print_local_APIC(void * dummy) printk(KERN_DEBUG "... APIC IRR field:\n"); print_APIC_bitfield(APIC_IRR); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); @@ -1158,12 +1656,17 @@ void __apicdebuginit print_local_APIC(void * dummy) printk("\n"); } -void print_all_local_APICs (void) +__apicdebuginit(void) print_all_local_APICs(void) { - on_each_cpu(print_local_APIC, NULL, 1); + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + preempt_enable(); } -void __apicdebuginit print_PIC(void) +__apicdebuginit(void) print_PIC(void) { unsigned int v; unsigned long flags; @@ -1195,19 +1698,34 @@ void __apicdebuginit print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -#endif /* 0 */ +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; - int i, apic; + int apic; unsigned long flags; - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } +#ifdef CONFIG_X86_32 + int i; + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; +#endif /* * The number of IO-APIC IRQ registers (== #pins): @@ -1237,6 +1755,10 @@ void __init enable_IO_APIC(void) } found_i8259: /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. 
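+ * (For illustration: find_isa_irq_pin(0, mp_ExtINT) below scans mp_irqs[] + * for an ISA-bus entry of type mp_ExtINT for irq 0 and returns its + * destination pin; a board that wires the PIC output through IOAPIC 0 + * pin 0 would report exactly such an entry.)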
+ */ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ @@ -1285,7 +1807,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest = GET_APIC_ID(read_apic_id()); + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1296,6 +1818,133 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + return; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mp_apicid; + + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mp_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mp_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mp_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mp_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mp_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mp_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic + = mp_ioapics[apic].mp_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. 
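+ * (e.g. if the MP table says IOAPIC#0 has ID 2 while register 0 still + * reads back ID 0, the 2 is written into reg_00.bits.ID -- the ID field + * in the top byte of register 0 -- and re-read below as a sanity check.)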
+ */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mp_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation:
@@ -1309,6 +1958,9 @@ static int __init timer_irq_works(void) unsigned long t1 = jiffies; unsigned long flags; + if (no_timer_check) + return 1; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */
@@ -1369,19 +2021,27 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { - struct irq_cfg *cfg = &irq_cfg[irq]; - cpumask_t mask; + + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - mask = cpumask_of_cpu(first_cpu(cfg->domain)); - send_IPI_mask(mask, cfg->vector); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; } +#else +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_cfg(irq)->vector); + + return 1; +} +#endif /* * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1393,11 +2053,159 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP + +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); + +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update (of vector + * and cpu destination) of the IRTE and a flush of the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE as well with the + * updated vector information, along with modifying the IRTE with vector and + * destination. So irq migration for level triggered is a little more complex + * compared to edge triggered migration. But the good news is, we use the + * same algorithm for level triggered migration as we have today, the only + * difference being that we now initiate the irq migration from process + * context instead of interrupt context. + * + * In the future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to the IO-APIC RTE.
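+ * + * As a rough sketch (names as used below), the level triggered path is: + * mask_IO_APIC_irq(irq); + * if (io_apic_level_ack_pending(irq)) + * schedule_delayed_work(&ir_migration_work, 1); -- retry after the EOI + * else + * migrate_ioapic_irq(irq, desc->pending_mask); -- update RTE, then IRTE + * unmask_IO_APIC_irq(irq);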
+ */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte; + unsigned int dest; + unsigned long flags; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modify the IRTE and flush the interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + }
+ + desc->affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt is in progress. Migrating the irq now will change + * the vector information in the IO-APIC RTE and that will + * confuse the EOI broadcast performed by the cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everything is clear, we have the right of way */ + migrate_ioapic_irq(irq, desc->pending_mask); + + ret = 0; + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +}
+ +static void ir_irq_migration(struct work_struct *work) +{ + unsigned int irq; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, desc->pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context.
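+ * Level triggered irqs are only marked IRQ_MOVE_PENDING here and handed to + * migrate_irq_remapped_level(), which may defer the move while an EOI is + * still outstanding; edge triggered irqs are migrated immediately.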
+ */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; ack_APIC_irq(); +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); me = smp_processor_id(); @@ -1406,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= NR_IRQS) + + desc = irq_to_desc(irq); + if (!desc) continue; - desc = irq_desc + irq; - cfg = irq_cfg + irq; + cfg = irq_cfg(irq); spin_lock(&desc->lock); if (!cfg->move_cleanup_count) goto unlock; @@ -1429,7 +2238,7 @@ unlock: static void irq_complete_move(unsigned int irq) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg = irq_cfg(irq); unsigned vector, me; if (likely(!cfg->move_in_progress)) @@ -1449,6 +2258,17 @@ static void irq_complete_move(unsigned int irq) #else static inline void irq_complete_move(unsigned int irq) {} #endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif static void ack_apic_edge(unsigned int irq) { @@ -1457,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +atomic_t irq_mis_count; + static void ack_apic_level(unsigned int irq) { +#ifdef CONFIG_X86_32 + unsigned long v; + int i; +#endif int do_unmask_irq = 0; irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } #endif +#ifdef CONFIG_X86_32 + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); +#endif + /* * We must acknowledge the irq before we move it or the acknowledge will * not propagate properly. 
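 * (For the 32-bit workaround above: vector i maps to the TMR register at * byte offset ((i & ~0x1f) >> 1) from APIC_TMR, bit (i & 0x1f) -- e.g. * vector 0x31 is bit 0x11 of the second TMR word. A set bit means the * local APIC latched the interrupt as level triggered, so the simulated * EOI below is not needed for it.)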
@@ -1508,24 +2359,51 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq(irq); } + +#ifdef CONFIG_X86_32 + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +#endif } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif + static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; /* * NOTE! The local APIC isn't very good at handling @@ -1538,8 +2416,8 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < NR_IRQS ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { + for_each_irq_cfg(irq, cfg) { + if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -1547,27 +2425,33 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } -static void unmask_lapic_irq(unsigned int irq) +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } -static void mask_lapic_irq(unsigned int irq) +static void unmask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static void ack_lapic_irq (unsigned int irq) @@ -1584,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -1592,19 +2479,19 @@ static void lapic_register_intr(int irq) static void __init setup_nmi(void) { /* - * Dirty trick to enable the NMI watchdog ... + * Dirty trick to enable the NMI watchdog ... * We put the 8259A master into AEOI mode and * unmask on all local APICs LVT0 as NMI. * * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') * is from Maciej W. Rozycki - so we do not have to EOI from * the NMI handler or the timer interrupt. 
- */ - printk(KERN_INFO "activating NMI Watchdog ..."); + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); enable_NMI_through_LVT0(); - printk(" done.\n"); + apic_printk(APIC_VERBOSE, " done.\n"); } /* @@ -1621,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void) unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) + if (apic == -1) { + WARN_ON_ONCE(1); return; + } entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -1661,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void) ioapic_write_entry(apic, pin, entry0); } +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. * - * FIXME: really need to revamp this for modern platforms only. + * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg + 0; + struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; unsigned long flags; + unsigned int ver; int no_pin1 = 0; local_irq_save(flags); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + /* * get/set the timer IRQ vector: */ @@ -1686,18 +2593,27 @@ static inline void __init check_timer(void) /* * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); +#ifdef CONFIG_X86_32 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -1707,6 +2623,10 @@ static inline void __init check_timer(void) * 8259A. 
*/ if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); +#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -1724,7 +2644,7 @@ static inline void __init check_timer(void) setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { + if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); enable_8259A_irq(0); @@ -1733,16 +2653,19 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " "8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_VERBOSE,KERN_INFO - "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", - apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -1751,7 +2674,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -1765,29 +2688,35 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); @@ -1796,22 +2725,16 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. 
Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - /* * Traditionally ISA IRQ2 is the cascade IRQ, and is not available * to devices. However there may be an I/O APIC pin available for @@ -1829,27 +2752,49 @@ __setup("no_timer_check", notimercheck); * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1<<2) +#define PIC_IRQS (1 << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { +#ifdef CONFIG_X86_32 + enable_IO_APIC(); +#else /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ +#endif io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); } +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + struct sysfs_ioapic_data { struct sys_device dev; struct IO_APIC_route_entry entry[0]; @@ -1937,38 +2882,60 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq; - int new; + unsigned int irq; + unsigned int new; unsigned long flags; + struct irq_cfg *cfg_new; + + irq_want = nr_irqs - 1; - irq = -ENOSPC; + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { + for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_cfg[new].vector != 0) + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) continue; + /* check if need to create one */ + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); if (__assign_irq_vector(new, TARGET_CPUS) == 0) irq = new; break; } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + void destroy_irq(unsigned int irq) { unsigned long flags; dynamic_irq_cleanup(irq); +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); @@ -1980,18 +2947,50 @@ void destroy_irq(unsigned int irq) #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; unsigned dest; cpumask_t tmp; tmp = TARGET_CPUS; err = assign_irq_vector(irq, tmp); - if (!err) { - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + if (err) + return err; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + 
irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { + msg->address_hi = MSI_ADDR_BASE_HI; msg->address_lo = MSI_ADDR_BASE_LO | ((INT_DEST_MODE == 0) ? @@ -2016,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms #ifdef CONFIG_SMP static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2028,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2039,8 +3040,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif #endif /* CONFIG_SMP */ /* @@ -2058,26 +3112,179 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. 
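+ * (e.g. a device enabling four MSI messages gets one consecutive block + * index .. index+3; the caller programs the base index once and the + * individual messages are told apart by sub_handle -- see + * arch_setup_msi_irqs() below.)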
+ */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) { + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +{ + int ret; struct msi_msg msg; - int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_to_desc(irq); + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + + return 0; +} + +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ + unsigned int irq; + int ret; + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; + +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); if (ret < 0) { destroy_irq(irq); return ret; } + return 0; - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif +} - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. 
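+ * (With MSI_ADDR_IR_SHV set in the address -- see msi_compose_msg() -- the + * hardware adds the MSI data word, programmed to sub_handle there, to the + * base index, so the n-th message of the block selects IRTE index + n.)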
+ */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } return 0; + +error: + destroy_irq(irq); + return ret; } void arch_teardown_msi_irq(unsigned int irq) @@ -2089,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2101,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2112,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2142,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq) } #endif +#ifdef CONFIG_HPET_TIMER + +#ifdef CONFIG_SMP +static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + hpet_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = hpet_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_hpet_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + + hpet_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, + "edge"); + + return 0; +} +#endif + #endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support @@ -2166,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2177,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif @@ -2198,7 +3474,7 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; cpumask_t tmp; @@ -2208,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) struct ht_irq_msg msg; 
unsigned dest; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp);
@@ -2230,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); } return err; } #endif /* CONFIG_HT_IRQ */ +#ifdef CONFIG_X86_64 +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long flags; + int err; + + err = assign_irq_vector(irq, *eligible_cpu); + if (err != 0) + return err; + + spin_lock_irqsave(&vector_lock, flags); + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + spin_unlock_irqrestore(&vector_lock, flags); + + cfg = irq_cfg(irq); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->vector = cfg->vector; + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cpu_mask_to_apicid(*eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + return irq; +}
+ +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * no longer allowed to be sent. + */ +void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int mmr_pnode; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->mask = 1; + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} +#endif /* CONFIG_X86_64 */
+ +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + +int __init probe_nr_irqs(void) +{ + int idx; + int nr = 0; +#ifndef CONFIG_XEN + int nr_min = 32; +#else + int nr_min = NR_IRQS; +#endif + + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + /* double it for hotplug and msi and nmi */ + nr <<= 1; + + /* something wrong ?
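+ * e.g. two 24-pin IOAPICs give nr = 48, doubled to 96 as headroom for + * MSI, hotplug and NMI; anything below nr_min (32, or NR_IRQS under Xen) + * is clamped up to that floor.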
*/ + if (nr < nr_min) + nr = nr_min; + + return nr; +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI -#define IO_APIC_MAX_ID 0xFE +#ifdef CONFIG_X86_32 +int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; -int __init io_apic_get_redir_entries (int ioapic) + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -2252,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic) reg_01.raw = io_apic_read(ioapic, 1); spin_unlock_irqrestore(&ioapic_lock, flags); - return reg_01.bits.entries; + return reg_01.bits.version; } - +#endif int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { @@ -2306,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; if (skip_ioapic_setup == 1) return; @@ -2321,10 +3775,15 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. 
*/ - if (!irq_cfg[irq].vector) + cfg = irq_cfg(irq); + if (!cfg->vector) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); +#ifdef CONFIG_INTR_REMAP + else if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); +#endif else set_ioapic_affinity_irq(irq, TARGET_CPUS); } @@ -2375,18 +3834,33 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; + irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", + "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; @@ -2420,4 +3894,3 @@ static int __init ioapic_insert_resources(void) /* Insert the IO APIC resources after PCI initialization has occured to handle * IO APICS that are mapped in on a BAR in PCI space. */ late_initcall(ioapic_insert_resources); - diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c deleted file mode 100644 index 558abf4c796a..000000000000 --- a/arch/x86/kernel/io_apic_32.c +++ /dev/null @@ -1,2900 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and - * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, - * further tested and cleaned up by Zach Brown <zab@redhat.com> - * and Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/bootmem.h> -#include <linux/mc146818rtc.h> -#include <linux/compiler.h> -#include <linux/acpi.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/pci.h> -#include <linux/msi.h> -#include <linux/htirq.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/jiffies.h> /* time_after() */ - -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/desc.h> -#include <asm/timer.h> -#include <asm/i8259.h> -#include <asm/nmi.h> -#include <asm/msidef.h> -#include <asm/hypertransport.h> - -#include <mach_apic.h> -#include <mach_apicdef.h> - -int (*ioapic_renumber_irq)(int ioapic, int irq); -atomic_t irq_mis_count; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int timer_through_8259 __initdata; - -/* - * Is the SiS APIC rmw bug present ? 
- * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -static int disable_timer_pin_1 __initdata; - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - int apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - volatile struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! 
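Aside: the two ordering rules spelled out in the comments above are easy to mix up, so here they are side by side (hypothetical helper names; the real code is __ioapic_write_entry() and ioapic_mask_entry()):

/* Sketch only: a 64-bit RTE is written as two 32-bit halves through the
 * same index/data window, so write order is all that keeps a half-written
 * entry from being delivered. */
static void rte_write_for_unmask(int apic, int pin, union entry_union eu)
{
	io_apic_write(apic, 0x11 + 2 * pin, eu.w2);	/* high word first */
	io_apic_write(apic, 0x10 + 2 * pin, eu.w1);	/* mask word last */
}

static void rte_write_for_mask(int apic, int pin, union entry_union eu)
{
	io_apic_write(apic, 0x10 + 2 * pin, eu.w1);	/* set mask bit first */
	io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
}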
- */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); - } - entry->apic = apic; - entry->pin = pin; -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - - while (1) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - } - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int pin, reg; - - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} - -static void mask_IO_APIC_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC(void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -{ - unsigned long flags; - int pin; - struct irq_pin_list *entry = irq_2_pin + irq; 
- unsigned int apicid_value; - cpumask_t tmp; - - cpus_and(tmp, cpumask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(cpumask, tmp, CPU_MASK_ALL); - - apicid_value = cpu_mask_to_apicid(cpumask); - /* Prepare to do the io_apic_write */ - apicid_value = apicid_value << 24; - spin_lock_irqsave(&ioapic_lock, flags); - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - irq_desc[irq].affinity = cpumask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#if defined(CONFIG_IRQBALANCE) -# include <asm/processor.h> /* kernel_thread() */ -# include <linux/kernel_stat.h> /* kstat */ -# include <linux/slab.h> /* kmalloc() */ -# include <linux/timer.h> - -#define IRQBALANCE_CHECK_ARCH -999 -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) - -static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -static int physical_balance __read_mostly; -static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; - -static struct irq_cpu_info { - unsigned long *last_irq; - unsigned long *irq_delta; - unsigned long irq; -} irq_cpu_data[NR_CPUS]; - -#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) - -#define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) - -#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) - -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) - -static cpumask_t balance_irq_affinity[NR_IRQS] = { - [0 ... NR_IRQS-1] = CPU_MASK_ALL -}; - -void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ - balance_irq_affinity[irq] = mask; -} - -static unsigned long move(int curr_cpu, cpumask_t allowed_mask, - unsigned long now, int direction) -{ - int search_idle = 1; - int cpu = curr_cpu; - - goto inside; - - do { - if (unlikely(cpu == curr_cpu)) - search_idle = 0; -inside: - if (direction == 1) { - cpu++; - if (cpu >= NR_CPUS) - cpu = 0; - } else { - cpu--; - if (cpu == -1) - cpu = NR_CPUS-1; - } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu, now))); - - return cpu; -} - -static inline void balance_irq(int cpu, int irq) -{ - unsigned long now = jiffies; - cpumask_t allowed_mask; - unsigned int new_cpu; - - if (irqbalance_disabled) - return; - - cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); - new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) - set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -} - -static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -{ - int i, j; - - for_each_online_cpu(i) { - for (j = 0; j < NR_IRQS; j++) { - if (!irq_desc[j].action) - continue; - /* Is it a significant load ? 
*/ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < - useful_load_threshold) - continue; - balance_irq(i, j); - } - } - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; -} - -static void do_irq_balance(void) -{ - int i, j; - unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); - unsigned long move_this_load = 0; - int max_loaded = 0, min_loaded = 0; - int load; - unsigned long useful_load_threshold = balanced_irq_interval + 10; - int selected_irq; - int tmp_loaded, first_attempt = 1; - unsigned long tmp_cpu_irq; - unsigned long imbalance = 0; - cpumask_t allowed_mask, target_cpu_mask, tmp; - - for_each_possible_cpu(i) { - int package_index; - CPU_IRQ(i) = 0; - if (!cpu_online(i)) - continue; - package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < NR_IRQS; j++) { - unsigned long value_now, delta; - /* Is this an active IRQ or balancing disabled ? */ - if (!irq_desc[j].action || irq_balancing_disabled(j)) - continue; - if (package_index == i) - IRQ_DELTA(package_index, j) = 0; - /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_cpu(i).irqs[j]; - - /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i, j); - - /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i, j) = value_now; - - /* Ignore IRQs whose rate is less than the clock */ - if (delta < useful_load_threshold) - continue; - /* update the load for the processor or package total */ - IRQ_DELTA(package_index, j) += delta; - - /* Keep track of the higher numbered sibling as well */ - if (i != package_index) - CPU_IRQ(i) += delta; - /* - * We have sibling A and sibling B in the package - * - * cpu_irq[A] = load for cpu A + load for cpu B - * cpu_irq[B] = load for cpu B - */ - CPU_IRQ(package_index) += delta; - } - } - /* Find the least loaded processor package */ - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (min_cpu_irq > CPU_IRQ(i)) { - min_cpu_irq = CPU_IRQ(i); - min_loaded = i; - } - } - max_cpu_irq = ULONG_MAX; - -tryanothercpu: - /* - * Look for heaviest loaded processor. - * We may come back to get the next heaviest loaded processor. - * Skip processors with trivial loads. - */ - tmp_cpu_irq = 0; - tmp_loaded = -1; - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (max_cpu_irq <= CPU_IRQ(i)) - continue; - if (tmp_cpu_irq < CPU_IRQ(i)) { - tmp_cpu_irq = CPU_IRQ(i); - tmp_loaded = i; - } - } - - if (tmp_loaded == -1) { - /* - * In the case of small number of heavy interrupt sources, - * loading some of the cpus too much. We use Ingo's original - * approach to rotate them around. - */ - if (!first_attempt && imbalance >= useful_load_threshold) { - rotate_irqs_among_cpus(useful_load_threshold); - return; - } - goto not_worth_the_effort; - } - - first_attempt = 0; /* heaviest search */ - max_cpu_irq = tmp_cpu_irq; /* load */ - max_loaded = tmp_loaded; /* processor */ - imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - /* - * if imbalance is less than approx 10% of max load, then - * observe diminishing returns action. - quit - */ - if (imbalance < (max_cpu_irq >> 3)) - goto not_worth_the_effort; - -tryanotherirq: - /* if we select an IRQ to move that can't go where we want, then - * see if there is another one to try. - */ - move_this_load = 0; - selected_irq = -1; - for (j = 0; j < NR_IRQS; j++) { - /* Is this an active IRQ? 
*/ - if (!irq_desc[j].action) - continue; - if (imbalance <= IRQ_DELTA(max_loaded, j)) - continue; - /* Try to find the IRQ that is closest to the imbalance - * without going over. - */ - if (move_this_load < IRQ_DELTA(max_loaded, j)) { - move_this_load = IRQ_DELTA(max_loaded, j); - selected_irq = j; - } - } - if (selected_irq == -1) - goto tryanothercpu; - - imbalance = move_this_load; - - /* For physical_balance case, we accumulated both load - * values in the one of the siblings cpu_irq[], - * to use the same code for physical and logical processors - * as much as possible. - * - * NOTE: the cpu_irq[] array holds the sum of the load for - * sibling A and sibling B in the slot for the lowest numbered - * sibling (A), _AND_ the load for sibling B in the slot for - * the higher numbered sibling. - * - * We seek the least loaded sibling by making the comparison - * (A+B)/2 vs B - */ - load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { - if (load > CPU_IRQ(j)) { - /* This won't change cpu_sibling_map[min_loaded] */ - load = CPU_IRQ(j); - min_loaded = j; - } - } - - cpus_and(allowed_mask, - cpu_online_map, - balance_irq_affinity[selected_irq]); - target_cpu_mask = cpumask_of_cpu(min_loaded); - cpus_and(tmp, target_cpu_mask, allowed_mask); - - if (!cpus_empty(tmp)) { - /* mark for change destination */ - set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); - - /* Since we made a change, come back sooner to - * check for more variation. - */ - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; - } - goto tryanotherirq; - -not_worth_the_effort: - /* - * if we did not find an IRQ to move, then adjust the time interval - * upward - */ - balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - return; -} - -static int balanced_irq(void *unused) -{ - int i; - unsigned long prev_balance_time = jiffies; - long time_remaining = balanced_irq_interval; - - /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < NR_IRQS ; i++) { - irq_desc[i].pending_mask = cpumask_of_cpu(0); - set_pending_irq(i, cpumask_of_cpu(0)); - } - - set_freezable(); - for ( ; ; ) { - time_remaining = schedule_timeout_interruptible(time_remaining); - try_to_freeze(); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { - preempt_disable(); - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; - preempt_enable(); - } - } - return 0; -} - -static int __init balanced_irq_init(void) -{ - int i; - struct cpuinfo_x86 *c; - cpumask_t tmp; - - cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; - /* When not overwritten by the command line ask subarchitecture. 
*/ - if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) - irqbalance_disabled = NO_BALANCE_IRQ; - if (irqbalance_disabled) - return 0; - - /* disable irqbalance completely if there is only one processor online */ - if (num_online_cpus() < 2) { - irqbalance_disabled = 1; - return 0; - } - /* - * Enable physical balance only if more than 1 physical processor - * is present - */ - if (smp_num_siblings > 1 && !cpus_empty(tmp)) - physical_balance = 1; - - for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { - printk(KERN_ERR "balanced_irq_init: out of memory"); - goto failed; - } - } - - printk(KERN_INFO "Starting balanced_irq\n"); - if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) - return 0; - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -failed: - for_each_possible_cpu(i) { - kfree(irq_cpu_data[i].irq_delta); - irq_cpu_data[i].irq_delta = NULL; - kfree(irq_cpu_data[i].last_irq); - irq_cpu_data[i].last_irq = NULL; - } - return 0; -} - -int __devinit irqbalance_disable(char *str) -{ - irqbalance_disabled = 1; - return 1; -} - -__setup("noirqbalance", irqbalance_disable); - -late_initcall(balanced_irq_init); -#endif /* CONFIG_IRQBALANCE */ -#endif /* CONFIG_SMP */ - -#ifndef CONFIG_SMP -void send_IPI_self(int vector) -{ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP */ - - -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; -int skip_ioapic_setup; - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - -/* - * Find the IRQ entry number of a certain pin. 
- */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for (apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " - "slot:%d, pin:%d.\n", bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} -#endif - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. 
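A note on the bit twiddling in IO_APIC_get_PCI_irq_vector() above: for PCI buses the MP-table mp_srcbusirq field packs the source as (device << 2) | pin, which is where the two masks come from:

/* Decode sketch for the PCI form of mp_srcbusirq (hypothetical helpers): */
static inline int mp_srcbusirq_dev(int srcbusirq)
{
	return (srcbusirq >> 2) & 0x1f;		/* PCI device number */
}

static inline int mp_srcbusirq_pin(int srcbusirq)
{
	return srcbusirq & 3;			/* 0=INTA# ... 3=INTD# */
}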
*/ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - polarity = test_bit(bus, mp_bus_not_pci)? - default_ISA_polarity(idx): - default_PCI_polarity(idx); - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { - case 0: /* conforms, ie. bus-type dependent */ - { - trigger = test_bit(bus, mp_bus_not_pci)? - default_ISA_trigger(idx): - default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! 
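The "conforms" defaults implemented by the macros above collapse to a small table:

/*
 *   bus	trigger			polarity
 *   ISA	0 (edge)		0 (active high)
 *   EISA	from the ELCR		0 (active high)
 *   PCI	1 (level)		1 (active low)
 *   MCA	1 (level)		0 (active high)
 */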
- */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) - irq = mp_irqs[idx].mp_srcbusirq; - else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - } - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - return irq; -} - -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; - -static int __assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset; - int vector, offset; - - BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); - - if (irq_vector[irq] > 0) - return irq_vector[irq]; - - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (vector == current_vector) - return -ENOSPC; - if (test_and_set_bit(vector, used_vectors)) - goto next; - - current_vector = vector; - current_offset = offset; - irq_vector[irq] = vector; - - return vector; -} - -static int assign_irq_vector(int irq) -{ - unsigned long flags; - int vector; - - spin_lock_irqsave(&vector_lock, flags); - vector = __assign_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); - - return vector; -} - -void setup_vector_irq(int cpu) -{ -} - -static struct irq_chip ioapic_chip; - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) -{ - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); - } - set_intr_gate(vector, interrupt[irq]); -} - -static void __init setup_IO_APIC_irqs(void) -{ - struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = - cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic, pin, mp_INT); - if (idx == -1) { - if 
(first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mp_apicid, - pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mp_apicid, pin); - continue; - } - - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - } - - irq = pin_2_irq(idx, apic, pin); - /* - * skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum - */ - if (multi_timer_check(apic, irq)) - continue; - else - add_pin_to_irq(irq, apic, pin); - - if (!apic && !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - ioapic_write_entry(apic, pin, entry); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - ioapic_register_intr(0, vector, IOAPIC_EDGE); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - -void __init print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... 
: PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -#if 0 - -static void print_APIC_bitfield(int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } -} - -void /*__init*/ print_local_APIC(void *dummy) -{ - unsigned int v, ver, maxlvt; - - if (apic_verbosity == APIC_QUIET) - return; - - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, - GET_APIC_ID(read_apic_id())); - v = apic_read(APIC_LVR); - printk(KERN_INFO "... APIC VERSION: %08x\n", v); - ver = GET_APIC_VERSION(v); - maxlvt = lapic_get_maxlvt(); - - v = apic_read(APIC_TASKPRI); - printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - } - - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); - v = apic_read(APIC_LDR); - printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... 
APIC DFR: %08x\n", v); - v = apic_read(APIC_SPIV); - printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); - - printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); - printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); - printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -void print_all_local_APICs(void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -void /*__init*/ print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b, 0xa0); - outb(0x0b, 0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a, 0xa0); - outb(0x0a, 0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -#endif /* 0 */ - -static void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i8259_apic, i8259_pin; - int i, apic; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - for (apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. 
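For reference, the port dance in print_PIC() above follows the i8259 read protocol: the data port (0x21/0xa1) always returns the IMR, while a read from the command port (0x20/0xa0) returns the IRR or the ISR depending on the last OCW3 written, 0x0a selecting the IRR (the default) and 0x0b the ISR. Master PIC sketch:

static unsigned int pic_master_read_isr(void)
{
	outb(0x0b, 0x20);	/* OCW3: next command-port read returns ISR */
	return inb(0x20);
}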
- */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the - * mptable a chance anyway. - */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(read_apic_id()); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -} - -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - return; -#endif - - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mp_apicid; - - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; - } - - /* - * Sanity check, is the ID really free? 
Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mp_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} - -int no_timer_check __initdata; - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - if (no_timer_check) - return 1; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - if (time_after(jiffies, t1 + 4)) - return 1; - - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Startup quirk: - * - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. 
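timer_irq_works() above leans on the wraparound-safe jiffies comparison from <linux/jiffies.h>; stripped of its type checking, the trick is a signed subtraction:

/* Roughly what time_after(a, b) expands to: true iff a is after b,
 * correct across jiffies wraparound because the difference is signed. */
#define time_after_sketch(a, b)		((long)(b) - (long)(a) < 0)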
- * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - * - * (We do this for level-triggered IRQs too - it cannot hurt.) - */ -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -static void ack_ioapic_irq(unsigned int irq) -{ - move_native_irq(irq); - ack_APIC_irq(); -} - -static void ack_ioapic_quirk_irq(unsigned int irq) -{ - unsigned long v; - int i; - - move_native_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_vector[irq]; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} - -static int ioapic_retrigger_irq(unsigned int irq) -{ - send_IPI_self(irq_vector[irq]); - - return 1; -} - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_ioapic_irq, - .eoi = ack_ioapic_quirk_irq, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; - - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else - /* Strange. Oh, well.. 
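The TMR lookup in ack_ioapic_quirk_irq() above, unpacked: the local APIC exposes 256 vector bits as eight 32-bit registers spaced 0x10 apart, so vector v lives at bit (v % 32) of register APIC_TMR + (v / 32) * 0x10, which is exactly what the (i & ~0x1f) >> 1 arithmetic computes:

/* Sketch: nonzero if the TMR says 'vector' was seen as level-triggered. */
static int apic_vector_in_tmr(int vector)
{
	unsigned long v = apic_read(APIC_TMR + ((vector & ~0x1f) >> 1));

	return v & (1 << (vector & 0x1f));
}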
*/ - irq_desc[irq].chip = &no_irq_chip; - } - } -} - -/* - * The local APIC irq-chip implementation: - */ - -static void ack_lapic_irq(unsigned int irq) -{ - ack_APIC_irq(); -} - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq, int vector) -{ - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); - set_intr_gate(vector, interrupt[irq]); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) { - WARN_ON_ONCE(1); - return; - } - apic = find_isa_irq_apic(8, mp_INT); - if (apic == -1) { - WARN_ON_ONCE(1); - return; - } - - entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. 
- */ -static inline void __init check_timer(void) -{ - int apic1, pin1, apic2, pin2; - int no_pin1 = 0; - int vector; - unsigned int ver; - unsigned long flags; - - local_irq_save(flags); - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. Also - * timer interrupts need to be acknowledged manually in - * the 8259A for the i82489DX when using the NMI - * watchdog as that APIC treats NMIs as level-triggered. - * The AEOI mode will finish them in the 8259A - * automatically. - */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, vector); - } - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - printk(KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - printk(KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - printk("\n..... (found pin %d) ...", pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - printk("works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... 
- */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - printk(" failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } - timer_ack = 0; - - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - - lapic_register_intr(0, vector); - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - printk(" works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - printk(" failed.\n"); - - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); - - init_8259A(0); - make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - printk(" works.\n"); - goto out; - } - printk(" failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option"); -out: - local_irq_restore(flags); -} - -/* - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ - int i; - - /* Reserve all the system vectors. */ - for (i = first_system_vector; i < NR_VECTORS; i++) - set_bit(i, used_vectors); - - enable_IO_APIC(); - - io_apic_irqs = ~PIC_IRQS; - - printk("ENABLING IO-APIC IRQs\n"); - - /* - * Set up IO-APIC IRQ routing. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); -} - -/* - * Called after all the initialization is done. 
If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - entry[i] = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device *dev; - int i, size, error = 0; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -int create_irq(void) -{ - /* Allocate an unused irq */ - int irq, new, vector = 0; - unsigned long flags; - - irq = -ENOSPC; - spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { - if (platform_legacy_irq(new)) - continue; - if (irq_vector[new] != 0) - continue; - vector = __assign_irq_vector(new); - if (likely(vector > 0)) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq >= 0) { - set_intr_gate(vector, interrupt[irq]); - dynamic_irq_init(irq); - } - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - - spin_lock_irqsave(&vector_lock, flags); - clear_bit(irq_vector[irq], used_vectors); - irq_vector[irq] = 0; - spin_unlock_irqrestore(&vector_lock, flags); -} - -/* - * MSI message composition - */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - int vector; - unsigned dest; - - vector = assign_irq_vector(irq); - if (vector >= 0) { - dest = cpu_mask_to_apicid(TARGET_CPUS); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - 
MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? -MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? -MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(vector); - } - return vector; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - int vector; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - vector = assign_irq_vector(irq); - if (vector < 0) - return; - - dest = cpu_mask_to_apicid(mask); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; -} -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_ioapic_irq, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - struct msi_msg msg; - int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, - "edge"); - - return 0; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#endif /* CONFIG_PCI_MSI */ - -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(mask, tmp, CPU_MASK_ALL); - - dest = cpu_mask_to_apicid(mask); - - target_ht_irq(irq, dest); - irq_desc[irq].affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_ioapic_irq, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - int vector; - - vector = assign_irq_vector(irq); - if (vector >= 0) { - struct ht_irq_msg msg; - unsigned dest; - cpumask_t tmp; - - cpus_clear(tmp); - cpu_set(vector >> 8, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(vector) | - ((INT_DEST_MODE == 0) ? 
- HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return vector; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -int __init io_apic_get_unique_id(int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); - return -1; - } - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - - -int __init io_apic_get_version(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - -int __init io_apic_get_redir_entries(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) -{ - struct IO_APIC_route_entry entry; - - if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. 
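(A note on the caller side of io_apic_set_pci_routing(): the ACPI GSI code decodes trigger and polarity from the MADT and passes them in as the two flag arguments. Roughly — this paraphrases the mp_register_gsi() call site rather than quoting it:

        io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
                        triggering == ACPI_LEVEL_SENSITIVE,     /* edge_level: 1 = level */
                        polarity == ACPI_ACTIVE_LOW);           /* active_high_low: 1 = active low */

so a standard PCI interrupt arrives here as level-triggered, active-low.)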
- * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. - */ - - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " - "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - ioapic_write_entry(ioapic, pin, entry); - - return 0; -} - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -static int __init parse_disable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", parse_disable_timer_pin_1); - -static int __init parse_enable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = -1; - return 0; -} -early_param("enable_timer_pin_1", parse_enable_timer_pin_1); - -static int __init parse_noapic(char *arg) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } - } else { -fake_ioapic_page: - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - } -} - diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 5921e5f0a640..720d2607aacb 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30BF") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, { } }; @@ -103,6 +111,9 @@ void __init io_delay_init(void) static int __init io_delay_param(char *s) { + if (!s) + return -EINVAL; + if (!strcmp(s, "0x80")) io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; else if (!strcmp(s, "0xed")) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 50e5e4a31c85..191914302744 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/thread_info.h> #include <linux/syscalls.h> +#include 
<asm/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 9d98cda39ad9..f1c688e46f35 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -20,6 +20,8 @@ #ifdef CONFIG_X86_32 #include <mach_apic.h> +#include <mach_ipi.h> + /* * the following functions deal with sending IPIs between CPUs. * @@ -70,7 +72,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } void send_IPI_self(int vector) @@ -98,7 +100,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) * prepare target chip field */ cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); + apic_write(APIC_ICR2, cfg); /* * program the ICR @@ -108,7 +110,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } /* @@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) } /* must come after the send_IPI functions above for inlining */ -#include <mach_ipi.h> static int convert_apicid_to_cpu(int apic_id) { int i; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 000000000000..d1d4dc52f649 --- /dev/null +++ b/arch/x86/kernel/irq.c @@ -0,0 +1,189 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include <linux/cpu.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/seq_file.h> + +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/smp.h> + +atomic_t irq_err_count; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. 
+ * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +#endif +} + +#ifdef CONFIG_X86_32 +# define irq_stats(x) (&per_cpu(irq_stat, x)) +#else +# define irq_stats(x) cpu_pda(x) +#endif +/* + * /proc/interrupts printing: + */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +# ifdef CONFIG_X86_64 + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +# endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return show_other_interrupts(p); + + /* print header */ + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); +#endif + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); +#endif + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; + sum += irq_stats(cpu)->irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += irq_stats(cpu)->irq_thermal_count; +# ifdef CONFIG_X86_64 + sum += irq_stats(cpu)->irq_threshold_count; 
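(Together with arch_irq_stat() just below, this is the hook pair the generic /proc/stat code consumes: the per-CPU hook is summed over all online CPUs and the global hook added once. A paraphrase of the consumer side, assuming only the signatures shown here:

        u64 total;
        int cpu;

        total = arch_irq_stat();                /* ERR, plus MIS with an IO-APIC */
        for_each_online_cpu(cpu)
                total += arch_irq_stat_cpu(cpu);        /* NMI, LOC, RES, CAL, TLB, ... */

which keeps the architecture-specific counters out of fs/proc.)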
+#endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 47a6f6f12478..a51382672de0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); - -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK - */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ static int check_stack_overflow(void) @@ -83,11 +60,8 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { @@ -226,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_desc + irq; + int overflow; + unsigned vector = ~regs->orig_ax; + struct irq_desc *desc; + unsigned irq; - if (unlikely((unsigned)irq >= NR_IRQS)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d\n", - __func__, irq); - BUG(); - } old_regs = set_irq_regs(regs); irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; overflow = check_stack_overflow(); + desc = irq_to_desc(irq); + if (unlikely(!desc)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", + __func__, irq, vector, smp_processor_id()); + BUG(); + } + if (!execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); @@ -251,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs) return 1; } -/* - * Interrupt statistics: - */ - -atomic_t irq_err_count; - -/* - * /proc/interrupts printing: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - unsigned any_count = 0; - - spin_lock_irqsave(&irq_desc[i].lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; -#endif - action = irq_desc[i].action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ",i); 
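(The substantive change in this file is in do_IRQ() above: the entry stub still pushes ~vector, but the vector is no longer used as the IRQ number directly — it is translated through the per-CPU vector_irq[] table, matching the 64-bit scheme. The new fast path in sketch form, error path trimmed:

        unsigned vector = ~regs->orig_ax;       /* entry stub stored ~vector */
        unsigned irq = __get_cpu_var(vector_irq)[vector];
        struct irq_desc *desc = irq_to_desc(irq);

        if (unlikely(!desc))
                BUG();          /* vector not bound to an irq on this CPU */
        /* ...then the existing irq-stack switch and handler dispatch */

The show_interrupts()/statistics code being deleted below now lives in the shared irq.c added earlier in this patch.)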
-#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); -#endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", nmi_count(j)); - seq_printf(p, " Non-maskable interrupts\n"); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#endif -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif - } - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = nmi_count(cpu); - -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif -#ifdef CONFIG_SMP - sum += per_cpu(irq_stat, cpu).irq_resched_count; - sum += per_cpu(irq_stat, cpu).irq_call_count; - sum += per_cpu(irq_stat, cpu).irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += per_cpu(irq_stat, cpu).irq_thermal_count; -#endif -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).irq_spurious_count; -#endif - return sum; -} - -u64 arch_irq_stat(void) -{ - u64 sum = atomic_read(&irq_err_count); - -#ifdef CONFIG_X86_IO_APIC - sum += atomic_read(&irq_mis_count); -#endif - return sum; -} - #ifdef CONFIG_HOTPLUG_CPU #include <mach_apic.h> @@ -398,20 +237,22 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < NR_IRQS; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; + if (irq == 2) continue; - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); - else if (irq_desc[irq].action && !(warned++)) + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); + else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1f78b238d8d2..60eb84eb77a0 100644 --- a/arch/x86/kernel/irq_64.c +++ 
b/arch/x86/kernel/irq_64.c @@ -18,28 +18,6 @@ #include <asm/idle.h> #include <asm/smp.h> -atomic_t irq_err_count; - -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK - */ - if (!disable_apic) - ack_APIC_irq(); -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: @@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs) #endif /* - * Generic, controller-independent functions: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - unsigned any_count = 0; - - spin_lock_irqsave(&irq_desc[i].lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; -#endif - action = irq_desc[i].action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); -#endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); - seq_printf(p, "THR: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); -#endif - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); - } - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = cpu_pda(cpu)->__nmi_count; - - sum += 
cpu_pda(cpu)->apic_timer_irqs; -#ifdef CONFIG_SMP - sum += cpu_pda(cpu)->irq_resched_count; - sum += cpu_pda(cpu)->irq_call_count; - sum += cpu_pda(cpu)->irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += cpu_pda(cpu)->irq_thermal_count; - sum += cpu_pda(cpu)->irq_threshold_count; -#endif - sum += cpu_pda(cpu)->irq_spurious_count; - return sum; -} - -u64 arch_irq_stat(void) -{ - return atomic_read(&irq_err_count); -} - -/* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). @@ -188,6 +50,7 @@ u64 arch_irq_stat(void) asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; @@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(irq < NR_IRQS)) - generic_handle_irq(irq); + desc = irq_to_desc(irq); + if (likely(desc)) + generic_handle_irq_desc(irq, desc); else { if (!disable_apic) ack_APIC_irq(); @@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < NR_IRQS; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; int break_affinity = 0; int set_affinity = 1; @@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map) continue; /* interrupt's are disabled at this point */ - spin_lock(&irq_desc[irq].lock); + spin_lock(&desc->lock); if (!irq_has_action(irq) || - cpus_equal(irq_desc[irq].affinity, map)) { - spin_unlock(&irq_desc[irq].lock); + cpus_equal(desc->affinity, map)) { + spin_unlock(&desc->lock); continue; } - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, desc->affinity, map); if (cpus_empty(mask)) { break_affinity = 1; mask = map; } - if (irq_desc[irq].chip->mask) - irq_desc[irq].chip->mask(irq); + if (desc->chip->mask) + desc->chip->mask(irq); - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); else if (!(warned++)) set_affinity = 0; - if (irq_desc[irq].chip->unmask) - irq_desc[irq].chip->unmask(irq); + if (desc->chip->unmask) + desc->chip->unmask(irq); - spin_unlock(&irq_desc[irq].lock); + spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index d66914287ee1..845aa9803e80 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -69,11 +69,48 @@ void __init init_ISA_irqs (void) * 16 old-style INTA-cycle interrupts: */ for (i = 0; i < 16; i++) { + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); } } +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; + +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... 
IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -89,15 +126,50 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i]); } + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif + + if (!acpi_ioapic) + setup_irq(2, &irq2); + /* setup after call gates are initialised (usually add in * the architecture specific gates) */ diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 0373e88de95a..ff0235391285 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -43,10 +43,11 @@ #define BUILD_IRQ(nr) \ asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ + asm("\n.text\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ "push $~(" #nr ") ; " \ - "jmp common_interrupt"); + "jmp common_interrupt\n" \ + ".previous"); #define BI(x,y) \ BUILD_IRQ(x##y) @@ -134,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... 
NR_VECTORS - 1] = -1 }; -static void __init init_ISA_irqs (void) +void __init init_ISA_irqs(void) { int i; init_bsp_APIC(); init_8259A(0); - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; + for (i = 0; i < 16; i++) { + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc(i); - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } } } void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -void __init native_init_IRQ(void) +static void __init smp_intr_init(void) { - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - #ifdef CONFIG_SMP /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -206,6 +189,12 @@ void __init native_init_IRQ(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); @@ -215,6 +204,25 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +} + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. 
(some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 7377ccb21335..304d8bad6559 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges); static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) }, {} }; EXPORT_SYMBOL(k8_nb_ids); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index c03205991718..ff7d3b0124f1 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -12,9 +12,13 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> +#include <linux/module.h> #include <asm/setup.h> +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + #ifdef CONFIG_DEBUG_BOOT_PARAMS struct setup_data_node { u64 paddr; @@ -135,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) if (PageHighMem(pg)) { data = ioremap_cache(pa_data, sizeof(*data)); if (!data) { + kfree(node); error = -ENXIO; goto err_dir; } @@ -209,6 +214,10 @@ static int __init arch_kdebugfs_init(void) { int error = 0; + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + if (!arch_debugfs_dir) + return -ENOMEM; + #ifdef CONFIG_DEBUG_BOOT_PARAMS error = boot_params_kdebugfs_init(); #endif diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f47f0eb886b8..10435a120d22 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -69,6 +69,9 @@ static int gdb_x86vector = -1; */ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = regs->ax; gdb_regs[GDB_BX] = regs->bx; gdb_regs[GDB_CX] = regs->cx; @@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SI] = regs->si; gdb_regs[GDB_DI] = regs->di; gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_PC] = regs->ip; #ifdef CONFIG_X86_32 + gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; @@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_R13] = regs->r13; gdb_regs[GDB_R14] = regs->r14; gdb_regs[GDB_R15] = regs->r15; + gdb_regs32[GDB_PS] = regs->flags; + gdb_regs32[GDB_CS] = regs->cs; + gdb_regs32[GDB_SS] = regs->ss; #endif gdb_regs[GDB_SP] = regs->sp; } @@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) */ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = 0; gdb_regs[GDB_BX] = 0; gdb_regs[GDB_CX] = 0; @@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); - gdb_regs[GDB_PC] = 0; + 
gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) */ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif regs->ax = gdb_regs[GDB_AX]; regs->bx = gdb_regs[GDB_BX]; regs->cx = gdb_regs[GDB_CX]; @@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->si = gdb_regs[GDB_SI]; regs->di = gdb_regs[GDB_DI]; regs->bp = gdb_regs[GDB_BP]; - regs->flags = gdb_regs[GDB_PS]; regs->ip = gdb_regs[GDB_PC]; #ifdef CONFIG_X86_32 + regs->flags = gdb_regs[GDB_PS]; regs->ds = gdb_regs[GDB_DS]; regs->es = gdb_regs[GDB_ES]; regs->cs = gdb_regs[GDB_CS]; @@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->r13 = gdb_regs[GDB_R13]; regs->r14 = gdb_regs[GDB_R14]; regs->r15 = gdb_regs[GDB_R15]; + regs->flags = gdb_regs32[GDB_PS]; + regs->cs = gdb_regs32[GDB_CS]; + regs->ss = gdb_regs32[GDB_SS]; #endif } @@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; kgdb_single_step = 1; - if (kgdb_contthread) { - atomic_set(&kgdb_cpu_doing_single_step, - raw_smp_processor_id()); - } + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); } get_debugreg(dr6, 6); @@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_DONE; case DIE_NMI_IPI: - if (atomic_read(&kgdb_active) != -1) { - /* KGDB CPU roundup */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - was_in_debug_nmi[raw_smp_processor_id()] = 1; - touch_nmi_watchdog(); - } + /* Just ignore, we will handle the roundup on DIE_NMI. 
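(On the u32 *gdb_regs32 casts added throughout this file: in gdb's x86-64 remote register layout the general registers are 64-bit slots, but eflags, cs and ss are 32-bit slots, so the same buffer is written through two differently sized views. Reduced to the pattern used in the hunks above:

        extern unsigned long gdb_regs[];        /* 64-bit view of the buffer */
        u32 *gdb_regs32 = (u32 *)gdb_regs;      /* 32-bit view, same memory */

        gdb_regs[GDB_PC] = regs->ip;            /* full 64-bit value */
        gdb_regs32[GDB_PS] = regs->flags;       /* 32-bit slots */
        gdb_regs32[GDB_CS] = regs->cs;
        gdb_regs32[GDB_SS] = regs->ss;

Previously flags landed in a 64-bit slot and cs/ss were not reported at all on 64-bit.)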
*/ return NOTIFY_DONE; case DIE_NMIUNKNOWN: @@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id() && - user_mode(regs)) - return single_step_cont(regs, args); + raw_smp_processor_id()) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; /* fall through */ default: if (user_mode(regs)) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b8c6743a13da..6c27679ec6aa 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->ip = (unsigned long)p->ainsn.insn; } -/* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; @@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); + kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); @@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) resume_execution(cur, regs, kcb); regs->flags |= kcb->kprobe_saved_flags; - trace_hardirqs_fixup_flags(regs->flags); if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8b7a3cf37d2b..478bca986eca 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -178,7 +178,7 @@ static void kvm_flush_tlb(void) kvm_deferred_mmu_op(&ftlb, sizeof ftlb); } -static void kvm_release_pt(u32 pfn) +static void kvm_release_pt(unsigned long pfn) { struct kvm_mmu_op_release_pt rpt = { .header.op = KVM_MMU_OP_RELEASE_PT, diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 87edf1ceb1df..774ac4991568 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) return ret; } +/* + * If we don't do that, there is the possibility that the guest + * will calibrate under heavy load - thus, getting a lower lpj - + * and execute the delays themselves without load. This is wrong, + * because no delay loop can finish beforehand. + * Any heuristics is subject to fail, because ultimately, a large + * poll of guests can be running and trouble each other. 
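(The fix this comment is building up to: preset lpj from the paravirt clock itself, so calibration never depends on a timed loop that host load could skew. Concretely, with a TSC reported at 2,000,000 kHz (2 GHz) and HZ=250 — illustrative figures, the code below is the source of truth:

        khz = pvclock_tsc_khz(src);     /* 2,000,000 */
        lpj = (u64)khz * 1000;          /* 2,000,000,000 */
        do_div(lpj, HZ);                /* /250 -> preset_lpj = 8,000,000 */

and udelay() ends up calibrated against the host clock.)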
So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + return preset_lpj; +} + +static void kvm_get_preset_lpj(void) +{ + struct pvclock_vcpu_time_info *src; + unsigned long khz; + u64 lpj; + + src = &per_cpu(hv_clock, 0); + khz = pvclock_tsc_khz(src); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -113,7 +141,7 @@ static void kvm_setup_secondary_clock(void) #endif #ifdef CONFIG_SMP -void __init kvm_smp_prepare_boot_cpu(void) +static void __init kvm_smp_prepare_boot_cpu(void) { WARN_ON(kvm_register_clock("primary cpu clock")); native_smp_prepare_boot_cpu(); @@ -153,6 +181,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; + pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; #endif @@ -163,6 +192,7 @@ void __init kvmclock_init(void) #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; #endif + kvm_get_preset_lpj(); clocksource_register(&kvm_clock); } } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a8449571858a..eee32b43fee3 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -18,6 +18,7 @@ #include <asm/ldt.h> #include <asm/desc.h> #include <asm/mmu_context.h> +#include <asm/syscalls.h> #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) @@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, (mincount - oldsize) * LDT_ENTRY_SIZE); + paravirt_alloc_ldt(newldt, mincount); + #ifdef CONFIG_X86_64 /* CHECKME: Do we really need this ? 
*/ wmb(); @@ -62,12 +65,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP - cpumask_t mask; - preempt_disable(); load_LDT(pc); - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + if (!cpus_equal(current->mm->cpu_vm_mask, + cpumask_of_cpu(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else @@ -75,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { + paravirt_free_ldt(oldldt, oldsize); if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else @@ -86,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + int i; if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + + for(i = 0; i < old->size; i++) + write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } @@ -126,6 +131,7 @@ void destroy_context(struct mm_struct *mm) if (mm == current->active_mm) clear_LDT(); #endif + paravirt_free_ldt(mm->context.ldt, mm->context.size); if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8864230d55af..0732adba05ca 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/numa.h> #include <linux/ftrace.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -22,6 +23,7 @@ #include <asm/cpufeature.h> #include <asm/desc.h> #include <asm/system.h> +#include <asm/cacheflush.h> #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) static u32 kexec_pgd[1024] PAGE_ALIGNED; @@ -77,7 +79,7 @@ static void load_segments(void) /* * A architecture hook called to validate the * proposed image and prepare the control pages - * as needed. The pages for KEXEC_CONTROL_CODE_SIZE + * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE * have been allocated, but the segments have yet * been copied into the kernel. * @@ -85,10 +87,12 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Currently nothing. + * Make control page executable. */ int machine_kexec_prepare(struct kimage *image) { + if (nx_enabled) + set_pages_x(image->control_code_page, 1); return 0; } @@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { + if (nx_enabled) + set_pages_nx(image->control_code_page, 1); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. 
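(The rework below turns machine_kexec() into something that can return: with CONFIG_KEXEC_JUMP and image->preserve_context set, control may come back from the relocated code. The round trip reduced to its spine, all names from the hunk below:

        save_processor_state();                 /* preserve_context only */
        image->start = relocate_kernel_ptr((unsigned long)image->head,
                                           (unsigned long)page_list,
                                           image->start, cpu_has_pae,
                                           image->preserve_context);
        restore_processor_state();              /* back in the original kernel */

which is also why the NORET_TYPE annotation is dropped here and in the 64-bit variant further down.)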
*/ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif - tracer_disable(); + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page); - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; #ifdef CONFIG_X86_PAE @@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[VA_PTE_0] = (unsigned long)kexec_pte0; page_list[PA_PTE_1] = __pa(kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae); + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, cpu_has_pae, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9dd9262693a3..c43caa3a91f3 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. 
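(Further down, the MFGPT rework replaces the hard-coded IRQ 7 with negotiation against the PIC zero-select register: each (timer, comparator) pair owns a 4-bit IRQ field in MSR_PIC_ZSEL_LOW, located by the shift formula in the hunk. A worked example, assuming timer 6 with comparator CMP2:

        shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
                /* = (4 + 6 % 4) * 4 = (4 + 2) * 4 = 24 */
        irq = (zsel >> shift) & 0xF;    /* IRQ nibble sits in bits 27:24 */

and a value of 2 in that nibble means VSA already owns the line, which is why the new code refuses to touch it.)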
*/ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 07c0f828f488..3b599518c322 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -33,6 +33,8 @@ #include <linux/module.h> #include <asm/geode.h> +#define MFGPT_DEFAULT_IRQ 7 + static struct mfgpt_timer_t { unsigned int avail:1; } mfgpt_timers[MFGPT_MAX_TIMERS]; @@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) } EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); -int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) +int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable) { - u32 val, dummy; - int offset; + u32 zsel, lpc, dummy; + int shift; if (timer < 0 || timer >= MFGPT_MAX_TIMERS) return -EIO; - if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) + /* + * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA + * is using the same CMP of the timer's Siamese twin, the IRQ is set to + * 2, and we mustn't use nor change it. + * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the + * IRQ of the 1st. This can only happen if forcing an IRQ, calling this + * with *irq==0 is safe. Currently there _are_ no 2 drivers. + */ + rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); + shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4; + if (((zsel >> shift) & 0xF) == 2) return -EIO; - rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); + /* Choose IRQ: if none supplied, keep IRQ already set or use default */ + if (!*irq) + *irq = (zsel >> shift) & 0xF; + if (!*irq) + *irq = MFGPT_DEFAULT_IRQ; - offset = (timer % 4) * 4; - - val &= ~((0xF << offset) | (0xF << (offset + 16))); + /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */ + if (*irq < 1 || *irq == 2 || *irq > 15) + return -EIO; + rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy); + if (lpc & (1 << *irq)) + return -EIO; + /* All chosen and checked - go for it */ + if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) + return -EIO; if (enable) { - val |= (irq & 0x0F) << (offset); - val |= (irq & 0x0F) << (offset + 16); + zsel = (zsel & ~(0xF << shift)) | (*irq << shift); + wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); } - wrmsr(MSR_PIC_ZSEL_LOW, val, dummy); return 0; } @@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; static u16 mfgpt_event_clock; -static int irq = 7; +static int irq; static int __init mfgpt_setup(char *str) { get_option(&str, &irq); @@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void) mfgpt_event_clock = timer; /* Set up the IRQ on the MFGPT side */ - if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { + if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) { printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); return -EIO; } @@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void) &mfgpt_clockevent); printk(KERN_INFO - "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); + "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n", + timer, irq); clockevents_register_device(&mfgpt_clockevent); return 0; err: - geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); + geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq); printk(KERN_ERR "mfgpt-timer: Unable to set up the MFGPT clock source\n"); return -EIO; diff --git a/arch/x86/kernel/microcode.c 
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c deleted file mode 100644 index 56b933119a04..000000000000 --- a/arch/x86/kernel/microcode.c +++ /dev/null @@ -1,851 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> - * 2006 Shaohua Li <shaohua.li@intel.com> - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/design/pentium4/manuals/253668.htm - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. - * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and - * Tigran Aivazian <tigran@veritas.com> - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and - * Tigran Aivazian <tigran@veritas.com>, - * Serialize updates as required on HT processors due to speculative - * nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, - * Jun Nakajima <jun.nakajima@intel.com> - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. - */ - -//#define DEBUG /* pr_debug */ -#include <linux/capability.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/cpumask.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/miscdevice.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/mutex.h> -#include <linux/cpu.h> -#include <linux/firmware.h> -#include <linux/platform_device.h> - -#include <asm/msr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> - -MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "1.14a" - -#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ -#define DWSIZE (sizeof (u32)) -#define get_totalsize(mc) \ - (((microcode_t *)mc)->hdr.totalsize ? \ - ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) -#define get_datasize(mc) \ - (((microcode_t *)mc)->hdr.datasize ? \ - ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); - -static struct ucode_cpu_info { - int valid; - unsigned int sig; - unsigned int pf; - unsigned int rev; - microcode_t *mc; -} ucode_cpu_info[NR_CPUS]; - -static void collect_cpu_info(int cpu_num) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - unsigned int val[2]; - - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu_num); - uci->pf = uci->rev = 0; - uci->mc = NULL; - uci->valid = 1; - - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); - uci->valid = 0; - return; - } - - uci->sig = cpuid_eax(0x00000001); - - if ((c->x86_model >= 5) || (c->x86 > 6)) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - uci->pf = 1 << ((val[1] >> 18) & 7); - } - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. 
Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - uci->sig, uci->pf, uci->rev); -} - -static inline int microcode_update_match(int cpu_num, - microcode_header_t *mc_header, int sig, int pf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - if (!sigmatch(sig, uci->sig, pf, uci->pf) - || mc_header->rev <= uci->rev) - return 0; - return 1; -} - -static int microcode_sanity_check(void *mc) -{ - microcode_header_t *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - struct extended_signature *ext_sig; - unsigned long total_size, data_size, ext_table_size; - int sum, orig_sum, ext_sigcount = 0, i; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); - return -EINVAL; - } - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); - return -EINVAL; - } - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); - return -EFAULT; - } - ext_sigcount = ext_header->count; - } - - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; - - i = ext_table_size / DWSIZE; - while (i--) - ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); - return -EINVAL; - } - } - - /* calculate the checksum */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) - orig_sum += ((int *)mc)[i]; - if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); - return -EINVAL; - } - if (!ext_table_size) - return 0; - /* check extended signature checksum */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); - return -EINVAL; - } - } - return 0; -} - -/* - * return 0 - no update found - * return 1 - found update - * return < 0 - error - */ -static int get_maching_microcode(void *mc, int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - microcode_header_t *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - void *new_mc; - - if (microcode_update_match(cpu, mc_header, - mc_header->sig, mc_header->pf)) - goto find; - - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) - return 0; - - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - for (i = 0; i < ext_sigcount; i++) { - if 
(microcode_update_match(cpu, mc_header, - ext_sig->sig, ext_sig->pf)) - goto find; - ext_sig++; - } - return 0; -find: - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); - new_mc = vmalloc(total_size); - if (!new_mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - - /* free previous update file */ - vfree(uci->mc); - - memcpy(new_mc, mc, total_size); - uci->mc = new_mc; - return 1; -} - -static void apply_microcode(int cpu) -{ - unsigned long flags; - unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); - - if (uci->mc == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(&microcode_update_lock, flags); - - /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc->bits, - (unsigned long) uci->mc->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - spin_unlock_irqrestore(&microcode_update_lock, flags); - if (val[1] != uci->mc->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); - return; - } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc->hdr.date); - uci->rev = val[1]; -} - -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ - -static long get_next_ucode(void **mc, long offset) -{ - microcode_header_t mc_header; - unsigned long total_size; - - /* No more data */ - if (offset >= user_buffer_size) - return 0; - if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - return -EFAULT; - } - total_size = get_totalsize(&mc_header); - if (offset + total_size > user_buffer_size) { - printk(KERN_ERR "microcode: error! Bad total size in microcode " - "data file\n"); - return -EINVAL; - } - *mc = vmalloc(total_size); - if (!*mc) - return -ENOMEM; - if (copy_from_user(*mc, user_buffer + offset, total_size)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(*mc); - return -EFAULT; - } - return offset + total_size; -} - -static int do_microcode_update (void) -{ - long cursor = 0; - int error = 0; - void *new_mc = NULL; - int cpu; - cpumask_t old; - - old = current->cpus_allowed; - - while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_sanity_check(new_mc); - if (error) - goto out; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->valid) - continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = get_maching_microcode(new_mc, cpu); - if (error < 0) - goto out; - if (error == 1) - apply_microcode(cpu); - } - vfree(new_mc); - } -out: - if (cursor > 0) - vfree(new_mc); - if (cursor < 0) - error = cursor; - set_cpus_allowed_ptr(current, &old); - return error; -} - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - cycle_kernel_lock(); - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -{ - ssize_t ret; - - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); - return -EINVAL; - } - - get_online_cpus(); - mutex_lock(&microcode_mutex); - - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); - if (!ret) - ret = (ssize_t)len; - - mutex_unlock(&microcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = &microcode_fops, -}; - -static int __init microcode_dev_init (void) -{ - int error; - - error = misc_register(&microcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void microcode_dev_exit (void) -{ - misc_deregister(&microcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) -#endif - -static long get_next_ucode_from_buffer(void **mc, const u8 *buf, - unsigned long size, long offset) -{ - microcode_header_t *mc_header; - unsigned long total_size; - - /* No more data */ - if (offset >= size) - return 0; - mc_header = (microcode_header_t *)(buf + offset); - total_size = get_totalsize(mc_header); - - if (offset + total_size > size) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - return -EINVAL; - } - - *mc = vmalloc(total_size); - if (!*mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - memcpy(*mc, buf + offset, total_size); - return offset + total_size; -} - -/* fake device for request_firmware */ -static struct platform_device *microcode_pdev; - -static int cpu_request_microcode(int cpu) -{ - char name[30]; - struct cpuinfo_x86 *c = &cpu_data(cpu); - const struct firmware *firmware; - const u8 *buf; - unsigned long size; - long offset = 0; - int error; - void *mc; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - sprintf(name,"intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); - error = request_firmware(&firmware, name, &microcode_pdev->dev); - if (error) { - pr_debug("microcode: data file %s load failed\n", name); - return error; - } - buf = firmware->data; - size = firmware->size; - while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) - > 0) { - error = microcode_sanity_check(mc); - if (error) - break; - error = get_maching_microcode(mc, cpu); - if (error < 0) - break; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - if (error == 1) { - apply_microcode(cpu); - error = 0; - } - vfree(mc); - } - if (offset > 0) - vfree(mc); - if (offset < 0) - error = offset; - release_firmware(firmware); - - return error; -} - -static int apply_microcode_check_cpu(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - cpumask_t old; - unsigned int val[2]; - int err = 0; - - /* Check if the microcode is available */ - if (!uci->mc) - return 0; - - old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - /* Check if the microcode we have in memory matches the CPU */ - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) - err = -EINVAL; - - if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - if (uci->pf != (1 << ((val[1] >> 18) & 7))) - err = -EINVAL; - } - - if (!err) { - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (uci->rev != val[1]) - err = -EINVAL; - } - - if (!err) - apply_microcode(cpu); - else - printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" - " sig=0x%x, pf=0x%x, rev=0x%x\n", - cpu, uci->sig, uci->pf, uci->rev); - - set_cpus_allowed_ptr(current, &old); - return err; -} - -static void microcode_init_cpu(int cpu, int resume) -{ - cpumask_t old; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - mutex_lock(&microcode_mutex); - collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - cpu_request_microcode(cpu); - mutex_unlock(&microcode_mutex); - set_cpus_allowed_ptr(current, &old); -} - -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - mutex_lock(&microcode_mutex); - uci->valid = 0; - vfree(uci->mc); - uci->mc = NULL; - mutex_unlock(&microcode_mutex); -} - -static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; - int cpu = dev->id; - - if (end == buf) - return -EINVAL; - if (val == 1) { - cpumask_t old; - - old = current->cpus_allowed; - - get_online_cpus(); - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - mutex_lock(&microcode_mutex); - if (uci->valid) - err = cpu_request_microcode(cpu); - mutex_unlock(&microcode_mutex); - put_online_cpus(); - set_cpus_allowed_ptr(current, &old); - } - if (err) - return err; - return sz; -} - -static ssize_t version_show(struct sys_device *dev, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->rev); -} - -static ssize_t pf_show(struct sys_device *dev, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->pf); -} - -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); - -static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, - NULL -}; - -static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", -}; - -static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) -{ - int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); - - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); - if (err) - return err; - - microcode_init_cpu(cpu, resume); - - return 0; -} - -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - return __mc_sysdev_add(sys_dev, 0); -} - -static int mc_sysdev_remove(struct sys_device *sys_dev) -{ - int cpu = sys_dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - return 0; -} - -static int mc_sysdev_resume(struct sys_device *dev) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - pr_debug("microcode: CPU%d resumed\n", cpu); - /* only CPU 0 will apply ucode here */ - apply_microcode(0); - return 0; -} - -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, -}; - -static __cpuinit int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; - - sys_dev = get_cpu_sysdev(cpu); - switch (action) { - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - mc_sysdev_add(sys_dev); - break; - case CPU_ONLINE_FROZEN: - /* System-wide resume is in progress, try to apply microcode */ - if (apply_microcode_check_cpu(cpu)) { - /* The application of microcode failed */ - microcode_fini_cpu(cpu); - __mc_sysdev_add(sys_dev, 1); - break; - } - case CPU_DOWN_FAILED_FROZEN: - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); - break; - case CPU_DOWN_PREPARE: - mc_sysdev_remove(sys_dev); - break; - case CPU_DOWN_PREPARE_FROZEN: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - -static int __init microcode_init (void) -{ - int error; - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); - - error = microcode_dev_init(); - if (error) - return error; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); - return PTR_ERR(microcode_pdev); - } - - get_online_cpus(); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - if (error) { - microcode_dev_exit(); - platform_device_unregister(microcode_pdev); - return error; - } - - register_hotcpu_notifier(&mc_cpu_notifier); - return 0; -} - -static void __exit microcode_exit (void) -{ - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - - get_online_cpus(); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); -} - -module_init(microcode_init) -module_exit(microcode_exit) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c new file mode 100644 index 000000000000..7a1f8eeac2c7 --- /dev/null +++ b/arch/x86/kernel/microcode_amd.c @@ -0,0 +1,435 @@ +/* + * AMD CPU Microcode Update Driver for Linux + * Copyright (C) 2008 Advanced Micro Devices Inc. + * + * Author: Peter Oruba <peter.oruba@amd.com> + * + * Based on work by: + * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * + * This driver allows to upgrade microcode on AMD + * family 0x10 and 0x11 processors. + * + * Licensed under the terms of the GNU General Public + * License version 2. See file COPYING for details.
+*/ + +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/firmware.h> +#include <linux/platform_device.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/microcode.h> + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>"); +MODULE_LICENSE("GPL v2"); + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +struct equiv_cpu_entry { + unsigned int installed_cpu; + unsigned int fixed_errata_mask; + unsigned int fixed_errata_compare; + unsigned int equiv_cpu; +}; + +struct microcode_header_amd { + unsigned int data_code; + unsigned int patch_id; + unsigned char mc_patch_data_id[2]; + unsigned char mc_patch_data_len; + unsigned char init_flag; + unsigned int mc_patch_data_checksum; + unsigned int nb_dev_id; + unsigned int sb_dev_id; + unsigned char processor_rev_id[2]; + unsigned char nb_rev_id; + unsigned char sb_rev_id; + unsigned char bios_api_rev; + unsigned char reserved1[3]; + unsigned int match_reg[8]; +}; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[0]; +}; + +#define UCODE_MAX_SIZE (2048) +#define DEFAULT_UCODE_DATASIZE (896) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define DWSIZE (sizeof(u32)) +/* For now we support a fixed ucode total size only */ +#define get_totalsize(mc) \ + ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ + + MC_HEADER_SIZE) + +/* serialize access to the physical write */ +static DEFINE_SPINLOCK(microcode_update_lock); + +static struct equiv_cpu_entry *equiv_cpu_table; + +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + memset(csig, 0, sizeof(*csig)); + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", + cpu); + return -1; + } + + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (csig->rev) + : "i" (0x0000008B) : "ecx"); + + printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", + csig->rev); + + return 0; +} + +static int get_matching_microcode(int cpu, void *mc, int rev) +{ + struct microcode_header_amd *mc_header = mc; + struct pci_dev *nb_pci_dev, *sb_pci_dev; + unsigned int current_cpu_id; + unsigned int equiv_cpu_id = 0x00; + unsigned int i = 0; + + BUG_ON(equiv_cpu_table == NULL); + current_cpu_id = cpuid_eax(0x00000001); + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; + break; + } + i++; + } + + if (!equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d cpu_id " + "not found in equivalent cpu table \n", cpu); + return 0; + } + + if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { + printk(KERN_ERR + "microcode: CPU%d patch does not match " + "(patch is %x, cpu extended is %x) \n", + cpu, mc_header->processor_rev_id[0], + (equiv_cpu_id & 0xff)); + return 0; + } + + if ((mc_header->processor_rev_id[1]) 
!= ((equiv_cpu_id >> 16) & 0xff)) { + printk(KERN_ERR "microcode: CPU%d patch does not match " + "(patch is %x, cpu base id is %x) \n", + cpu, mc_header->processor_rev_id[1], + ((equiv_cpu_id >> 16) & 0xff)); + + return 0; + } + + /* ucode may be northbridge specific */ + if (mc_header->nb_dev_id) { + nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->nb_dev_id & 0xff), + NULL); + if ((!nb_pci_dev) || + (mc_header->nb_rev_id != nb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); + pci_dev_put(nb_pci_dev); + return 0; + } + pci_dev_put(nb_pci_dev); + } + + /* ucode may be southbridge specific */ + if (mc_header->sb_dev_id) { + sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->sb_dev_id & 0xff), + NULL); + if ((!sb_pci_dev) || + (mc_header->sb_rev_id != sb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); + pci_dev_put(sb_pci_dev); + return 0; + } + pci_dev_put(sb_pci_dev); + } + + if (mc_header->patch_id <= rev) + return 0; + + return 1; +} + +static void apply_microcode_amd(int cpu) +{ + unsigned long flags; + unsigned int eax, edx; + unsigned int rev; + int cpu_num = raw_smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + struct microcode_amd *mc_amd = uci->mc; + unsigned long addr; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (mc_amd == NULL) + return; + + spin_lock_irqsave(&microcode_update_lock, flags); + + addr = (unsigned long)&mc_amd->hdr.data_code; + edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); + eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); + + asm volatile("movl %0, %%ecx; wrmsr" : + : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); + + /* get patch id after patching */ + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (rev) + : "i" (0x0000008B) : "ecx"); + + spin_unlock_irqrestore(&microcode_update_lock, flags); + + /* check current patch id and patch's id for match */ + if (rev != mc_amd->hdr.patch_id) { + printk(KERN_ERR "microcode: CPU%d update from revision " + "0x%x to 0x%x failed\n", cpu_num, + mc_amd->hdr.patch_id, rev); + return; + } + + printk(KERN_INFO "microcode: CPU%d updated from revision " + "0x%x to 0x%x \n", + cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); + + uci->cpu_sig.rev = rev; +} +
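The MSR protocol in apply_microcode_amd() above is easier to read when restated with the kernel's rdmsr()/wrmsr() helpers: the virtual address of the patch body goes into the patch-loader MSR, then the patch-level MSR is read back to see what actually stuck. A sketch only, with the two MSR numbers (0xc0010020 and 0x8B) taken from the inline asm above; the committed code keeps the inline asm to pin the exact register usage:

    /* Sketch: load the patch, then verify the reported patch level. */
    u32 rev, dummy;
    u64 addr = (unsigned long)&mc_amd->hdr.data_code;

    wrmsr(0xc0010020, (u32)addr, (u32)(addr >> 32)); /* patch loader MSR */
    rdmsr(0x0000008B, rev, dummy);                   /* current patch id */
    if (rev != mc_amd->hdr.patch_id)
            printk(KERN_ERR "microcode: update failed\n");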
+static void * get_next_ucode(u8 *buf, unsigned int size, + int (*get_ucode_data)(void *, const void *, size_t), + unsigned int *mc_size) +{ + unsigned int total_size; +#define UCODE_CONTAINER_SECTION_HDR 8 + u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; + void *mc; + + if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) + return NULL; + + if (section_hdr[0] != UCODE_UCODE_TYPE) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode payload type field\n"); + return NULL; + } + + total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); + + printk(KERN_INFO "microcode: size %u, total_size %u\n", + size, total_size); + + if (total_size > size || total_size > UCODE_MAX_SIZE) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + return NULL; + } + + mc = vmalloc(UCODE_MAX_SIZE); + if (mc) { + memset(mc, 0, UCODE_MAX_SIZE); + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { + vfree(mc); + mc = NULL; + } else + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; + } +#undef UCODE_CONTAINER_SECTION_HDR + return mc; +} + + +static int install_equiv_cpu_table(u8 *buf, + int (*get_ucode_data)(void *, const void *, size_t)) +{ +#define UCODE_CONTAINER_HEADER_SIZE 12 + u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; + unsigned int *buf_pos = (unsigned int *)container_hdr; + unsigned long size; + + if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) + return 0; + + size = buf_pos[2]; + + if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode equivalent cpu table\n"); + return 0; + } + + equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + if (!equiv_cpu_table) { + printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + return 0; + } + + buf += UCODE_CONTAINER_HEADER_SIZE; + if (get_ucode_data(equiv_cpu_table, buf, size)) { + vfree(equiv_cpu_table); + return 0; + } + + return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ +#undef UCODE_CONTAINER_HEADER_SIZE +} + +static void free_equiv_cpu_table(void) +{ + if (equiv_cpu_table) { + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; + } +} + +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover; + unsigned long offset; + + offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); + if (!offset) { + printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + return -EINVAL; + } + + ucode_ptr += offset; + leftover = size - offset; + + while (leftover) { + unsigned int uninitialized_var(mc_size); + struct microcode_header_amd *mc_header; + + mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + if (!mc) + break; + + mc_header = (struct microcode_header_amd *)mc; + if (get_matching_microcode(cpu, mc, new_rev)) { + if (new_mc) + vfree(new_mc); + new_rev = mc_header->patch_id; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; + } + + if (new_mc) { + if (!leftover) { + if (uci->mc) + vfree(uci->mc); + uci->mc = new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + } else + vfree(new_mc); + } + + free_equiv_cpu_table(); + + return (int)leftover; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static int request_microcode_fw(int cpu, struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; + int ret; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + ret = request_firmware(&firmware, fw_name, device); + if (ret) { + printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + return ret; + } + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + + release_firmware(firmware); + + return ret; +} +
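generic_load_microcode() above walks the firmware blob as a 12-byte container header, an equivalence-table section, and then a run of microcode sections, each introduced by an 8-byte header: a little-endian 32-bit type followed by a 32-bit size (of which get_next_ucode() reads only the low two size bytes). A self-contained toy parser over a fabricated buffer, just to pin down that layout (the payload size here is invented):

    #include <stdio.h>
    #include <stdint.h>

    #define UCODE_UCODE_TYPE 0x00000001

    /* Mirrors get_next_ucode(): section header = le32 type, le32 size. */
    static int parse_section_hdr(const uint8_t *buf, size_t len,
                                 uint32_t *type, uint32_t *size)
    {
            if (len < 8)
                    return -1;
            *type = buf[0] | buf[1] << 8 | buf[2] << 16 | (uint32_t)buf[3] << 24;
            *size = buf[4] | buf[5] << 8 | buf[6] << 16 | (uint32_t)buf[7] << 24;
            return 0;
    }

    int main(void)
    {
            /* fabricated section: type 1 (UCODE_UCODE_TYPE), 896-byte payload */
            const uint8_t hdr[8] = { 0x01, 0, 0, 0, 0x80, 0x03, 0, 0 };
            uint32_t type, size;

            if (parse_section_hdr(hdr, sizeof(hdr), &type, &size) == 0 &&
                type == UCODE_UCODE_TYPE)
                    printf("ucode section, %u byte payload\n", size);
            return 0;
    }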
+static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" + " is not supported\n"); + return -1; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + vfree(uci->mc); + uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ + return &microcode_amd_ops; +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c new file mode 100644 index 000000000000..936d8d55f230 --- /dev/null +++ b/arch/x86/kernel/microcode_core.c @@ -0,0 +1,508 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package.
+ * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/firmware.h> +#include <linux/platform_device.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/microcode.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "2.00" + +struct microcode_ops *microcode_ops; + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *buf, size_t size) +{ + cpumask_t old; + int error = 0; + int cpu; + + old = current->cpus_allowed; + + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + error = microcode_ops->request_microcode_user(cpu, buf, size); + if (error < 0) + goto out; + if (!error) + microcode_ops->apply_microcode(cpu); + } +out: + set_cpus_allowed_ptr(current, &old); + return error; +} + +static int microcode_open(struct inode *unused1, struct file *unused2) +{ + cycle_kernel_lock(); + return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", + num_physpages); + return -EINVAL; + } + + get_online_cpus(); + mutex_lock(&microcode_mutex); + + ret = do_microcode_update(buf, len); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(&microcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = &microcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(&microcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit(void) +{ + misc_deregister(&microcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +struct platform_device *microcode_pdev; + +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old = current->cpus_allowed; + + get_online_cpus(); + if (cpu_online(cpu)) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + mutex_lock(&microcode_mutex); + if (uci->valid) { + err = microcode_ops->request_microcode_fw(cpu, + &microcode_pdev->dev); + if (!err) + microcode_ops->apply_microcode(cpu); + } + mutex_unlock(&microcode_mutex); + set_cpus_allowed_ptr(current, &old); + } + put_online_cpus(); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(&microcode_mutex); + microcode_ops->microcode_fini_cpu(cpu); + uci->valid = 0; + mutex_unlock(&microcode_mutex); +} + +static void collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + memset(uci, 0, sizeof(*uci)); + if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) + uci->valid = 1; +} +
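This file is where the driver split pays off: the core owns the device node, the sysfs attributes, and the CPU hotplug/resume policy, and reaches every vendor-specific step only through struct microcode_ops. A stripped-down model of that dispatch (stand-alone C with a dummy vendor backend; the real tables are microcode_amd_ops above and microcode_intel_ops below):

    #include <stdio.h>

    struct cpu_signature { unsigned int sig, pf, rev; };

    /* same shape as the kernel's struct microcode_ops vtable */
    struct microcode_ops {
            int  (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
            void (*apply_microcode)(int cpu);
    };

    /* dummy vendor backend standing in for microcode_{intel,amd}_ops */
    static int dummy_collect(int cpu, struct cpu_signature *csig)
    {
            (void)cpu;
            csig->sig = 0x6f6;      /* made-up signature */
            csig->pf  = 1;
            csig->rev = 0xb7;       /* made-up revision */
            return 0;
    }

    static void dummy_apply(int cpu)
    {
            printf("CPU%d: microcode applied\n", cpu);
    }

    static struct microcode_ops dummy_ops = {
            .collect_cpu_info = dummy_collect,
            .apply_microcode  = dummy_apply,
    };

    /* the core never names a vendor; it only calls through the table,
     * as collect_cpu_info()/microcode_update_cpu() do in this file */
    static struct microcode_ops *microcode_ops = &dummy_ops;

    int main(void)
    {
            struct cpu_signature sig;

            if (!microcode_ops->collect_cpu_info(0, &sig))
                    microcode_ops->apply_microcode(0);
            return 0;
    }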
+static int microcode_resume_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpu_signature nsig; + + pr_debug("microcode: CPU%d resumed\n", cpu); + + if (!uci->mc) + return 1; + + /* + * Let's verify that the 'cached' ucode does belong + * to this cpu (a bit of paranoia): + */ + if (microcode_ops->collect_cpu_info(cpu, &nsig)) { + microcode_fini_cpu(cpu); + return -1; + } + + if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { + microcode_fini_cpu(cpu); + /* Should we look for a new ucode here? */ + return 1; + } + + return 0; +} + +void microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int err = 0; + + /* + * Check if the system resume is in progress (uci->valid != NULL), + * otherwise just request a firmware: + */ + if (uci->valid) { + err = microcode_resume_cpu(cpu); + } else { + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING) + err = microcode_ops->request_microcode_fw(cpu, + &microcode_pdev->dev); + } + if (!err) + microcode_ops->apply_microcode(cpu); +} + +static void microcode_init_cpu(int cpu) +{ + cpumask_t old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + + mutex_lock(&microcode_mutex); + microcode_update_cpu(cpu); + mutex_unlock(&microcode_mutex); + + set_cpus_allowed_ptr(current, &old); +} + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + int err, cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + + err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + if (err) + return err; + + microcode_init_cpu(cpu); + return 0; +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + /* only CPU 0 will apply ucode here */ + microcode_update_cpu(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + microcode_init_cpu(cpu); + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + pr_debug("microcode: CPU%d added\n", cpu); + if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + printk(KERN_ERR "microcode: Failed to create the sysfs " + "group for CPU%d\n", cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + pr_debug("microcode: CPU%d removed\n", cpu); + break; + case CPU_DEAD: + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +static int __init microcode_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + + if (!microcode_ops) { + printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + return -ENODEV; + } + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + get_online_cpus(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION + " <tigran@aivazian.fsnet.co.uk>" + " <peter.oruba@amd.com>\n"); + + return 0; +} + +static void __exit microcode_exit(void) +{ + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + get_online_cpus(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); + + microcode_ops = NULL; + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} + +module_init(microcode_init); +module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c new file mode 100644 index 000000000000..622dc4a21784 --- /dev/null +++ b/arch/x86/kernel/microcode_intel.c @@ -0,0 +1,480 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS.
Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/firmware.h> +#include <linux/platform_device.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/microcode.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[0]; +}; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) +#define DWSIZE (sizeof(u32)) +#define get_totalsize(mc) \ + (((struct microcode_intel *)mc)->hdr.totalsize ? \ + ((struct microcode_intel *)mc)->hdr.totalsize : \ + DEFAULT_UCODE_TOTALSIZE) + +#define get_datasize(mc) \ + (((struct microcode_intel *)mc)->hdr.datasize ? 
\ + ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + +#define sigmatch(s1, s2, p1, p2) \ + (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) + +#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) + +/* serialize access to the physical write to MSR 0x79 */ +static DEFINE_SPINLOCK(microcode_update_lock); + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned int val[2]; + + memset(csig, 0, sizeof(*csig)); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + printk(KERN_ERR "microcode: CPU%d not a capable Intel " + "processor\n", cpu_num); + return -1; + } + + csig->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig->pf = 1 << ((val[1] >> 18) & 7); + } + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", + csig->sig, csig->pf, csig->rev); + + return 0; +} + +static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) +{ + return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; +} + +static inline int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} + +static int microcode_sanity_check(void *mc) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + struct extended_signature *ext_sig; + unsigned long total_size, data_size, ext_table_size; + int sum, orig_sum, ext_sigcount = 0, i; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + if (data_size + MC_HEADER_SIZE > total_size) { + printk(KERN_ERR "microcode: error! " + "Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + printk(KERN_ERR "microcode: error! " + "Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + printk(KERN_ERR "microcode: error! " + "Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + printk(KERN_ERR "microcode: error! " + "Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + printk(KERN_WARNING "microcode: aborting, " + "bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +static int +get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + if (!update_match_revision(mc_header, rev)) + return 0; + + if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} +
Apparent chip bug */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + spin_unlock_irqrestore(&microcode_update_lock, flags); + if (val[1] != mc_intel->hdr.rev) { + printk(KERN_ERR "microcode: CPU%d update from revision " + "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); + return; + } + printk(KERN_INFO "microcode: CPU%d updated from revision " + "0x%x to 0x%x, date = %04x-%02x-%02x \n", + cpu_num, uci->cpu_sig.rev, val[1], + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); + uci->cpu_sig.rev = val[1]; +} + +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover = size; + + while (leftover) { + struct microcode_header_intel mc_header; + unsigned int mc_size; + + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) + break; + + mc_size = get_totalsize(&mc_header); + if (!mc_size || mc_size > leftover) { + printk(KERN_ERR "microcode: error! " + "Bad data in microcode data file\n"); + break; + } + + mc = vmalloc(mc_size); + if (!mc) + break; + + if (get_ucode_data(mc, ucode_ptr, mc_size) || + microcode_sanity_check(mc) < 0) { + vfree(mc); + break; + } + + if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + if (new_mc) + vfree(new_mc); + new_rev = mc_header.rev; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; + } + + if (new_mc) { + if (!leftover) { + if (uci->mc) + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + } else + vfree(new_mc); + } + + return (int)leftover; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static int request_microcode_fw(int cpu, struct device *device) +{ + char name[30]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + int ret; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + sprintf(name, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + ret = request_firmware(&firmware, name, device); + if (ret) { + pr_debug("microcode: data file %s load failed\n", name); + return ret; + } + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + + release_firmware(firmware); + + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + vfree(uci->mc); + uci->mc = NULL; +} + +struct microcode_ops microcode_intel_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode, + .microcode_fini_cpu = microcode_fini_cpu, +}; + +struct microcode_ops * __init init_intel_microcode(void) +{ + return &microcode_intel_ops; +} +
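A brief aside on the container rule microcode_sanity_check() enforces above: a valid Intel update is one whose 32-bit words over the header-plus-data region sum to zero modulo 2^32, which is why the driver only tests orig_sum for zero (the extended-signature loop then re-balances that sum for each alternate signature). A minimal stand-alone sketch of the basic rule, with a hypothetical helper name, not code from the driver:

        #include <stdint.h>
        #include <stddef.h>

        /* Intel container rule: all dwords of header+data sum to 0 (mod 2^32). */
        static int ucode_csum_ok(const uint32_t *mc, size_t header_plus_data_bytes)
        {
                uint32_t sum = 0;
                size_t i;

                for (i = 0; i < header_plus_data_bytes / sizeof(uint32_t); i++)
                        sum += mc[i];

                return sum == 0;
        }

diff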
--git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index fdfdc550b366..efc2f361fe85 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { {} }; -void __init check_enable_amd_mmconf_dmi(void) +void __cpuinit check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index a888e67f5874..6ba87830d4b1 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/mm.h> #include <linux/slab.h> #include <linux/bug.h> @@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return module_bug_finalize(hdr, sechdrs, me); } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3b25e49380c6..f98f4e1dba09 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -27,6 +27,7 @@ #include <asm/bios_ebda.h> #include <asm/e820.h> #include <asm/trampoline.h> +#include <asm/setup.h> #include <mach_apic.h> #ifdef CONFIG_X86_32 @@ -48,77 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -#ifdef CONFIG_X86_NUMAQ -int found_numaq; -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_config_translation { - unsigned char mpc_type; - unsigned char trans_len; - unsigned char trans_type; - unsigned char trans_quad; - unsigned char trans_global; - unsigned char trans_local; - unsigned short trans_reserved; -}; - - -static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] - __cpuinitdata; - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ - return (quad << 4) + (phys_apicid ? 
phys_apicid << 1 : 1); -} - - -static inline int mpc_apic_id(struct mpc_config_processor *m, - struct mpc_config_translation *translation_record) -{ - int quad = translation_record->trans_quad; - int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); - - printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver, quad, logical_apicid); - return logical_apicid; -} - -int mp_bus_id_to_node[MAX_MP_BUSSES]; - -int mp_bus_id_to_local[MAX_MP_BUSSES]; - -static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - mp_bus_id_to_node[m->mpc_busid] = quad; - mp_bus_id_to_local[m->mpc_busid] = local; - printk(KERN_INFO "Bus #%d is %s (node %d)\n", - m->mpc_busid, name, quad); -} - -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -static void mpc_oem_pci_bus(struct mpc_config_bus *m, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; -} - -#endif - -static void __cpuinit MP_processor_info(struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int apicid; char *bootup_cpu = ""; @@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) disabled_cpus++; return; } -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (x86_quirks->mpc_apic_id) + apicid = x86_quirks->mpc_apic_id(m); else apicid = m->mpc_apicid; -#else - apicid = m->mpc_apicid; -#endif + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; boot_cpu_physical_apicid = m->mpc_apicid; @@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m) memcpy(str, m->mpc_bustype, 6); str[6] = 0; -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_bus_info(m, str, translation_table[mpc_record]); -#else - printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); -#endif + if (x86_quirks->mpc_oem_bus_info) + x86_quirks->mpc_oem_bus_info(m, str); + else + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str); #if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { @@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_pci_bus(m, translation_table[mpc_record]); -#endif + if (x86_quirks->mpc_oem_pci_bus) + x86_quirks->mpc_oem_pci_bus(m); + clear_bit(m->mpc_busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined (CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; @@ -228,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void print_MP_intsrc_info(struct mpc_config_intsrc *m) { - printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, @@ -237,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m) static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) { - printk(KERN_CONT "Int: type %d, pol %d, trig 
%d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, @@ -309,90 +235,13 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m) static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) { - printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); } -#ifdef CONFIG_X86_NUMAQ -static void __init MP_translation_info(struct mpc_config_translation *m) -{ - printk(KERN_INFO - "Translation: record %d, type %d, quad %d, global %d, local %d\n", - mpc_record, m->trans_type, m->trans_quad, m->trans_global, - m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -/* - * Read/parse the MPC oem tables - */ - -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, - unsigned short oemsize) -{ - int count = sizeof(*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable) + count; - - mpc_record = 0; - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", - oemtable); - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { - printk(KERN_WARNING - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->oem_signature[0], oemtable->oem_signature[1], - oemtable->oem_signature[2], oemtable->oem_signature[3]); - return; - } - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - while (count < oemtable->oem_length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_config_translation *m = - (struct mpc_config_translation *)oemptr; - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - { - printk(KERN_WARNING - "Unrecognised OEM table entry type! - %d\n", - (int)*oemptr); - return; - } - } - } -} - -void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk("Warning! Not a NUMA-Q system!\n"); - else - found_numaq = 1; - - if (mpc->mpc_oemptr) - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, - mpc->mpc_oemsize); -} -#endif /* CONFIG_X86_NUMAQ */ - /* * Read/parse the MPC */ @@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) } else mps_oem_check(mpc, oem, str); #endif - /* save the local APIC address, it might be non-default */ if (!acpi_lapic) mp_lapic_addr = mpc->mpc_lapic; @@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) if (early) return 1; + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); + } + /* * Now process the configuration blocks. 
*/ -#ifdef CONFIG_X86_NUMAQ - mpc_record = 0; -#endif + if (x86_quirks->mpc_record) + *x86_quirks->mpc_record = 0; + while (count < mpc->mpc_length) { switch (*mpt) { case MP_PROCESSOR: @@ -536,16 +389,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) count = mpc->mpc_length; break; } -#ifdef CONFIG_X86_NUMAQ - ++mpc_record; -#endif + if (x86_quirks->mpc_record) + (*x86_quirks->mpc_record)++; } #ifdef CONFIG_X86_GENERICARCH generic_bigsmp_probe(); #endif +#ifdef CONFIG_X86_32 setup_apic_routing(); +#endif if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; @@ -632,7 +486,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) } -static void construct_ioapic_table(int mpc_default_type) +static void __init construct_ioapic_table(int mpc_default_type) { struct mpc_config_ioapic ioapic; struct mpc_config_bus bus; @@ -677,7 +531,7 @@ static void construct_ioapic_table(int mpc_default_type) construct_default_ioirq_mptable(mpc_default_type); } #else -static inline void construct_ioapic_table(int mpc_default_type) { } +static inline void __init construct_ioapic_table(int mpc_default_type) { } #endif static inline void __init construct_default_ISA_mptable(int mpc_default_type) @@ -726,20 +580,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct intel_mp_floating *mpf_found; /* - * Machine specific quirk for finding the SMP config before other setup - * activities destroy the table: - */ -int (*mach_get_smp_config_quirk)(unsigned int early); - -/* * Scan the memory blocks for an SMP configuration block. */ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (mach_get_smp_config_quirk) { - if (mach_get_smp_config_quirk(early)) + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) return; } if (acpi_lapic && early) @@ -849,7 +697,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned int *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -899,14 +748,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -int (*mach_find_smp_config_quirk)(unsigned int reserve); - static void __init __find_smp_config(unsigned int reserve) { unsigned int address; - if (mach_find_smp_config_quirk) { - if (mach_find_smp_config_quirk(reserve)) + if (x86_quirks->mach_find_smp_config) { + if (x86_quirks->mach_find_smp_config(reserve)) return; } /* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index a153b3905f60..82a7c7ed6d45 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) - return -EIO; - if (copy_to_user(tmp, &data, 8)) - return -EFAULT; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + break; + } + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } tmp += 2; + bytes += 8; } - return ((char 
__user *)tmp) - buf; + return bytes ? bytes : err; } static ssize_t msr_write(struct file *file, const char __user *buf, @@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 8) { - if (copy_from_user(&data, tmp, 8)) - return -EFAULT; + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) - return -EIO; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + break; + } tmp += 2; + bytes += 8; } - return ((char __user *)tmp) - buf; + return bytes ? bytes : err; } static int msr_open(struct inode *inode, struct file *file) @@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file) ret = -EIO; /* MSR not supported */ out: unlock_kernel(); - return 0; + return ret; } /* @@ -149,7 +163,7 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ec024b3baad0..2c97f07f1c2c 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data) } #endif +static void report_broken_nmi(int cpu, int *prev_nmi_count) +{ + printk(KERN_CONT "\n"); + + printk(KERN_WARNING + "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", + cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); + + printk(KERN_WARNING + "Please report this to bugzilla.kernel.org,\n"); + printk(KERN_WARNING + "and attach the output of the 'dmesg' command.\n"); + + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); +} + int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void) for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) + report_broken_nmi(cpu, prev_nmi_count); } endflag = 1; if (!atomic_read(&nmi_active)) { @@ -263,7 +273,7 @@ late_initcall(init_lapic_nmi_sysfs); static void __acpi_nmi_enable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI); + apic_write(APIC_LVT0, APIC_DM_NMI); } /* @@ -277,7 +287,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* @@ -289,6 +299,15 @@ void acpi_nmi_disable(void) on_each_cpu(__acpi_nmi_disable, NULL, 1); } +/* + * This function is called as soon the LAPIC NMI watchdog driver has everything + * in place and it's ready to check if the NMIs belong to the NMI watchdog + */ +void cpu_nmi_set_wd_enabled(void) +{ + __get_cpu_var(wd_enabled) = 1; +} + void setup_apic_nmi_watchdog(void *unused) { if (__get_cpu_var(wd_enabled)) @@ -301,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused) switch (nmi_watchdog) { case NMI_LOCAL_APIC: - /* enable it before to avoid race with handler */ 
- __get_cpu_var(wd_enabled) = 1; if (lapic_watchdog_init(nmi_hz) < 0) { __get_cpu_var(wd_enabled) = 0; return; @@ -448,6 +465,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) { unsigned char reason = get_nmi_reason(); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index a23e8233b9ac..4caff39078e0 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -33,6 +33,7 @@ #include <asm/processor.h> #include <asm/mpspec.h> #include <asm/e820.h> +#include <asm/setup.h> #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -71,6 +72,195 @@ static void __init smp_dump_qct(void) } } + +void __cpuinit numaq_tsc_disable(void) +{ + if (!found_numaq) + return; + + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + setup_clear_cpu_cap(X86_FEATURE_TSC); + } +} + +static int __init numaq_pre_time_init(void) +{ + numaq_tsc_disable(); + return 0; +} + +int found_numaq; +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ +struct mpc_config_translation { + unsigned char mpc_type; + unsigned char trans_len; + unsigned char trans_type; + unsigned char trans_quad; + unsigned char trans_global; + unsigned char trans_local; + unsigned short trans_reserved; +}; + +/* x86_quirks member */ +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] + __cpuinitdata; + +static inline int generate_logical_apicid(int quad, int phys_apicid) +{ + return (quad << 4) + (phys_apicid ? 
phys_apicid << 1 : 1); +} + +/* x86_quirks member */ +static int mpc_apic_id(struct mpc_config_processor *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); + + printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver, quad, logical_apicid); + return logical_apicid; +} + +int mp_bus_id_to_node[MAX_MP_BUSSES]; + +int mp_bus_id_to_local[MAX_MP_BUSSES]; + +/* x86_quirks member */ +static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + mp_bus_id_to_node[m->mpc_busid] = quad; + mp_bus_id_to_local[m->mpc_busid] = local; + printk(KERN_INFO "Bus #%d is %s (node %d)\n", + m->mpc_busid, name, quad); +} + +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; + +/* x86_quirks member */ +static void mpc_oem_pci_bus(struct mpc_config_bus *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; +} + +static void __init MP_translation_info(struct mpc_config_translation *m) +{ + printk(KERN_INFO + "Translation: record %d, type %d, quad %d, global %d, local %d\n", + mpc_record, m->trans_type, m->trans_quad, m->trans_global, + m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, + unsigned short oemsize) +{ + int count = sizeof(*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable) + count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", + oemtable); + if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { + printk(KERN_WARNING + "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], oemtable->oem_signature[1], + oemtable->oem_signature[2], oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m = + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING + "Unrecognised OEM table entry type! 
- %d\n", + (int)*oemptr); + return; + } + } + } +} + +static int __init numaq_setup_ioapic_ids(void) +{ + /* so can skip it */ + return 1; +} + +static struct x86_quirks numaq_x86_quirks __initdata = { + .arch_pre_time_init = numaq_pre_time_init, + .arch_time_init = NULL, + .arch_pre_intr_init = NULL, + .arch_memory_setup = NULL, + .arch_intr_init = NULL, + .arch_trap_init = NULL, + .mach_get_smp_config = NULL, + .mach_find_smp_config = NULL, + .mpc_record = &mpc_record, + .mpc_apic_id = mpc_apic_id, + .mpc_oem_bus_info = mpc_oem_bus_info, + .mpc_oem_pci_bus = mpc_oem_pci_bus, + .smp_read_mpc_oem = smp_read_mpc_oem, + .setup_ioapic_ids = numaq_setup_ioapic_ids, +}; + +void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! Not a NUMA-Q system!\n"); + else + found_numaq = 1; +} + static __init void early_check_numaq(void) { /* @@ -82,6 +272,9 @@ static __init void early_check_numaq(void) */ if (smp_found_config) early_get_smp_config(); + + if (found_numaq) + x86_quirks = &numaq_x86_quirks; } int __init get_memcfg_numaq(void) @@ -92,14 +285,3 @@ int __init get_memcfg_numaq(void) smp_dump_qct(); return 1; } - -void __init numaq_tsc_disable(void) -{ - if (!found_numaq) - return; - - if (num_online_nodes() > 1) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - setup_clear_cpu_cap(X86_FEATURE_TSC); - } -} diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 3e6672274807..7a13fac63a1f 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd); static void __init platform_detect(void) { size_t propsize; - u32 rev; + __be32 rev; if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, &propsize) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); - rev = 0; + rev = cpu_to_be32(0); } olpc_platform_info.boardrev = be32_to_cpu(rev); } @@ -203,7 +203,7 @@ static void __init platform_detect(void) static void __init platform_detect(void) { /* stopgap until OFW support is added to the kernel */ - olpc_platform_info.boardrev = be32_to_cpu(0xc2); + olpc_platform_info.boardrev = 0xc2; } #endif diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 000000000000..0e9f1982b1dd --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,37 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. 
+ */ +#include <linux/spinlock.h> +#include <linux/module.h> + +#include <asm/paravirt.h> + +static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __raw_spin_lock(lock); +} + +struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_lock_flags = default_spin_lock_flags, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +#endif +}; +EXPORT_SYMBOL(pv_lock_ops); + +void __init paravirt_use_bytelocks(void) +{ +#ifdef CONFIG_SMP + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c19..e4c8fb608873 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -29,6 +29,7 @@ #include <asm/desc.h> #include <asm/setup.h> #include <asm/arch_hooks.h> +#include <asm/pgtable.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> @@ -123,6 +124,7 @@ static void *get_call_destination(u8 type) .pv_irq_ops = pv_irq_ops, .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, + .pv_lock_ops = pv_lock_ops, }; return *((void **)&tmpl + type); } @@ -317,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, + .read_msr_amd = native_read_msr_amd_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, @@ -335,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = { .write_ldt_entry = native_write_ldt_entry, .write_gdt_entry = native_write_gdt_entry, .write_idt_entry = native_write_idt_entry, + + .alloc_ldt = paravirt_nop, + .free_ldt = paravirt_nop, + .load_sp0 = native_load_sp0, #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -360,9 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .apic_write = native_apic_write, - .apic_write_atomic = native_apic_write_atomic, - .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, @@ -373,6 +377,9 @@ struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#else + .pagetable_setup_start = paravirt_nop, + .pagetable_setup_done = paravirt_nop, #endif .read_cr2 = native_read_cr2, @@ -428,7 +435,7 @@ struct pv_mmu_ops pv_mmu_ops = { #endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = native_pgd_val, .make_pte = native_make_pte, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 58262218781b..9fe644f4861d 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, start = start_##ops##_##x; \ end = end_##ops##_##x; \ goto patch_site - switch(type) { + switch (type) { PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, restore_fl); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 
6959b5c45df4..e1e731d78f38 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -29,6 +29,7 @@ #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/string.h> +#include <linux/crash_dump.h> #include <linux/dma-mapping.h> #include <linux/bitops.h> #include <linux/pci_ids.h> @@ -36,7 +37,8 @@ #include <linux/delay.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> -#include <asm/gart.h> + +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> @@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl); static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); static void calioc2_tce_cache_blast(struct iommu_table *tbl); static void calioc2_dump_error_regs(struct iommu_table *tbl); +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); +static void get_tce_space_from_tar(void); static struct cal_chipset_ops calgary_chip_ops = { .handle_quirks = calgary_handle_quirks, @@ -213,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, #endif /* CONFIG_IOMMU_DEBUG */ -static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) -{ - unsigned int npages; - - npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); - npages >>= PAGE_SHIFT; - - return npages; -} - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -257,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, badbit, tbl, start_addr, npages); } - set_bit_string(tbl->it_map, index, npages); + iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } @@ -339,9 +333,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, /* were we called with bad_dma_address? 
*/ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { - printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); - WARN_ON(1); return; } @@ -405,27 +398,11 @@ static void calgary_unmap_sg(struct device *dev, if (dmalen == 0) break; - npages = num_dma_pages(dma, dmalen); + npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); iommu_free(tbl, dma, npages); } } -static int calgary_nontranslate_map_sg(struct device* dev, - struct scatterlist *sg, int nelems, int direction) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nelems, i) { - struct page *p = sg_page(s); - - BUG_ON(!p); - s->dma_address = virt_to_bus(sg_virt(s)); - s->dma_length = s->length; - } - return nelems; -} - static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { @@ -436,14 +413,11 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, unsigned long entry; int i; - if (!translation_enabled(tbl)) - return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for_each_sg(sg, s, nelems, i) { BUG_ON(!sg_page(s)); vaddr = (unsigned long) sg_virt(s); - npages = num_dma_pages(vaddr, s->length); + npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); if (entry == bad_dma_address) { @@ -474,21 +448,15 @@ error: static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, size_t size, int direction) { - dma_addr_t dma_handle = bad_dma_address; void *vaddr = phys_to_virt(paddr); unsigned long uaddr; unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); uaddr = (unsigned long)vaddr; - npages = num_dma_pages(uaddr, size); + npages = iommu_num_pages(uaddr, size, PAGE_SIZE); - if (translation_enabled(tbl)) - dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); - else - dma_handle = virt_to_bus(vaddr); - - return dma_handle; + return iommu_alloc(dev, tbl, vaddr, npages, direction); } static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, @@ -497,10 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - if (!translation_enabled(tbl)) - return; - - npages = num_dma_pages(dma_handle, size); + npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); iommu_free(tbl, dma_handle, npages); } @@ -516,24 +481,20 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, npages = size >> PAGE_SHIFT; order = get_order(size); + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + /* alloc enough pages (and possibly more) */ ret = (void *)__get_free_pages(flag, order); if (!ret) goto error; memset(ret, 0, size); - if (translation_enabled(tbl)) { - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) - goto free; - - *dma_handle = mapping; - } else /* non translated slot */ - *dma_handle = virt_to_bus(ret); - + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + *dma_handle = mapping; return ret; - free: free_pages((unsigned long)ret, get_order(size)); ret = NULL; @@ -541,8 +502,22 @@ error: return ret; } -static const struct dma_mapping_ops calgary_dma_ops = { +static void 
calgary_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + unsigned int npages; + struct iommu_table *tbl = find_iommu_table(dev); + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); +} + +static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, + .free_coherent = calgary_free_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, .map_sg = calgary_map_sg, @@ -830,7 +805,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - tce_free(tbl, 0, tbl->it_size); + + if (is_kdump_kernel()) + calgary_init_bitmap_from_tce_table(tbl); + else + tce_free(tbl, 0, tbl->it_size); if (is_calgary(dev->device)) tbl->chip_ops = &calgary_chip_ops; @@ -1209,6 +1188,10 @@ static int __init calgary_init(void) if (ret) return ret; + /* Purely for kdump kernel case */ + if (is_kdump_kernel()) + get_tce_space_from_tar(); + do { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) @@ -1230,6 +1213,16 @@ static int __init calgary_init(void) goto error; } while (1); + dev = NULL; + for_each_pci_dev(dev) { + struct iommu_table *tbl; + + tbl = find_iommu_table(&dev->dev); + + if (translation_enabled(tbl)) + dev->dev.archdata.dma_ops = &calgary_dma_ops; + } + return ret; error: @@ -1251,6 +1244,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + dev->dev.archdata.dma_ops = NULL; } while (1); return ret; @@ -1280,13 +1274,15 @@ static inline int __init determine_tce_table_size(u64 ram) static int __init build_detail_arrays(void) { unsigned long ptr; - int i, scal_detail_size, rio_detail_size; + unsigned numnodes, i; + int scal_detail_size, rio_detail_size; - if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ + numnodes = rio_table_hdr->num_scal_dev; + if (numnodes > MAX_NUMNODES){ printk(KERN_WARNING "Calgary: MAX_NUMNODES too low! Defined as %d, " "but system has %d nodes.\n", - MAX_NUMNODES, rio_table_hdr->num_scal_dev); + MAX_NUMNODES, numnodes); return -ENODEV; } @@ -1307,8 +1303,7 @@ static int __init build_detail_arrays(void) } ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < rio_table_hdr->num_scal_dev; - i++, ptr += scal_detail_size) + for (i = 0; i < numnodes; i++, ptr += scal_detail_size) scal_devs[i] = (struct scal_detail *)ptr; for (i = 0; i < rio_table_hdr->num_rio_dev; @@ -1339,6 +1334,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) return (val != 0xffffffff); } +/* + * calgary_init_bitmap_from_tce_table(): + * Function for kdump case. In the second/kdump kernel initialize + * the bitmap based on the tce table entries obtained from first kernel + */ +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) +{ + u64 *tp; + unsigned int index; + tp = ((u64 *)tbl->it_base); + for (index = 0 ; index < tbl->it_size; index++) { + if (*tp != 0x0) + set_bit(index, tbl->it_map); + tp++; + } +} +
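The function introduced next recovers the first kernel's TCE table address from the Calgary Table Address Register: readq() returns the raw big-endian register value, be64_to_cpu() byte-swaps it, and two masks strip the control and encoded-size bits before __va() maps the physical address. A stand-alone sketch of that decode step, assuming a little-endian host and a made-up mask value (the real constant is TAR_SW_BITS from asm/calgary.h):

        #include <stdint.h>

        /* Hypothetical stand-in for TAR_SW_BITS from asm/calgary.h. */
        #define TAR_ADDR_BITS 0x0000ffffffffffffULL

        static uint64_t decode_tar(uint64_t raw_be, uint64_t table_size_bits)
        {
                uint64_t v = __builtin_bswap64(raw_be); /* be64_to_cpu() on x86 */

                v &= TAR_ADDR_BITS;    /* keep the software-visible bits */
                v &= ~table_size_bits; /* strip the encoded table size   */
                return v;              /* physical address; the kernel maps it with __va() */
        }

+/* + * get_tce_space_from_tar(): + * Function for kdump case.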
Get the tce tables from first kernel + * by reading the contents of the base address register of calgary iommu + */ +static void __init get_tce_space_from_tar(void) +{ + int bus; + void __iomem *target; + unsigned long tce_space; + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + target = calgary_reg(bus_info[bus].bbar, + tar_offset(bus)); + tce_space = be64_to_cpu(readq(target)); + tce_space = tce_space & TAR_SW_BITS; + + tce_space = tce_space & (~specified_table_size); + info->tce_space = (u64 *)__va(tce_space); + } + } + return; +} + void __init detect_calgary(void) { int bus; @@ -1394,7 +1444,8 @@ void __init detect_calgary(void) return; } - specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); + specified_table_size = determine_tce_table_size((is_kdump_kernel() ? + saved_max_pfn : max_pfn) * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; @@ -1412,10 +1463,16 @@ void __init detect_calgary(void) if (calgary_bus_has_devices(bus, pci_device) || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; + /* + * If it is kdump kernel, find and use tce tables + * from first kernel, else allocate tce tables here + */ + if (!is_kdump_kernel()) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + } calgary_found = 1; } } @@ -1430,6 +1487,10 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, debugging ? "enabled" : "disabled"); + + /* swiotlb for devices that aren't behind the Calgary. */ + if (max_pfn > MAX_DMA32_PFN) + swiotlb = 1; } return; @@ -1446,7 +1507,7 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || swiotlb) + if (no_iommu || (swiotlb && !calgary_detected)) return -ENODEV; if (!calgary_detected) @@ -1459,15 +1520,14 @@ int __init calgary_iommu_init(void) if (ret) { printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " "falling back to no_iommu\n", ret); - if (max_pfn > MAX_DMA32_PFN) - printk(KERN_ERR "WARNING more than 4GB of memory, " - "32bit PCI may malfunction.\n"); return ret; } force_iommu = 1; bad_dma_address = 0x0; - dma_ops = &calgary_dma_ops; + /* dma_ops is set to swiotlb or nommu */ + if (!dma_ops) + dma_ops = &nommu_dma_ops; return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8467ec2320f1..1972266e8ba5 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -5,14 +5,11 @@ #include <asm/proto.h> #include <asm/dma.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> -int forbid_dac __read_mostly; -EXPORT_SYMBOL(forbid_dac); - -const struct dma_mapping_ops *dma_ops; +struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -42,11 +39,12 @@ EXPORT_SYMBOL(bad_dma_address); /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to older i386.
*/ -struct device fallback_dev = { +struct device x86_dma_fallback_dev = { .bus_id = "fallback device", .coherent_dma_mask = DMA_32BIT_MASK, - .dma_mask = &fallback_dev.coherent_dma_mask, + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; +EXPORT_SYMBOL(x86_dma_fallback_dev); int dma_set_mask(struct device *dev, u64 mask) { @@ -83,7 +81,7 @@ void __init dma32_reserve_bootmem(void) * using 512M as goal */ align = 64ULL<<20; - size = round_up(dma32_bootmem_size, align); + size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); if (dma32_bootmem_ptr) @@ -114,24 +112,57 @@ void __init pci_iommu_alloc(void) * The order of these functions is important for * fall-back/fail-over reasons */ -#ifdef CONFIG_GART_IOMMU gart_iommu_hole_init(); -#endif -#ifdef CONFIG_CALGARY_IOMMU detect_calgary(); -#endif detect_intel_iommu(); amd_iommu_detect(); -#ifdef CONFIG_SWIOTLB pci_swiotlb_init(); -#endif } + +unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) +{ + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); + + return size >> PAGE_SHIFT; +} +EXPORT_SYMBOL(iommu_nr_pages); #endif +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; + dma_addr_t addr; + + dma_mask = dma_alloc_coherent_mask(dev, flag); + + flag |= __GFP_ZERO; +again: + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!page) + return NULL; + + addr = page_to_phys(page); + if (!is_buffer_dma_capable(dma_mask, addr, size)) { + __free_pages(page, get_order(size)); + + if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +} + /* * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter * documentation. 
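The retry loop in dma_generic_alloc_coherent() above pivots on a single reachability test: the freshly allocated page is usable only if its last byte lies at or below the device's coherent DMA mask; otherwise the allocation is retried in a lower GFP zone. That is all the kernel's is_buffer_dma_capable() checks; a stand-alone sketch of the predicate, with a hypothetical name:

        #include <stdint.h>
        #include <stddef.h>

        typedef uint64_t dma_addr_t;

        /* True iff [addr, addr + size) lies entirely at or below the mask. */
        static int buffer_dma_capable(uint64_t mask, dma_addr_t addr, size_t size)
        {
                return addr + size - 1 <= mask;
        }
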
@@ -184,9 +215,7 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif -#ifdef CONFIG_GART_IOMMU gart_parse_options(p); -#endif #ifdef CONFIG_CALGARY_IOMMU if (!strncmp(p, "calgary", 7)) @@ -201,136 +230,19 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -#ifdef CONFIG_X86_32 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - - pages >>= PAGE_SHIFT; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; - } - return (mem != NULL); -} - -static int dma_release_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -#else -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) -#define dma_release_coherent(dev, order, vaddr) (0) -#endif /* CONFIG_X86_32 */ - int dma_supported(struct device *dev, u64 mask) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", - dev->bus_id); + dev_info(dev, "PCI: Disallowing DAC for device\n"); return 0; } #endif - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); + if (ops->dma_supported) + return ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. @@ -351,8 +263,7 @@ int dma_supported(struct device *dev, u64 mask) type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", - dev->bus_id, mask); + dev_info(dev, "Force SAC with mask %Lx\n", mask); return 0; } @@ -360,157 +271,15 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -/* Allocate DMA memory on node near device */ -static noinline struct page * -dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) -{ - int node; - - node = dev_to_node(dev); - - return alloc_pages_node(node, gfp, order); -} - -/* - * Allocate memory for a coherent mapping. - */ -void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp) -{ - void *memory = NULL; - struct page *page; - unsigned long dma_mask = 0; - dma_addr_t bus; - int noretry = 0; - - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) { - dev = &fallback_dev; - gfp |= GFP_DMA; - } - dma_mask = dev->coherent_dma_mask; - if (dma_mask == 0) - dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK; - - /* Device not DMA able */ - if (dev->dma_mask == NULL) - return NULL; - - /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ - if (gfp & __GFP_DMA) - noretry = 1; - -#ifdef CONFIG_X86_64 - /* Why <=? Even when the mask is smaller than 4GB it is often - larger than 16MB and in this case we have a chance of - finding fitting memory in the next higher zone first. If - not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp |= GFP_DMA32; - if (dma_mask < DMA_32BIT_MASK) - noretry = 1; - } -#endif - - again: - page = dma_alloc_pages(dev, - noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); - if (page == NULL) - return NULL; - - { - int high, mmu; - bus = page_to_phys(page); - memory = page_address(page); - high = (bus + size) >= dma_mask; - mmu = high; - if (force_iommu && !(gfp & GFP_DMA)) - mmu = 1; - else if (high) { - free_pages((unsigned long)memory, - get_order(size)); - - /* Don't use the 16MB ZONE_DMA unless absolutely - needed. It's better to use remapping first. 
*/ - if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } - - /* Let low level make its own zone decisions */ - gfp &= ~(GFP_DMA32|GFP_DMA); - - if (dma_ops->alloc_coherent) - return dma_ops->alloc_coherent(dev, size, - dma_handle, gfp); - return NULL; - } - - memset(memory, 0, size); - if (!mmu) { - *dma_handle = bus; - return memory; - } - } - - if (dma_ops->alloc_coherent) { - free_pages((unsigned long)memory, get_order(size)); - gfp &= ~(GFP_DMA|GFP_DMA32); - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); - } - - if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), - size, - PCI_DMA_BIDIRECTIONAL); - if (*dma_handle != bad_dma_address) - return memory; - } - - if (panic_on_overflow) - panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", - (unsigned long)size); - free_pages((unsigned long)memory, get_order(size)); - return NULL; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -/* - * Unmap coherent memory. - * The caller must ensure that the device has finished accessing the mapping. - */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) -{ - int order = get_order(size); - WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_coherent(dev, order, vaddr)) - return; - if (dma_ops->unmap_single) - dma_ops->unmap_single(dev, bus, size, 0); - free_pages((unsigned long)vaddr, order); -} -EXPORT_SYMBOL(dma_free_coherent); - static int __init pci_iommu_init(void) { -#ifdef CONFIG_CALGARY_IOMMU calgary_iommu_init(); -#endif intel_iommu_init(); amd_iommu_init(); -#ifdef CONFIG_GART_IOMMU gart_iommu_init(); -#endif no_iommu_init(); return 0; @@ -522,17 +291,3 @@ void pci_iommu_shutdown(void) } /* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init); - -#ifdef CONFIG_PCI -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ - -static __devinit void via_no_dac(struct pci_dev *dev) -{ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); - forbid_dac = 1; - } -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); -#endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c3fe78406d18..e3f75bbcedea 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -27,11 +27,12 @@ #include <linux/scatterlist.h> #include <linux/iommu-helper.h> #include <linux/sysdev.h> +#include <linux/io.h> #include <asm/atomic.h> -#include <asm/io.h> #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/gart.h> #include <asm/cacheflush.h> #include <asm/swiotlb.h> @@ -66,9 +67,6 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -82,9 +80,10 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ +static bool need_flush; /* global flush state. 
set for each gart wrap */ -static unsigned long alloc_iommu(struct device *dev, int size) +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) { unsigned long offset, flags; unsigned long boundary_size; @@ -92,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size) base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, - size, base_index, boundary_size, 0); + size, base_index, boundary_size, align_mask); if (offset == -1) { - need_flush = 1; + need_flush = true; offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, - size, base_index, boundary_size, 0); + size, base_index, boundary_size, + align_mask); } if (offset != -1) { next_bit = offset+size; if (next_bit >= iommu_pages) { next_bit = 0; - need_flush = 1; + need_flush = true; } } if (iommu_fullflush) - need_flush = 1; + need_flush = true; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); return offset; @@ -136,7 +136,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { k8_flush_garts(); - need_flush = 0; + need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } @@ -175,7 +175,8 @@ static void dump_leak(void) iommu_leak_pages); for (i = 0; i < iommu_leak_pages; i += 2) { printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], + 0); printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); } printk(KERN_DEBUG "\n"); @@ -197,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) * out. Hopefully no network devices use single mappings that big. */ - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) @@ -216,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - - if (force_iommu) - mmu = 1; - - return mmu; + return force_iommu || + !is_buffer_dma_capable(*dev->dma_mask, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - - return mmu; + return !is_buffer_dma_capable(*dev->dma_mask, addr, size); } /* Map a single continuous physical area into the IOMMU. * Caller needs to check if the iommu is needed and flush. 
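
Throughout this file the local to_pages() macro is dropped in favour of the generic iommu_num_pages() helper, which computes how many IOMMU pages a buffer spans once the offset into its first page is included. Roughly, as a sketch (hypothetical name; the real helper is shared library code):

	static unsigned long num_io_pages(unsigned long addr, size_t len,
					  unsigned long io_page_size)
	{
		return DIV_ROUND_UP((addr & (io_page_size - 1)) + len,
				    io_page_size);
	}
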
*/ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir) + size_t size, int dir, unsigned long align_mask) { - unsigned long npages = to_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(dev, npages); + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); + unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; if (iommu_page == -1) { @@ -263,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); } -static dma_addr_t -gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir) -{ - dma_addr_t map = dma_map_area(dev, paddr, size, dir); - - flush_gart(); - - return map; -} - /* Map a single area into the IOMMU */ static dma_addr_t gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) @@ -280,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) unsigned long bus; if (!dev) - dev = &fallback_dev; + dev = &x86_dma_fallback_dev; if (!need_iommu(dev, paddr, size)) return paddr; - bus = gart_map_simple(dev, paddr, size, dir); + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); return bus; } @@ -305,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -344,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, unsigned long addr = sg_phys(s); if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir); + addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_address) { if (i > 0) gart_unmap_sg(dev, sg, i, dir); @@ -366,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages) { - unsigned long iommu_start = alloc_iommu(dev, pages); + unsigned long iommu_start = alloc_iommu(dev, pages, 0); unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -388,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = to_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -431,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) return 0; if (!dev) - dev = &fallback_dev; + dev = &x86_dma_fallback_dev; out = 0; start = 0; @@ -471,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += to_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) @@ -503,6 +483,46 @@ error: return 0; } +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag) +{ + dma_addr_t paddr; + unsigned long align_mask; + struct page *page; + + if (force_iommu && !(flag & GFP_DMA)) { + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); + 
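/*
 * The align_mask computed just below forces a size-aligned slot in the
 * aperture: dma_alloc_coherent() buffers are expected to be naturally
 * aligned, so the mapping asks iommu_area_alloc() for a bitmap index
 * that is a multiple of the buffer's page count.
 */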
if (!page) + return NULL; + + align_mask = (1UL << get_order(size)) - 1; + paddr = dma_map_area(dev, page_to_phys(page), size, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); + if (paddr != bad_dma_address) { + *dma_addr = paddr; + return page_address(page); + } + __free_pages(page, get_order(size)); + } else + return dma_generic_alloc_coherent(dev, size, dma_addr, flag); + + return NULL; +} + +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL); + free_pages((unsigned long)vaddr, get_order(size)); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -630,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) struct pci_dev *dev; void *gatt; int i, error; - unsigned long start_pfn, end_pfn; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -654,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) info->aper_size = aper_size >> 20; gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); if (!gatt) panic("Cannot allocate GATT table"); if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) panic("Could not set GART PTEs to uncacheable pages"); - memset(gatt, 0, gatt_size); agp_gatt_table = gatt; enable_gart_translations(); @@ -669,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) if (!error) error = sysdev_register(&device_gart); if (error) - panic("Could not register gart_sysdev -- would corrupt data on next suspend"); + panic("Could not register gart_sysdev -- " + "would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); - /* need to map that range */ - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); - init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); - } return 0; nommu: @@ -691,21 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) return -1; } -extern int agp_amd64_init(void); - -static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, +static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, - .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, + .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, }; void gart_iommu_shutdown(void) @@ -732,7 +738,8 @@ void __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; - unsigned long aper_size; + unsigned long aper_base, aper_size; + unsigned long start_pfn, end_pfn; unsigned long scratch; long i; @@ -764,30 +771,35 @@ void __init gart_iommu_init(void) (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "but GART IOMMU not available.\n"); + printk(KERN_WARNING "falling back to 
iommu=soft.\n"); } return; } + /* need to map that range */ + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { + start_pfn = (aper_base>>PAGE_SHIFT); + init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + } + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); - aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; - iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(iommu_pages/8)); if (!iommu_gart_bitmap) panic("Cannot allocate iommu bitmap\n"); - memset(iommu_gart_bitmap, 0, iommu_pages/8); #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); - else + if (!iommu_leak_tab) printk(KERN_DEBUG "PCI-DMA: Cannot allocate leak trace area\n"); } @@ -797,7 +809,7 @@ void __init gart_iommu_init(void) * Out of IOMMU space handling. * Reserve some invalid pages at the beginning of the GART. */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); agp_memory_reserved = iommu_size; printk(KERN_INFO @@ -855,7 +867,8 @@ void __init gart_parse_options(char *p) if (!strncmp(p, "leak", 4)) { leak_trace = 1; p += 4; - if (*p == '=') ++p; + if (*p == '=') + ++p; if (isdigit(*p) && get_option(&p, &arg)) iommu_leak_pages = arg; } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index aec43d56f49c..c70ab5a5d4c8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -7,14 +7,14 @@ #include <linux/dma-mapping.h> #include <linux/scatterlist.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/processor.h> #include <asm/dma.h> static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && bus + size > *hwdev->dma_mask) { + if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { if (*hwdev->dma_mask >= DMA_32BIT_MASK) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", @@ -72,21 +72,17 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -/* Make sure we keep the same behaviour */ -static int nommu_mapping_error(dma_addr_t dma_addr) +static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) { -#ifdef CONFIG_X86_32 - return 0; -#else - return (dma_addr == bad_dma_address); -#endif + free_pages((unsigned long)vaddr, get_order(size)); } - -const struct dma_mapping_ops nommu_dma_ops = { +struct dma_mapping_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, .map_single = nommu_map_single, .map_sg = nommu_map_sg, - .mapping_error = nommu_mapping_error, .is_phys = 1, }; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 82299cd1d04d..c4ce0332759e 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/dma-mapping.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> @@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device 
*hwdev, phys_addr_t paddr, size_t size, return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); } -const struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index bc1f2d3ea277..a311ffcaad16 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -1,20 +1,13 @@ #include <linux/platform_device.h> -#include <linux/errno.h> +#include <linux/err.h> #include <linux/init.h> static __init int add_pcspkr(void) { struct platform_device *pd; - int ret; - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; + return IS_ERR(pd) ? PTR_ERR(pd) : 0; } device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4d629c62f4f8..c622772744d8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -184,7 +184,8 @@ static void mwait_idle(void) static void poll_idle(void) { local_irq_enable(); - cpu_relax(); + while (!need_resched()) + cpu_relax(); } /* @@ -199,6 +200,7 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ +static int __cpuinitdata force_mwait; #define MWAIT_INFO 0x05 #define MWAIT_ECX_EXTENDED_INFO 0x01 @@ -244,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) return 1; } +static cpumask_t c1e_mask = CPU_MASK_NONE; +static int c1e_detected; + +void c1e_remove_cpu(int cpu) +{ + cpu_clear(cpu, c1e_mask); +} + /* * C1E aware idle routine. We check for C1E active in the interrupt * pending message MSR. 
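
(C1E is the AMD C1-Enhanced halt state; while it is active the TSC and the local APIC timer can stop, so the idle routine has to detect it and arrange for broadcast timer handling. The mask and the detected flag move to file scope here so the CPU-offline path can clear a dead CPU's bit through the new c1e_remove_cpu().) A condensed sketch of the per-CPU check, mirroring the code below:

	u32 lo, hi;

	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
	if (lo & K8_INTP_C1E_ACTIVE_MASK)
		c1e_detected = 1;
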
If we detect C1E, then we handle it the same @@ -251,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) */ static void c1e_idle(void) { - static cpumask_t c1e_mask = CPU_MASK_NONE; - static int c1e_detected; - if (need_resched()) return; @@ -263,8 +270,10 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - mark_tsc_unstable("TSC halt in C1E"); - printk(KERN_INFO "System has C1E enabled\n"); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } @@ -326,6 +335,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) static int __init idle_setup(char *str) { + if (!str) + return -EINVAL; + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0c3927accb00..0a1302fe6d45 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include <linux/tick.h> #include <linux/percpu.h> #include <linux/prctl.h> +#include <linux/dmi.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -55,6 +56,9 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> #include <asm/kdebug.h> +#include <asm/idle.h> +#include <asm/syscalls.h> +#include <asm/smp.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -72,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifdef CONFIG_HOTPLUG_CPU -#include <asm/nmi.h> - -static void cpu_exit_clear(void) -{ - int cpu = raw_smp_processor_id(); - - idle_task_exit(); - - cpu_uninit(); - irq_ctx_exit(cpu); - - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - - numa_remove_cpu(cpu); -} - -/* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) -{ - /* This must be done before dead CPU ack */ - cpu_exit_clear(); - wbinvd(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* - * With physical CPU hotplug, we should halt the cpu - */ - local_irq_disable(); - while (1) - halt(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. 
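
(The CPU-hotplug play_dead() variants disappear from this file — only the BUG() stub for !CONFIG_SMP builds remains, the real offline path being consolidated in the shared SMP code. The new argument to tick_nohz_stop_sched_tick() simply tells the nohz code that the caller really is the idle loop.) Condensed, the loop keeps its familiar shape:

	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched())
			idle();
		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
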
There's no useful work to be @@ -128,7 +97,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); @@ -154,12 +123,13 @@ void cpu_idle(void) } } -void __show_registers(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; + const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -172,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all) } printk("\n"); - printk("Pid: %d, comm: %s %s (%s %.*s)\n", + + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, board); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, @@ -215,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - __show_registers(regs, 1); + __show_regs(regs, 1); show_trace(NULL, regs, ®s->sp, regs->bp); } @@ -276,6 +250,14 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(current->thread.ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(current->thread.ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -437,6 +419,35 @@ int set_tsc_mode(unsigned int val) return 0; } +#ifdef CONFIG_X86_DS +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + unsigned long ds_prev = 0; + unsigned long ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + update_debugctlmsr(0); + wrmsr(MSR_IA32_DS_AREA, ds_next, 0); + } + return debugctl; +} +#else +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + return debugctl; +} +#endif /* CONFIG_X86_DS */ + static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) @@ -447,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; - debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); - } + debugctl = update_debugctl(prev, next, prev->debugctlmsr); if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -478,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif 
+#endif /* CONFIG_X86_PTRACE_BTS */ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a8e53626ac9a..c958120fb1b6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,11 +37,11 @@ #include <linux/kdebug.h> #include <linux/tick.h> #include <linux/prctl.h> +#include <linux/uaccess.h> +#include <linux/io.h> -#include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/system.h> -#include <asm/io.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/mmu_context.h> @@ -51,6 +51,7 @@ #include <asm/proto.h> #include <asm/ia32.h> #include <asm/idle.h> +#include <asm/syscalls.h> asmlinkage extern void ret_from_fork(void); @@ -62,6 +63,13 @@ void idle_notifier_register(struct notifier_block *n) { atomic_notifier_chain_register(&idle_notifier, n); } +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); void enter_idle(void) { @@ -85,29 +93,12 @@ void exit_idle(void) __exit_idle(); } -#ifdef CONFIG_HOTPLUG_CPU -DECLARE_PER_CPU(int, cpu_state); - -#include <asm/nmi.h> -/* We halt the CPU with physical CPU hotplug */ -static inline void play_dead(void) -{ - idle_task_exit(); - wbinvd(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - local_irq_disable(); - while (1) - halt(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. There's no useful work to be @@ -120,7 +111,7 @@ void cpu_idle(void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { rmb(); @@ -152,7 +143,7 @@ void cpu_idle(void) } /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs * regs) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@ -161,60 +152,65 @@ void __show_regs(struct pt_regs * regs) printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, - regs->flags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk("R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk("R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); - - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl 
%%es,%0" : "=r" (es)); + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + if (!all) + return; cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4(); - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs,fsindex,gs,gsindex,shadowgs); - printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); + printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) { - printk("CPU %d:", smp_processor_id()); - __show_regs(regs); + printk(KERN_INFO "CPU %d:", smp_processor_id()); + __show_regs(regs, 1); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } @@ -239,6 +235,14 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(t->ds_ctx)) { + /* we clear debugctl to make sure DS is not used. 
*/ + update_debugctlmsr(0); + ds_free(t->ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -314,10 +318,10 @@ void prepare_to_copy(struct task_struct *tsk) int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, - struct task_struct * p, struct pt_regs * regs) + struct task_struct *p, struct pt_regs *regs) { int err; - struct pt_regs * childregs; + struct pt_regs *childregs; struct task_struct *me = current; childregs = ((struct pt_regs *) @@ -362,10 +366,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); - else -#endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); - if (err) + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) goto out; } err = 0; @@ -472,13 +476,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, next = &next_p->thread; debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + +#ifdef CONFIG_X86_DS + { + unsigned long ds_prev = 0, ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* + * We clear debugctl to make sure DS + * is not in use when we change it: + */ + debugctl = 0; + update_debugctlmsr(0); + wrmsrl(MSR_IA32_DS_AREA, ds_next); + } } +#endif /* CONFIG_X86_DS */ if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -516,13 +534,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -537,14 +555,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter>5) + if (next_p->fpu_counter > 5) prefetch(next->xstate); /* @@ -552,13 +570,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ load_sp0(tss, next); - /* + /* * Switch DS and ES. * This won't pick up thread selector changes, but I guess that is ok. */ savesegment(es, prev->es); if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); + loadsegment(es, next->es); savesegment(ds, prev->ds); if (unlikely(next->ds | prev->ds)) @@ -584,50 +602,50 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_leave_lazy_cpu_mode(); - /* + /* * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. 
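
Condensed, the rules the restructured block below implements (shown for FS; GS is symmetric):

	any of fsindex, next->fsindex, prev->fs nonzero
		-> loadsegment(fs, next->fsindex)
	fsindex != 0 (user loaded a real selector)
		-> prev->fs = 0, since a selector-based base
		   supersedes the saved 64-bit base
	next->fs != 0 (next uses a 64-bit base)
		-> wrmsrl(MSR_FS_BASE, next->fs)
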
*/ - { - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) - prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) - prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* + * Check if the user used a selector != 0; if yes + * clear 64bit base, since overloaded base is always + * mapped to the Null selector + */ + if (fsindex) + prev->fs = 0; + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) + prev->gs = 0; } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; /* Must be after DS reload */ unlazy_fpu(prev_p); - /* + /* * Switch the PDA and FPU contexts. */ prev->usersp = read_pda(oldrsp); write_pda(oldrsp, next->usersp); - write_pda(pcurrent, next_p); + write_pda(pcurrent, next_p); write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* @@ -664,7 +682,7 @@ long sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { long error; - char * filename; + char *filename; filename = getname(name); error = PTR_ERR(filename); @@ -722,55 +740,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs) unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,ip; + u64 fp, ip; int count = 0; - if (!p || p == current || p->state==TASK_RUNNING) - return 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) return 0; fp = *(u64 *)(p->thread.sp); - do { + do { if (fp < (unsigned long)stack || - fp > (unsigned long)stack+THREAD_SIZE) - return 0; + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; ip = *(u64 *)(fp+8); if (!in_sched_functions(ip)) return ip; - fp = *(u64 *)fp; - } while (count++ < 16); + fp = *(u64 *)fp; + } while (count++ < 16); return 0; } long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) -{ - int ret = 0; +{ + int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (code) { case ARCH_SET_GS: if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. 
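
In short, the two cases handled below are:

	addr <= 0xffffffff:	set_32bit_tls(task, GS_TLS, addr);
				task->thread.gsindex = GS_TLS_SEL;
	addr >  0xffffffff:	task->thread.gsindex = 0;
				wrmsrl(MSR_KERNEL_GS_BASE, addr);

A GDT-based base only needs a segment register reload on context switch, while a full 64-bit base costs a WRMSR.
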
*/ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); + load_gs_index(GS_TLS_SEL); } - task->thread.gsindex = GS_TLS_SEL; + task->thread.gsindex = GS_TLS_SEL; task->thread.gs = 0; - } else { + } else { task->thread.gsindex = 0; task->thread.gs = addr; if (doit) { load_gs_index(0); ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); - } + } } put_cpu(); break; @@ -824,8 +842,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gs; - } - else + } else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 77040b6070e1..0a6d8c12e10d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -14,6 +14,7 @@ #include <linux/errno.h> #include <linux/ptrace.h> #include <linux/regset.h> +#include <linux/tracehook.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> @@ -39,7 +40,9 @@ enum x86_regset { REGSET_GENERAL, REGSET_FP, REGSET_XFP, + REGSET_IOPERM64 = REGSET_XFP, REGSET_TLS, + REGSET_IOPERM32, }; /* @@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 -static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); regno >>= 2; @@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child, return 0; } -#ifdef X86_BTS +/* + * These access the current or another (stopped) task's io permission + * bitmap for debugging or core dump. + */ +static int ioperm_active(struct task_struct *target, + const struct user_regset *regset) +{ + return target->thread.io_bitmap_max / regset->size; +} -static int ptrace_bts_get_size(struct task_struct *child) +static int ioperm_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) { - if (!child->thread.ds_area_msr) + if (!target->thread.io_bitmap_ptr) return -ENXIO; - return ds_get_bts_index((void *)child->thread.ds_area_msr); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + target->thread.io_bitmap_ptr, + 0, IO_BITMAP_BYTES); +} + +#ifdef CONFIG_X86_PTRACE_BTS +/* + * The configuration for a particular BTS hardware implementation. + */ +struct bts_configuration { + /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ + unsigned char sizeof_bts; + /* the size of a field in the BTS record in bytes */ + unsigned char sizeof_field; + /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ + unsigned long debugctl_mask; +}; +static struct bts_configuration bts_cfg; + +#define BTS_MAX_RECORD_SIZE (8 * 3) + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. 
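
For example, with the Core2 parameters defined further down (sizeof_field == 8, sizeof_bts == 24) a record is laid out as:

	bytes  0.. 7	from	source linear address
	bytes  8..15	to	destination linear address
	bytes 16..23	flags

so bts_get() resolves a field as *(unsigned long *)(base + 8 * field).
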
+ * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + */ + +enum bts_field { + bts_from = 0, + bts_to, + bts_flags, + + bts_escape = (unsigned long)-1, + bts_qual = bts_to, + bts_jiffies = bts_flags +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) +{ + base += (bts_cfg.sizeof_field * field); + return *(unsigned long *)base; } -static int ptrace_bts_read_record(struct task_struct *child, - long index, +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (bts_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} + +/* + * Translate a BTS record from the raw format into the bts_struct format + * + * out (out): bts_struct interpretation + * raw: raw BTS record + */ +static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) +{ + memset(out, 0, sizeof(*out)); + if (bts_get(raw, bts_from) == bts_escape) { + out->qualifier = bts_get(raw, bts_qual); + out->variant.jiffies = bts_get(raw, bts_jiffies); + } else { + out->qualifier = BTS_BRANCH; + out->variant.lbr.from_ip = bts_get(raw, bts_from); + out->variant.lbr.to_ip = bts_get(raw, bts_to); + } +} + +static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { struct bts_struct ret; - int retval; - int bts_end; - int bts_index; + const void *bts_record; + size_t bts_index, bts_end; + int error; - if (!child->thread.ds_area_msr) - return -ENXIO; + error = ds_get_bts_end(child, &bts_end); + if (error < 0) + return error; - if (index < 0) - return -EINVAL; - - bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); if (bts_end <= index) return -EINVAL; + error = ds_get_bts_index(child, &bts_index); + if (error < 0) + return error; + /* translate the ptrace bts index into the ds bts index */ - bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); - bts_index -= (index + 1); - if (bts_index < 0) - bts_index += bts_end; + bts_index += bts_end - (index + 1); + if (bts_end <= bts_index) + bts_index -= bts_end; + + error = ds_access_bts(child, bts_index, &bts_record); + if (error < 0) + return error; - retval = ds_read_bts((void *)child->thread.ds_area_msr, - bts_index, &ret); - if (retval < 0) - return retval; + ptrace_bts_translate_record(&ret, bts_record); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; @@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child, return sizeof(ret); } -static int ptrace_bts_clear(struct task_struct *child) -{ - if (!child->thread.ds_area_msr) - return -ENXIO; - - return ds_clear((void *)child->thread.ds_area_msr); -} - static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - int end, i; - void *ds = (void *)child->thread.ds_area_msr; - - if (!ds) - return -ENXIO; + struct bts_struct ret; + const unsigned char *raw; + size_t end, i; + int error; - end = ds_get_bts_index(ds); - if (end <= 0) - return end; + error = ds_get_bts_index(child, &end); + if (error < 0) + return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - for (i = 0; i < end; i++, out++) { - struct bts_struct ret; - 
int retval; + error = ds_access_bts(child, 0, (const void **)&raw); + if (error < 0) + return error; - retval = ds_read_bts(ds, i, &ret); - if (retval < 0) - return retval; + for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { + ptrace_bts_translate_record(&ret, raw); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; } - ds_clear(ds); + error = ds_clear_bts(child); + if (error < 0) + return error; return end; } +static void ptrace_bts_ovfl(struct task_struct *child) +{ + send_sig(child->thread.bts_ovfl_signal, child, 0); +} + static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int bts_size, ret = 0; - void *ds; + int error = 0; + error = -EOPNOTSUPP; + if (!bts_cfg.sizeof_bts) + goto errout; + + error = -EIO; if (cfg_size < sizeof(cfg)) - return -EIO; + goto errout; + error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - return -EFAULT; + goto errout; - if ((int)cfg.size < 0) - return -EINVAL; + error = -EINVAL; + if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && + !(cfg.flags & PTRACE_BTS_O_ALLOC)) + goto errout; - bts_size = 0; - ds = (void *)child->thread.ds_area_msr; - if (ds) { - bts_size = ds_get_bts_size(ds); - if (bts_size < 0) - return bts_size; - } - cfg.size = PAGE_ALIGN(cfg.size); + if (cfg.flags & PTRACE_BTS_O_ALLOC) { + ds_ovfl_callback_t ovfl = NULL; + unsigned int sig = 0; - if (bts_size != cfg.size) { - ret = ptrace_bts_realloc(child, cfg.size, - cfg.flags & PTRACE_BTS_O_CUT_SIZE); - if (ret < 0) + /* we ignore the error in case we were not tracing child */ + (void)ds_release_bts(child); + + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + goto errout; + + sig = cfg.signal; + ovfl = ptrace_bts_ovfl; + } + + error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); + if (error < 0) goto errout; - ds = (void *)child->thread.ds_area_msr; + child->thread.bts_ovfl_signal = sig; } - if (cfg.flags & PTRACE_BTS_O_SIGNAL) - ret = ds_set_overflow(ds, DS_O_SIGNAL); - else - ret = ds_set_overflow(ds, DS_O_WRAP); - if (ret < 0) + error = -EINVAL; + if (!child->thread.ds_ctx && cfg.flags) goto errout; if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= ds_debugctl_mask(); + child->thread.debugctlmsr |= bts_cfg.debugctl_mask; else - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; if (cfg.flags & PTRACE_BTS_O_SCHED) set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); else clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - ret = sizeof(cfg); + error = sizeof(cfg); out: if (child->thread.debugctlmsr) @@ -702,10 +803,10 @@ out: else clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - return ret; + return error; errout: - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); goto out; } @@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { - void *ds = (void *)child->thread.ds_area_msr; struct ptrace_bts_config cfg; + size_t end; + const void *base, *max; + int error; if (cfg_size < sizeof(cfg)) return -EIO; - memset(&cfg, 0, sizeof(cfg)); + error = ds_get_bts_end(child, &end); + if (error < 0) + return error; + + error = ds_access_bts(child, /* index = */ 0, &base); + if (error < 0) + return error; - if (ds) { - cfg.size = ds_get_bts_size(ds); + error = ds_access_bts(child, /* index = */ end, &max); + if (error < 0) + 
return error; - if (ds_get_overflow(ds) == DS_O_SIGNAL) - cfg.flags |= PTRACE_BTS_O_SIGNAL; + memset(&cfg, 0, sizeof(cfg)); + cfg.size = (max - base); + cfg.signal = child->thread.bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & ds_debugctl_mask()) - cfg.flags |= PTRACE_BTS_O_TRACE; + if (cfg.signal) + cfg.flags |= PTRACE_BTS_O_SIGNAL; - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) - cfg.flags |= PTRACE_BTS_O_SCHED; - } + if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && + child->thread.debugctlmsr & bts_cfg.debugctl_mask) + cfg.flags |= PTRACE_BTS_O_TRACE; - cfg.bts_size = sizeof(struct bts_struct); + if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) return -EFAULT; @@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } - static int ptrace_bts_write_record(struct task_struct *child, const struct bts_struct *in) { - int retval; + unsigned char bts_record[BTS_MAX_RECORD_SIZE]; - if (!child->thread.ds_area_msr) - return -ENXIO; + BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); - retval = ds_write_bts((void *)child->thread.ds_area_msr, in); - if (retval) - return retval; + memset(bts_record, 0, bts_cfg.sizeof_bts); + switch (in->qualifier) { + case BTS_INVALID: + break; - return sizeof(*in); -} + case BTS_BRANCH: + bts_set(bts_record, bts_from, in->variant.lbr.from_ip); + bts_set(bts_record, bts_to, in->variant.lbr.to_ip); + break; -static int ptrace_bts_realloc(struct task_struct *child, - int size, int reduce_size) -{ - unsigned long rlim, vm; - int ret, old_size; + case BTS_TASK_ARRIVES: + case BTS_TASK_DEPARTS: + bts_set(bts_record, bts_from, bts_escape); + bts_set(bts_record, bts_qual, in->qualifier); + bts_set(bts_record, bts_jiffies, in->variant.jiffies); + break; - if (size < 0) + default: return -EINVAL; - - old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); - if (old_size < 0) - return old_size; - - ret = ds_free((void **)&child->thread.ds_area_msr); - if (ret < 0) - goto out; - - size >>= PAGE_SHIFT; - old_size >>= PAGE_SHIFT; - - current->mm->total_vm -= old_size; - current->mm->locked_vm -= old_size; - - if (size == 0) - goto out; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->total_vm; - if (size <= 0) - goto out; } - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->locked_vm; - if (size <= 0) - goto out; - } - - ret = ds_allocate((void **)&child->thread.ds_area_msr, - size << PAGE_SHIFT); - if (ret < 0) - goto out; - - current->mm->total_vm += size; - current->mm->locked_vm += size; - -out: - if (child->thread.ds_area_msr) - set_tsk_thread_flag(child, TIF_DS_AREA_MSR); - else - clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); - - return ret; + /* The writing task will be the switched-to task on a context + * switch. It needs to write into the switched-from task's BTS + * buffer. 
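
A scheduling timestamp is stored as an escape record: the from slot carries the bts_escape marker, and the to/flags slots are reused for the qualifier and a jiffies value, as in the switch statement above:

	bts_set(rec, bts_from,    bts_escape);
	bts_set(rec, bts_qual,    BTS_TASK_ARRIVES);
	bts_set(rec, bts_jiffies, in->variant.jiffies);

Hence the unchecked write variant, which presumably skips the usual this-task-owns-the-buffer validation.
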
*/ + return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk, ptrace_bts_write_record(tsk, &rec); } -#endif /* X86_BTS */ + +static const struct bts_configuration bts_cfg_netburst = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<2)|(1<<3)|(1<<5) +}; + +static const struct bts_configuration bts_cfg_pentium_m = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<6)|(1<<7) +}; + +static const struct bts_configuration bts_cfg_core2 = { + .sizeof_bts = 8 * 3, + .sizeof_field = 8, + .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +}; + +static inline void bts_configure(const struct bts_configuration *cfg) +{ + bts_cfg = *cfg; +} + +void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { + case 0xD: + case 0xE: /* Pentium M */ + bts_configure(&bts_cfg_pentium_m); + break; + case 0xF: /* Core2 */ + case 0x1C: /* Atom */ + bts_configure(&bts_cfg_core2); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + case 0xF: + switch (c->x86_model) { + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + bts_configure(&bts_cfg_netburst); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} +#endif /* CONFIG_X86_PTRACE_BTS */ /* * Called by kernel/ptrace.c when detaching.. @@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - if (child->thread.ds_area_msr) { -#ifdef X86_BTS - ptrace_bts_realloc(child, 0, 0); -#endif - child->thread.debugctlmsr &= ~ds_debugctl_mask(); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - } +#ifdef CONFIG_X86_PTRACE_BTS + (void)ds_release_bts(child); + + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +#endif /* CONFIG_X86_PTRACE_BTS */ } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* * These bits need more cooking - not enabled yet: */ -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS case PTRACE_BTS_CONFIG: ret = ptrace_bts_config (child, data, (struct ptrace_bts_config __user *)addr); @@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_SIZE: - ret = ptrace_bts_get_size(child); + ret = ds_get_bts_index(child, /* pos = */ NULL); break; case PTRACE_BTS_GET: @@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ptrace_bts_clear(child); + ret = ds_clear_bts(child); break; case PTRACE_BTS_DRAIN: ret = ptrace_bts_drain (child, data, (struct bts_struct __user *) addr); break; -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ default: ret = ptrace_request(child, request, addr, data); @@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_IOPERM64] = { + .core_note_type = 
NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), .align = sizeof(long), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_64_view = { @@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = { .active = regset_tls_active, .get = regset_tls_get, .set = regset_tls_set }, + [REGSET_IOPERM32] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_32_view = { @@ -1357,9 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -#ifdef CONFIG_X86_32 - -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) { struct siginfo info; @@ -1368,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) memset(&info, 0, sizeof(info)); info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; + info.si_code = si_code; /* User-mode ip? */ info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; @@ -1377,143 +1508,83 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) force_sig_info(SIGTRAP, &info, tsk); } -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_ax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), - regs->ax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. 
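
(This entire 32-bit do_syscall_trace() path is removed below; both bitnesses now share the tracehook-based syscall_trace_enter()/syscall_trace_leave() pair, in which the entry hook's return value is the syscall number to dispatch and -1L means run no syscall at all. An illustrative sketch of that contract — call_syscall() is a hypothetical stand-in for the sys_call_table dispatch done in the real entry assembly:

	nr = syscall_trace_enter(regs);
	if (nr != -1L)
		regs->ax = call_syscall(nr, regs);
)
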
-brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, - regs->bx, regs->cx, regs->dx, regs->si); - if (ret == 0) - return 0; - - regs->orig_ax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - return 1; -} - -#else /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_32 +# define IS_IA32 1 +#elif defined CONFIG_IA32_EMULATION +# define IS_IA32 test_thread_flag(TIF_IA32) +#else +# define IS_IA32 0 +#endif -static void syscall_trace(struct pt_regs *regs) +/* + * We must return the syscall number to actually look up in the table. + * This can be -1L to skip running any syscall at all. + */ +asmregparm long syscall_trace_enter(struct pt_regs *regs) { + long ret = 0; -#if 0 - printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} + if (test_thread_flag(TIF_SINGLESTEP)) + regs->flags |= X86_EFLAGS_TF; -asmlinkage void syscall_trace_enter(struct pt_regs *regs) -{ /* do the secure computing check first */ secure_computing(regs->orig_ax); - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + ret = -1L; + + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { + if (IS_IA32) audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, regs->bx, regs->cx, regs->dx, regs->si); - } else { +#ifdef CONFIG_X86_64 + else audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_ax, regs->di, regs->si, regs->dx, regs->r10); - } +#endif } + + return ret ?: regs->orig_ax; } -asmlinkage void syscall_trace_leave(struct pt_regs *regs) +asmregparm void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); -} + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, 0); -#endif /* CONFIG_X86_32 */ + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(), so don't do any more now. 
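
The single-step case that does fall through ends by synthesizing the debug trap with the reworked helper, which now takes an explicit si_code instead of hardwiring TRAP_BRKPT internally:

	send_sigtrap(current, regs, 0, TRAP_BRKPT);
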
+ */ + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + return; + + /* + * If we are single-stepping, synthesize a trap to follow the + * system call instruction. + */ + if (test_thread_flag(TIF_SINGLESTEP) && + tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) + send_sigtrap(current, regs, 0, TRAP_BRKPT); +} diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a0325a..4f9c55f3a7c0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, return dst->version; } +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 pv_tsc_khz = 1000000ULL << 32; + + do_div(pv_tsc_khz, src->tsc_to_system_mul); + if (src->tsc_shift < 0) + pv_tsc_khz <<= -src->tsc_shift; + else + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d13858818100..67465ed89310 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) if (!(word & (1 << 13))) { dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " "disabling irq balancing and affinity\n"); -#ifdef CONFIG_IRQBALANCE - irqbalance_disable(""); -#endif noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; @@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void) printk(KERN_DEBUG "Force enabled HPET at resume\n"); } +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + u32 d; + u8 b; + + pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + pci_write_config_byte(dev, 0xac, b); + pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + pci_write_config_dword(dev, 0x70, d); + pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + return d; +} + static void ati_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 d, val; + u8 b; if (hpet_address || force_hpet_address) return; @@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev) return; } + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ pci_write_config_dword(dev, 0x14, 0xfed00000); pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + force_hpet_address = val; force_hpet_resume_type = ATI_FORCE_HPET_RESUME; dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; - return; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, ati_force_enable_hpet); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8a62160e151..f4c93f1cfc19 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +/* + * Keyboard reset and triple fault may result in INIT, not RESET, which + * doesn't work when we're in vmx root mode. Try ACPI first. 
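 * If the ACPI reset mechanism is unavailable or has no effect, the
 * reboot loop still falls back to the legacy methods (keyboard
 * controller, then triple fault), so this only changes the order in
 * which they are tried.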
+ */ +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -177,6 +181,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), }, }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, { /* Handle problems with rebooting on HP laptops */ .callback = set_bios_reboot, .ident = "HP Compaq Laptop", @@ -403,10 +415,9 @@ void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP - int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; + int reboot_cpu_id = 0; #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c30fe25d470d..6f50664b2ba5 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -20,11 +20,45 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define PAE_PGD_ATTR (_PAGE_PRESENT) +/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + .text .align PAGE_SIZE .globl relocate_kernel relocate_kernel: - movl 8(%esp), %ebp /* list of pages */ + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) #ifdef CONFIG_X86_PAE /* map the control page at its virtual address */ @@ -138,15 +172,25 @@ relocate_kernel: relocate_new_kernel: /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* list of pages */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ /* zero out flags, and disable interrupts */ pushl $0 popfl + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + /* get physical address of control page now */ /* this is impossible after page table switch */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi @@ -197,8 +241,90 @@ identity_mapped: xorl %eax, %eax movl %eax, %cr3 + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. 
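 * (A MOV to CR3 is architecturally serializing on x86, and it also
 * discards any stale TLB entries for the pages rewritten above.)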
+ * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ret +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $(1<<31), %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ret + +virtual_mapped: + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ret + /* Do the copies */ - movl %ebx, %ecx +swap_pages: + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx jmp 1f 0: /* top, read another word from the indirection page */ @@ -226,27 +352,31 @@ identity_mapped: movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi movl $1024, %ecx rep ; movsl - jmp 0b - -3: - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB, it's handy, and not processor dependent. - */ - xorl %eax, %eax - movl %eax, %cr3 + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl - /* set all of the registers to known values */ - /* leave %esp alone */ + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl - xorl %eax, %eax - xorl %ebx, %ebx - xorl %ecx, %ecx - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - xorl %ebp, %ebp + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . 
- relocate_kernel diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 05191bbc68b8..dd6f2b71561b 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds,RTC_SECONDS); CMOS_WRITE(real_minutes,RTC_MINUTES); @@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void) WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); } if (century) { - BCD_TO_BIN(century); + century = bcd2bin(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); } else @@ -223,11 +223,25 @@ static struct platform_device rtc_device = { static __init int add_rtc_cmos(void) { #ifdef CONFIG_PNP - if (!pnp_platform_devices) - platform_device_register(&rtc_device); -#else + static const char *ids[] __initconst = + { "PNP0b00", "PNP0b01", "PNP0b02", }; + struct pnp_dev *dev; + struct pnp_id *id; + int i; + + pnp_for_each_dev(dev) { + for (id = dev->id; id; id = id->next) { + for (i = 0; i < ARRAY_SIZE(ids); i++) { + if (compare_pnp_id(id, ids[i]) != 0) + return 0; + } + } + } +#endif + platform_device_register(&rtc_device); -#endif /* CONFIG_PNP */ + dev_info(&rtc_device.dev, + "registered platform RTC device (no PNP device found)\n"); return 0; } device_initcall(add_rtc_cmos); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 531b55b8e81a..0fa6790c1dd3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -57,12 +57,8 @@ #include <linux/slab.h> #include <linux/user.h> #include <linux/delay.h> -#include <linux/highmem.h> #include <linux/kallsyms.h> -#include <linux/edd.h> -#include <linux/iscsi_ibft.h> -#include <linux/kexec.h> #include <linux/cpufreq.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> @@ -96,7 +92,7 @@ #include <asm/smp.h> #include <asm/desc.h> #include <asm/dma.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/mmu_context.h> #include <asm/proto.h> @@ -104,7 +100,6 @@ #include <asm/paravirt.h> #include <asm/percpu.h> -#include <asm/sections.h> #include <asm/topology.h> #include <asm/apicdef.h> #ifdef CONFIG_X86_64 @@ -228,6 +223,9 @@ unsigned long saved_video_mode; #define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifdef CONFIG_CMDLINE_BOOL +static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -304,7 +302,7 @@ static void __init relocate_initrd(void) if (clen > MAX_MAP_CHUNK-slop) clen = MAX_MAP_CHUNK-slop; mapaddr = ramdisk_image & PAGE_MASK; - p = early_ioremap(mapaddr, clen+slop); + p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); 
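		/*
		 * early_memremap() can only map a small window at a time,
		 * so the image is copied in MAX_MAP_CHUNK-sized pieces:
		 * each chunk is mapped, copied out, and then unmapped
		 * again below before the next chunk is processed.
		 */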
early_iounmap(p, clen+slop); q += clen; @@ -381,7 +379,7 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); + data = early_memremap(pa_data, PAGE_SIZE); switch (data->type) { case SETUP_E820_EXT: parse_e820_ext(data, pa_data); @@ -404,7 +402,7 @@ static void __init e820_reserve_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -430,7 +428,7 @@ static void __init reserve_early_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; @@ -450,7 +448,7 @@ static void __init reserve_early_setup_data(void) * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. */ -unsigned long long find_and_reserve_crashkernel(unsigned long long size) +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long start = 0LL; @@ -563,7 +561,13 @@ static void __init reserve_standard_io_resources(void) } -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ + +#ifdef CONFIG_CRASH_DUMP /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed * by kexec loader to the capture kernel. @@ -579,6 +583,194 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static struct x86_quirks default_x86_quirks __initdata; + +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 
0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +static void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + +static struct timer_list periodic_check_timer; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for(i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for(; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); +} + +void start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} +#endif + +static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE + "%s detected: BIOS may corrupt low RAM, working it around.\n", + d->ident); + + e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + return 0; +} + +/* List of systems that have known low memory corruption BIOS problems */ +static struct dmi_system_id __initdata bad_bios_dmi_table[] = { +#ifdef CONFIG_X86_RESERVE_LOW_64K + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + }, + }, + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + }, + }, +#endif + {} +}; + /* * Determine if we were loaded by an EFI loader. 
If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -598,11 +790,11 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); pre_setup_arch_hook(); - early_cpu_init(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + early_cpu_init(); early_ioremap_init(); ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); @@ -666,14 +858,36 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; -#ifdef CONFIG_X86_64 - early_cpu_init(); +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } #endif +#endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; parse_early_param(); +#ifdef CONFIG_X86_64 + check_efer(); +#endif + +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) + /* + * Must be before kernel pagetables are setup + * or fixmap area is touched. + */ + vmi_init(); +#endif + /* after early param, so could get panic from serial */ reserve_early_setup_data(); @@ -681,7 +895,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; #endif - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); } #ifdef CONFIG_PCI @@ -691,6 +905,10 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + dmi_scan_machine(); + + dmi_check_system(bad_bios_dmi_table); + #ifdef CONFIG_X86_32 probe_roms(); #endif @@ -734,7 +952,8 @@ void __init setup_arch(char **cmdline_p) #else num_physpages = max_pfn; - check_efer(); + if (cpu_has_x2apic) + check_x2apic(); /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ @@ -746,6 +965,10 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); max_pfn_mapped = max_low_pfn_mapped; @@ -774,8 +997,6 @@ void __init setup_arch(char **cmdline_p) vsmp_init(); #endif - dmi_scan_machine(); - io_delay_init(); /* @@ -783,6 +1004,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); + early_acpi_boot_init(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. @@ -792,10 +1015,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn); -#ifdef CONFIG_X86_64 - dma32_reserve_bootmem(); -#endif - #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. 
@@ -810,21 +1029,25 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); +#ifdef CONFIG_X86_64 + /* + * dma32_reserve_bootmem() allocates bootmem which may conflict + * with the crashkernel command line, so do that after + * reserve_crashkernel() + */ + dma32_reserve_bootmem(); +#endif + reserve_ibft_region(); #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be after max_low_pfn is determined, and before kernel - * pagetables are setup. - */ - vmi_init(); -#endif - + paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); + paravirt_pagetable_setup_done(swapper_pg_dir); + paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); @@ -850,27 +1073,17 @@ void __init setup_arch(char **cmdline_p) #endif prefill_possible_map(); + #ifdef CONFIG_X86_64 init_cpu_to_node(); #endif -#ifdef CONFIG_X86_NUMAQ - /* - * need to check online nodes num, call it - * here before time_init/tsc_init - */ - numaq_tsc_disable(); -#endif - init_apic_mappings(); ioapic_init_mappings(); -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) - if (def_to_bigsmp) - printk(KERN_WARNING "More than 8 CPUs detected and " - "CONFIG_X86_PC cannot handle it.\nUse " - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); -#endif + /* need to wait for io_apic is mapped */ + nr_irqs = probe_nr_irqs(); + kvm_guest_init(); e820_reserve_resources(); @@ -892,3 +1105,5 @@ void __init setup_arch(char **cmdline_p) #endif #endif } + + diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index cac68430d31f..ae0c0d3bb770 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -158,35 +140,47 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size = PERCPU_ENOUGH_ROOM; + ssize_t size, old_size; char *ptr; int cpu; + unsigned long align = 1; /* Setup cpu_pda map */ setup_cpu_pda_map(); /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; + old_size = PERCPU_ENOUGH_ROOM; + align = max_t(unsigned long, PAGE_SIZE, align); + size = roundup(old_size, align); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); #else int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", cpu, node); + if (ptr) + printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } + else { + ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, + __pa(MAX_DMA_ADDRESS)); + if (ptr) + printk(KERN_DEBUG 
"per cpu data for cpu%d on node%d at %016lx\n", + cpu, node, __pa(ptr)); } - else - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", @@ -197,9 +191,6 @@ void __init setup_per_cpu_areas(void) /* Setup node to cpumask map */ setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); } #endif @@ -227,8 +218,8 @@ static void __init setup_node_to_cpumask_map(void) /* allocate the map */ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); + pr_debug("Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); /* node_to_cpumask() will now work */ node_to_cpumask_map = map; @@ -248,7 +239,7 @@ void __cpuinit numa_set_node(int cpu, int node) per_cpu(x86_cpu_to_node_map, cpu) = node; else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); + pr_debug("Setting node for non-present cpu %d\n", cpu); } void __cpuinit numa_clear_node(int cpu) diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index 72bbb519d2dc..cc673aa55ce4 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -3,9 +3,18 @@ struct sigframe { char __user *pretcode; int sig; struct sigcontext sc; - struct _fpstate fpstate; + /* + * fpstate is unused. fpstate is moved/allocated after + * retcode[] below. This movement allows to have the FP state and the + * future state extensions (xsave) stay together. + * And at the same time retaining the unused fpstate, prevents changing + * the offset of extramask[] in the sigframe and thus prevent any + * legacy application accessing/modifying it. 
+ */ + struct _fpstate fpstate_unused; unsigned long extramask[_NSIG_WORDS-1]; char retcode[8]; + /* fp state follows here */ }; struct rt_sigframe { @@ -15,13 +24,19 @@ struct rt_sigframe { void __user *puc; struct siginfo info; struct ucontext uc; - struct _fpstate fpstate; char retcode[8]; + /* fp state follows here */ }; #else struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; + /* fp state follows here */ }; + +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs *regs); #endif diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d92373630963..d6dd057d0f22 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/wait.h> +#include <linux/tracehook.h> #include <linux/elf.h> #include <linux/smp.h> #include <linux/mm.h> @@ -26,6 +27,8 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/vdso.h> +#include <asm/syscall.h> +#include <asm/syscalls.h> #include "sigframe.h" @@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx) return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} /* * Do a signal return; undo the signal stack. 
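 * restore_sigcontext() copies the saved register image back from the
 * user-space sigcontext; the COPY(), COPY_SEG(), COPY_SEG_STRICT()
 * and GET_SEG() helpers above just wrap the repetitive __get_user()
 * accesses (COPY_SEG_STRICT also ORs in RPL 3 to force the selector
 * back into user mode).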
@@ -118,28 +142,13 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) - -#define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; } - -#define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp|3; } - -#define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); } - GET_SEG(gs); COPY_SEG(fs); COPY_SEG(es); @@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); - { - unsigned int tmpflags; - - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | - (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - { - struct _fpstate __user *buf; - - err |= __get_user(buf, &sc->fpstate); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; - -badframe: - return 1; } asmlinkage unsigned long sys_sigreturn(unsigned long __unused) @@ -212,7 +198,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) badframe: if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" + printk("%s%s[%d] bad frame in sigreturn frame:" "%p ip:%lx sp:%lx oeax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, current->comm, task_pid_nr(current), frame, regs->ip, @@ -226,9 +212,8 @@ badframe: return 0; } -asmlinkage int sys_rt_sigreturn(unsigned long __unused) +static long do_rt_sigreturn(struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *)&__unused; struct rt_sigframe __user *frame; unsigned long ax; sigset_t set; @@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) return ax; badframe: - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. */ static int -setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, +setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { int tmp, err = 0; @@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, err |= __put_user(regs->sp, &sc->sp_at_signal); err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); - tmp = save_i387(fpstate); + tmp = save_i387_xstate(fpstate); if (tmp < 0) err = 1; else @@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, * Determine which stack to use.. 
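 * (the alternate signal stack if SA_ONSTACK requests it and we are
 * not already running on it, otherwise the current user stack; when
 * the task has used the FPU, sig_xstate_size bytes are carved out
 * first, which is why get_sigframe() now also hands back the fpstate
 * pointer)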
*/ static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) +get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, + void **fpstate) { unsigned long sp; @@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) sp = (unsigned long) ka->sa.sa_restorer; } + if (used_math()) { + sp = sp - sig_xstate_size; + *fpstate = (struct _fpstate *) sp; + } + sp -= frame_size; /* * Align the stack pointer according to the i386 ABI, @@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) } static int -setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, - struct pt_regs *regs) +__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) { struct sigframe __user *frame; void __user *restorer; int err = 0; - int usig; + void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; + if (__put_user(sig, &frame->sig)) + return -EFAULT; - err = __put_user(usig, &frame->sig); - if (err) - goto give_sigsegv; - - err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); - if (err) - goto give_sigsegv; + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; if (_NSIG_WORDS > 1) { - err = __copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) - goto give_sigsegv; + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; } if (current->mm->context.vdso) @@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *restorer; int err = 0; - int usig; + void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; + return -EFAULT; - err |= __put_user(usig, &frame->sig); + err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. 
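 * uc_flags carries UC_FP_XSTATE when the CPU supports xsave, so user
 * space and sigreturn can distinguish an extended xsave frame from a
 * legacy fpstate frame.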
*/ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)usig; + regs->ax = (unsigned long)sig; regs->dx = (unsigned long)&frame->info; regs->cx = (unsigned long)&frame->uc; @@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* * OK, we're invoking a handler: */ +static int signr_convert(int sig) +{ + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; + return sig; +} + +#define is_ia32 1 +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame + +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int usig = signr_convert(sig); + int ret; + + /* Set up the stack frame */ + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + else + ret = ia32_setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); + + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + + return ret; +} + static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) @@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = setup_frame(sig, ka, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret) return ret; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. 
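 * (If the interrupted kernel code had switched to KERNEL_DS, leaving
 * that in place would let later uaccess from this task pass kernel
 * addresses unchecked.)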
+ */ + set_fs(USER_DS); +#endif + /* * Clear the direction flag as per the ABI for function entry. */ @@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * handler too. */ regs->flags &= ~X86_EFLAGS_TF; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); @@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + return 0; } +#define NR_restart_syscall __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: @@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } @@ -657,18 +679,38 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } +#ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, me); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e7..a5c9627f4db9 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -15,17 +15,21 @@ #include <linux/errno.h> #include <linux/wait.h> #include <linux/ptrace.h> +#include <linux/tracehook.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/personality.h> #include <linux/compiler.h> +#include <linux/uaccess.h> + #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/uaccess.h> #include <asm/i387.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> #include <asm/mce.h> +#include <asm/syscall.h> +#include <asm/syscalls.h> #include "sigframe.h" #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) @@ -41,11 +45,6 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif -int ia32_setup_rt_frame(int sig, 
struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs); - asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -53,6 +52,15 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} /* * Do a signal return; undo the signal stack. @@ -61,13 +69,13 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(r8); @@ -82,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. */ - { - unsigned cs; - err |= __get_user(cs, &sc->cs); - regs->cs = cs | 3; /* Force into user mode */ - } + COPY_SEG_STRICT(cs); - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); - - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; - -badframe: - return 1; } -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +static long do_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; - sigset_t set; unsigned long ax; + sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -136,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) current->blocked = set; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; @@ -146,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return ax; badframe: - signal_fault(regs,frame,"sigreturn"); + signal_fault(regs, frame, "rt_sigreturn"); return 0; -} +} + +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} /* * Set up a signal frame. 
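 * Note that get_stack() below now rounds the frame down to a 64-byte
 * boundary rather than 16: the xsave area saved with the frame must
 * be 64-byte aligned for XSAVE/XRSTOR.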
*/ static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) +setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, + unsigned long mask, struct task_struct *me) { int err = 0; @@ -207,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) sp = current->sas_ss_sp + current->sas_ss_size; } - return (void __user *)round_down(sp - size, 16); + return (void __user *)round_down(sp - size, 64); } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - struct _fpstate __user *fp = NULL; + void __user *fp = NULL; int err = 0; struct task_struct *me = current; if (used_math()) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); + fp = get_stack(ka, regs, sig_xstate_size); frame = (void __user *)round_down( (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) - goto give_sigsegv; - - if (save_i387(fp) < 0) - err |= -1; + if (save_i387_xstate(fp) < 0) + return -EFAULT; } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto give_sigsegv; + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, info)) + return -EFAULT; } - + /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), @@ -249,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { + if (sizeof(*set) == 16) { __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); } else err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); @@ -262,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ - goto give_sigsegv; + return -EFAULT; } if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->di = sig; - /* In case the signal handler was declared without prototypes */ + /* In case the signal handler was declared without prototypes */ regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the @@ -286,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* - * Return -1L or the syscall number that @regs is executing. 
+ * OK, we're invoking a handler */ -static long current_syscall(struct pt_regs *regs) +static int signr_convert(int sig) { - /* - * We always sign-extend a -1 value being set here, - * so this is always either -1L or a syscall number. - */ - return regs->orig_ax; + return sig; } -/* - * Return a value that is -EFOO if the system call in @regs->orig_ax - * returned an error. This only works for @regs from @current. - */ -static long current_syscall_ret(struct pt_regs *regs) -{ #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - /* - * Sign-extend the value so (int)-EFOO becomes (long)-EFOO - * and will match correctly in comparisons. - */ - return (int) regs->ax; +#define is_ia32 test_thread_flag(TIF_IA32) +#else +#define is_ia32 0 #endif - return regs->ax; -} -/* - * OK, we're invoking a handler - */ +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int usig = signr_convert(sig); + int ret; + + /* Set up the stack frame */ + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + else + ret = ia32_setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); + + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + + return ret; +} static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, @@ -332,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if (current_syscall(regs) >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (current_syscall_ret(regs)) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -361,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); + if (ret) + return ret; - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked,sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } + /* + * Clear the direction flag as per the ABI for function entry. 
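 * (The psABI guarantees DF is clear on function entry, so compiler-
 * generated string operations assume forward copies; entering the
 * handler with DF set would break such code.)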
+ */ + regs->flags &= ~X86_EFLAGS_DF; - return ret; + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; + + spin_lock_irq(¤t->sighand->siglock); + sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(¤t->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + + return 0; } +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -434,7 +422,8 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* Re-enable any watchpoints before delivering the + /* + * Re-enable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered * inside the kernel. @@ -442,7 +431,7 @@ static void do_signal(struct pt_regs *regs) if (current->thread.debugreg7) set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ + /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* * A signal was successfully delivered; the saved @@ -456,19 +445,18 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (current_syscall(regs) >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (current_syscall_ret(regs)) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; + case -ERESTART_RESTARTBLOCK: - regs->ax = test_thread_flag(TIF_IA32) ? - __NR_ia32_restart_syscall : - __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } @@ -484,38 +472,45 @@ static void do_signal(struct pt_regs *regs) } } -void do_notify_resume(struct pt_regs *regs, void *unused, - __u32 thread_info_flags) +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? 
*/ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - -#ifdef CONFIG_X86_MCE +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); -#endif /* CONFIG_X86_MCE */ +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + +#ifdef CONFIG_X86_32 + clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; +{ + struct task_struct *me = current; + if (show_unhandled_signals && printk_ratelimit()) { - printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk("\n"); + printk(KERN_CONT "\n"); } - force_sig(SIGSEGV, me); -} + force_sig(SIGSEGV, me); +} diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 361b7a4c640c..18f9b19f5f8f 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, - .cpu_up = native_cpu_up, .smp_cpus_done = native_smp_cpus_done, .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e8..7b1093397319 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -52,6 +52,7 @@ #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> +#include <asm/idle.h> #include <asm/smp.h> #include <asm/trampoline.h> #include <asm/cpu.h> @@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) #else -struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; +static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) #define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif @@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); static atomic_t init_deasserted; -static int boot_cpu_logical_apicid; /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; /* Set if we find a B stepping CPU */ -int __cpuinitdata smp_b_stepping; +static int __cpuinitdata smp_b_stepping; #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) @@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu) #endif #ifdef CONFIG_X86_32 +static int boot_cpu_logical_apicid; + u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = BAD_APICID }; @@ -210,13 +212,13 @@ static void __cpuinit smp_callin(void) /* * (This works even if the APIC is not enabled.) */ - phys_id = GET_APIC_ID(read_apic_id()); + phys_id = read_apic_id(); cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, phys_id, cpuid); } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); /* * STARTUP IPIs are fragile beasts as they might sometimes @@ -251,12 +253,13 @@ static void __cpuinit smp_callin(void) * boards) */ - Dprintk("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC().\n"); smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); map_cpu_to_logical_apicid(); + notify_cpu_starting(cpuid); /* * Get our bogomips. * @@ -266,7 +269,7 @@ static void __cpuinit smp_callin(void) local_irq_enable(); calibrate_delay(); local_irq_disable(); - Dprintk("Stack at about %p\n", &cpuid); + pr_debug("Stack at about %p\n", &cpuid); /* * Save our processor parameters @@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static int __cpuinitdata unsafe_smp; + /* * Activate a secondary processor. */ @@ -326,15 +331,22 @@ static void __cpuinit start_secondary(void *unused) * for which cpus receive the IPI. Holding this * lock helps us to not include this cpu in a currently in progress * smp_call_function(). + * + * We need to hold vector_lock so there the set of online cpus + * does not change while we are assigning vectors to cpus. Holding + * this lock ensures we don't half assign or remove an irq from a cpu. */ - ipi_call_lock_irq(); -#ifdef CONFIG_X86_IO_APIC - setup_vector_irq(smp_processor_id()); -#endif + ipi_call_lock(); + lock_vector_lock(); + __setup_vector_irq(smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); - ipi_call_unlock_irq(); + unlock_vector_lock(); + ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + /* enable local interrupts */ + local_irq_enable(); + setup_secondary_clock(); wmb(); @@ -387,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) goto valid_k7; /* If we get here, not a certified SMP capable AMD system. 
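The smp_ops table extended in smp.c above grows cpu_die/cpu_disable/play_dead hooks precisely so a paravirtualized port can substitute its own CPU-offlining behaviour. The underlying idiom is just a struct of function pointers filled in with native defaults; a self-contained sketch of the idea (all names here are invented for illustration):

    #include <stdio.h>

    struct cpu_ops {                        /* hypothetical miniature of smp_ops */
        void (*cpu_up)(unsigned int cpu);
        void (*play_dead)(void);
    };

    static void native_cpu_up(unsigned int cpu) { printf("native: boot CPU%u\n", cpu); }
    static void native_play_dead(void)          { printf("native: halt with caches flushed\n"); }

    static struct cpu_ops ops = {           /* boot-time defaults, as in struct smp_ops smp_ops */
        .cpu_up    = native_cpu_up,
        .play_dead = native_play_dead,
    };

    static void para_play_dead(void) { printf("paravirt: return vcpu to hypervisor\n"); }

    int main(void)
    {
        ops.play_dead = para_play_dead;     /* a guest port overrides just one hook */
        ops.cpu_up(1);
        ops.play_dead();
        return 0;
    }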
*/ - add_taint(TAINT_UNSAFE_SMP); + unsafe_smp = 1; } valid_k7: @@ -404,12 +416,10 @@ static void __cpuinit smp_checks(void) * Don't taint if we are running SMP kernel on a single non-MP * approved Athlon */ - if (tainted & TAINT_UNSAFE_SMP) { - if (num_online_cpus()) - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - else - tainted &= ~TAINT_UNSAFE_SMP; + if (unsafe_smp && num_online_cpus() > 1) { + printk(KERN_INFO "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + add_taint(TAINT_UNSAFE_SMP); } } @@ -438,7 +448,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (c->phys_proc_id == cpu_data(i).phys_proc_id && c->cpu_core_id == cpu_data(i).cpu_core_id) { cpu_set(i, per_cpu(cpu_sibling_map, cpu)); @@ -461,7 +471,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) return; } - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpu_set(i, c->llc_shared_map); @@ -513,7 +523,7 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - Dprintk("Before bogomips.\n"); + pr_debug("Before bogomips.\n"); for_each_possible_cpu(cpu) if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data(cpu).loops_per_jiffy; @@ -523,7 +533,7 @@ static void impress_friends(void) bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - Dprintk("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1.\n"); } static inline void __inquire_remote_apic(int apicid) @@ -533,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); + printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. @@ -546,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid) printk(KERN_CONT "a previous APIC delivery may have failed\n"); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_icr_write(APIC_DM_REMRD | regs[i], apicid); timeout = 0; do { @@ -579,29 +588,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) int maxlvt; /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); - /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (APIC_INTEGRATED(apic_version[phys_apicid])) { + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. 
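The Dprintk() calls rewritten throughout these hunks become pr_debug(), which expands to nothing unless the translation unit is built with DEBUG defined. A rough userspace analogue of that compile-time switch (simplified: the real macro also routes through printk(KERN_DEBUG ...) and dynamic debug):

    #include <stdio.h>

    #ifdef DEBUG
    #define pr_debug(fmt, ...) fprintf(stderr, "debug: " fmt, ##__VA_ARGS__)
    #else
    /* Compiled away, but the arguments stay type-checked. */
    #define pr_debug(fmt, ...) do { if (0) fprintf(stderr, fmt, ##__VA_ARGS__); } while (0)
    #endif

    int main(void)
    {
        int cpu = 1;

        pr_debug("CPU#%d waiting for CALLOUT\n", cpu); /* silent unless built with -DDEBUG */
        return 0;
    }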
*/ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); } - accept_status = (apic_read(APIC_ESR) & 0xEF); - Dprintk("NMI sent.\n"); + pr_debug("NMI sent.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -625,42 +629,40 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return send_status; } + maxlvt = lapic_get_maxlvt(); + /* * Be paranoid about clearing APIC errors. */ if (APIC_INTEGRATED(apic_version[phys_apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } - Dprintk("Asserting INIT.\n"); + pr_debug("Asserting INIT.\n"); /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, + phys_apicid); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mdelay(10); - Dprintk("Deasserting INIT.\n"); + pr_debug("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mb(); @@ -687,55 +689,46 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - Dprintk("#startup loops: %d.\n", num_starts); - - maxlvt = lapic_get_maxlvt(); + pr_debug("#startup loops: %d.\n", num_starts); for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n", j); - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + pr_debug("Sending STARTUP #%d.\n", j); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); + pr_debug("After apic_write.\n"); /* * STARTUP IPI */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_eip >> 12)); + apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), + phys_apicid); /* * Give the other CPU some time to accept the IPI. */ udelay(300); - Dprintk("Startup point 1.\n"); + pr_debug("Startup point 1.\n"); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); if (send_status || accept_status) break; } - Dprintk("After Startup.\n"); + pr_debug("After Startup.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -763,12 +756,20 @@ static void __cpuinit do_fork_idle(struct work_struct *work) } #ifdef CONFIG_X86_64 + +/* __ref because it's safe to call free_bootmem when after_bootmem == 0. 
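wakeup_secondary_cpu() above still performs the classic INIT / INIT-deassert / double-SIPI sequence; the patch only funnels the ICR programming through apic_icr_write(). A mocked walk-through of that ordering, with apic_icr_write() stubbed to a printf and all delays, ESR checks, and integrated-APIC tests elided (constants copied from apicdef.h):

    #include <stdio.h>

    #define APIC_DM_INIT       0x00500UL
    #define APIC_DM_STARTUP    0x00600UL
    #define APIC_INT_LEVELTRIG 0x08000UL
    #define APIC_INT_ASSERT    0x04000UL

    /* Stub: the real helper programs ICR2 then ICR in the local APIC. */
    static void apic_icr_write(unsigned long low, unsigned int apicid)
    {
        printf("ICR <- 0x%05lx, dest APIC 0x%x\n", low, apicid);
    }

    static void wakeup_cpu(unsigned int apicid, unsigned long start_eip)
    {
        /* Assert INIT, then deassert it. */
        apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, apicid);
        apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, apicid);

        /* Two STARTUP IPIs carrying the page number of the real-mode entry point. */
        for (int j = 1; j <= 2; j++)
            apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), apicid);
    }

    int main(void)
    {
        wakeup_cpu(1, 0x9a000UL);           /* trampoline below 1 MiB */
        return 0;
    }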
*/ +static void __ref free_bootmem_pda(struct x8664_pda *oldpda) +{ + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, sizeof(*oldpda)); +} + /* * Allocate node local memory for the AP pda. * * Must be called after the _cpu_pda pointer table is initialized. */ -static int __cpuinit get_local_pda(int cpu) +int __cpuinit get_local_pda(int cpu) { struct x8664_pda *oldpda, *newpda; unsigned long size = sizeof(struct x8664_pda); @@ -791,8 +792,7 @@ static int __cpuinit get_local_pda(int cpu) if (oldpda) { memcpy(newpda, oldpda, size); - if (!after_bootmem) - free_bootmem((unsigned long)oldpda, size); + free_bootmem_pda(oldpda); } newpda->in_bootmem = 0; @@ -874,7 +874,7 @@ do_rest: start_ip = setup_trampoline(); /* So we see what's up */ - printk(KERN_INFO "Booting processor %d/%d ip %lx\n", + printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", cpu, apicid, start_ip); /* @@ -886,16 +886,18 @@ do_rest: if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { - Dprintk("Setting warm reset code and vector.\n"); + pr_debug("Setting warm reset code and vector.\n"); store_NMI_vector(&nmi_high, &nmi_low); smpboot_setup_warm_reset_vector(start_ip); /* * Be paranoid about clearing APIC errors. - */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); + */ + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } } /* @@ -907,9 +909,9 @@ do_rest: /* * allow APs to start initializing. */ - Dprintk("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d.\n", cpu); cpu_set(cpu, cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); + pr_debug("After Callout %d.\n", cpu); /* * Wait 5s total for a response @@ -922,10 +924,10 @@ do_rest: if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); + pr_debug("OK.\n"); printk(KERN_INFO "CPU%d: ", cpu); print_cpu_info(&cpu_data(cpu)); - Dprintk("CPU has booted.\n"); + pr_debug("CPU has booted.\n"); } else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) @@ -970,7 +972,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) WARN_ON(irqs_disabled()); - Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); + pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map)) { @@ -982,7 +984,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) * Already booted CPU? 
*/ if (cpu_isset(cpu, cpu_callin_map)) { - Dprintk("do_boot_cpu %d Already started\n", cpu); + pr_debug("do_boot_cpu %d Already started\n", cpu); return -ENOSYS; } @@ -1009,7 +1011,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) err = do_boot_cpu(apicid, cpu); #endif if (err) { - Dprintk("do_boot_cpu failed %d\n", err); + pr_debug("do_boot_cpu failed %d\n", err); return -EIO; } @@ -1055,6 +1057,34 @@ static __init void disable_smp(void) static int __init smp_sanity_check(unsigned max_cpus) { preempt_disable(); + +#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) + if (def_to_bigsmp && nr_cpu_ids > 8) { + unsigned int cpu; + unsigned nr; + + printk(KERN_WARNING + "More than 8 CPUs detected - skipping them.\n" + "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); + + nr = 0; + for_each_present_cpu(cpu) { + if (nr >= 8) + cpu_clear(cpu, cpu_present_map); + nr++; + } + + nr = 0; + for_each_possible_cpu(cpu) { + if (nr >= 8) + cpu_clear(cpu, cpu_possible_map); + nr++; + } + + nr_cpu_ids = 8; + } +#endif + if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk(KERN_WARNING "weird, boot CPU (#%d) not listed" "by the BIOS.\n", hard_smp_processor_id()); @@ -1147,10 +1177,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ +#ifdef CONFIG_X86_32 boot_cpu_logical_apicid = logical_smp_processor_id(); +#endif current_thread_info()->cpu = 0; /* needed? */ set_cpu_sibling_map(0); +#ifdef CONFIG_X86_64 + enable_IR_x2apic(); + setup_apic_routing(); +#endif + if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); disable_smp(); @@ -1158,9 +1195,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } preempt_disable(); - if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { + if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); + read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } preempt_enable(); @@ -1193,6 +1230,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); setup_boot_clock(); + + if (is_uv_system()) + uv_system_init(); out: preempt_enable(); } @@ -1213,7 +1253,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - Dprintk("Boot done.\n"); + pr_debug("Boot done.\n"); impress_friends(); smp_checks(); @@ -1223,39 +1263,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -#ifdef CONFIG_HOTPLUG_CPU - -static void remove_siblinginfo(int cpu) -{ - int sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { - cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); - /*/ - * last thread sibling in this cpu core going down - */ - if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) - cpu_data(sibling).booted_cores--; - } - - for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) - cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - cpus_clear(per_cpu(cpu_core_map, cpu)); - c->phys_proc_id = 0; - c->cpu_core_id = 0; - cpu_clear(cpu, cpu_sibling_setup_map); -} - -static int additional_cpus __initdata = -1; - -static __init int setup_additional_cpus(char *s) -{ - return s && get_option(&s, &additional_cpus) ? 
0 : -EINVAL; -} -early_param("additional_cpus", setup_additional_cpus); - /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1275,24 +1282,13 @@ early_param("additional_cpus", setup_additional_cpus); */ __init void prefill_possible_map(void) { - int i; - int possible; + int i, possible; /* no processor from mptable or madt */ if (!num_processors) num_processors = 1; -#ifdef CONFIG_HOTPLUG_CPU - if (additional_cpus == -1) { - if (disabled_cpus > 0) - additional_cpus = disabled_cpus; - else - additional_cpus = 0; - } -#else - additional_cpus = 0; -#endif - possible = num_processors + additional_cpus; + possible = num_processors + disabled_cpus; if (possible > NR_CPUS) possible = NR_CPUS; @@ -1305,17 +1301,64 @@ __init void prefill_possible_map(void) nr_cpu_ids = possible; } +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { + cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); + /*/ + * last thread sibling in this cpu core going down + */ + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) + cpu_data(sibling).booted_cores--; + } + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) + cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); + c->phys_proc_id = 0; + c->cpu_core_id = 0; + cpu_clear(cpu, cpu_sibling_setup_map); +} + static void __ref remove_cpu_from_maps(int cpu) { cpu_clear(cpu, cpu_online_map); cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ - clear_bit(cpu, (unsigned long *)&cpu_initialized); + cpu_clear(cpu, cpu_initialized); numa_remove_cpu(cpu); } -int __cpu_disable(void) +void cpu_disable_common(void) +{ + int cpu = smp_processor_id(); + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we cleanup + * fixup_irqs as we do for IA64. + */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); + remove_cpu_from_maps(cpu); + unlock_vector_lock(); + fixup_irqs(cpu_online_map); +} + +int native_cpu_disable(void) { int cpu = smp_processor_id(); @@ -1334,25 +1377,11 @@ int __cpu_disable(void) stop_apic_nmi_watchdog(NULL); clear_local_APIC(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - - local_irq_disable(); - remove_siblinginfo(cpu); - - /* It's now safe to remove this processor from the online map */ - remove_cpu_from_maps(cpu); - fixup_irqs(cpu_online_map); + cpu_disable_common(); return 0; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. 
*/ unsigned int i; @@ -1369,28 +1398,45 @@ void __cpu_die(unsigned int cpu) } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } + +void play_dead_common(void) +{ + idle_task_exit(); + reset_lazy_tlbstate(); + irq_ctx_exit(raw_smp_processor_id()); + c1e_remove_cpu(raw_smp_processor_id()); + + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); +} + +void native_play_dead(void) +{ + play_dead_common(); + wbinvd_halt(); +} + #else /* ... !CONFIG_HOTPLUG_CPU */ -int __cpu_disable(void) +int native_cpu_disable(void) { return -ENOSYS; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We said "no" in __cpu_disable */ BUG(); } -#endif -/* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ -static int __init parse_maxcpus(char *arg) +void native_play_dead(void) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(arg, NULL, 0); - return 0; + BUG(); } -early_param("maxcpus", parse_maxcpus); + +#endif diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 99941b37eca0..397e309839dd 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c @@ -8,18 +8,21 @@ DEFINE_PER_CPU(unsigned long, this_cpu_off); EXPORT_PER_CPU_SYMBOL(this_cpu_off); -/* Initialize the CPU's GDT. This is either the boot CPU doing itself - (still using the master per-cpu area), or a CPU doing it for a - secondary which will soon come up. */ +/* + * Initialize the CPU's GDT. This is either the boot CPU doing itself + * (still using the master per-cpu area), or a CPU doing it for a + * secondary which will soon come up. + */ __cpuinit void init_gdt(int cpu) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); + struct desc_struct gdt; - pack_descriptor(&gdt[GDT_ENTRY_PERCPU], - __per_cpu_offset[cpu], 0xFFFFF, + pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); + gdt.s = 1; - gdt[GDT_ENTRY_PERCPU].s = 1; + write_gdt_entry(get_cpu_gdt_table(cpu), + GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(cpu_number, cpu) = cpu; diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c deleted file mode 100644 index 8b137891791f..000000000000 --- a/arch/x86/kernel/smpcommon_32.c +++ /dev/null @@ -1 +0,0 @@ - diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 92c20fee6781..e8b9863ef8c4 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) static int enable_single_step(struct task_struct *child) { struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. 
+ */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; /* * Always set TIF_SINGLESTEP - this guarantees that @@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child) */ set_tsk_thread_flag(child, TIF_SINGLESTEP); - /* - * If TF was already set, don't do anything else - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; + oflags = regs->flags; /* Set TF on the kernel stack.. */ regs->flags |= X86_EFLAGS_TF; @@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. */ - if (is_setting_trap_flag(child, regs)) + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); set_tsk_thread_flag(child, TIF_FORCED_TF); diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index d67ce5f044ba..7b987852e876 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -30,7 +30,7 @@ #include <linux/init.h> #include <asm/io.h> #include <asm/bios_ebda.h> -#include <asm/mach-summit/mach_mpparse.h> +#include <asm/summit/mpparse.h> static struct rio_table_hdr *rio_table_hdr __initdata; static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7066cb855a60..1884a8d12bfa 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -22,6 +22,8 @@ #include <linux/uaccess.h> #include <linux/unistd.h> +#include <asm/syscalls.h> + asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 3b360ef33817..6bc211accf08 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -13,15 +13,17 @@ #include <linux/utsname.h> #include <linux/personality.h> #include <linux/random.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/ia32.h> +#include <asm/syscalls.h> -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +asmlinkage long sys_mmap(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long off) { long error; - struct file * file; + struct file *file; error = -EINVAL; if (off & ~PAGE_MASK) @@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, unmapped base down for this case. This can give conflicts with the heap, but we assume that glibc malloc knows how to fall back to mmap. Give it 1GB - of playground for now. -AK */ - *begin = 0x40000000; - *end = 0x80000000; + of playground for now. 
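The TF bookkeeping above (TIF_SINGLESTEP plus TIF_FORCED_TF) exists so the ptrace single-step machinery can tell a trap flag it planted apart from one that user code set itself. A minimal Linux tracer that drives exactly this machinery (illustrative only; error handling omitted):

    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        pid_t child = fork();

        if (child == 0) {
            ptrace(PTRACE_TRACEME, 0, NULL, NULL);
            execl("/bin/true", "true", (char *)NULL);
            _exit(127);
        }

        waitpid(child, NULL, 0);            /* child is stopped at execve() */

        long steps = 0;
        int status;

        /* Each request makes the kernel plant TF for exactly one instruction. */
        while (ptrace(PTRACE_SINGLESTEP, child, NULL, NULL) == 0) {
            waitpid(child, &status, 0);
            if (WIFEXITED(status))
                break;
            steps++;
        }
        printf("traced %ld single-step traps\n", steps);
        return 0;
    }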
-AK */ + *begin = 0x40000000; + *end = 0x80000000; if (current->flags & PF_RANDOMIZE) { new_begin = randomize_range(*begin, *begin + 0x02000000, 0); if (new_begin) @@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, } } else { *begin = TASK_UNMAPPED_BASE; - *end = TASK_SIZE; + *end = TASK_SIZE; } -} +} unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, @@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; unsigned long start_addr; unsigned long begin, end; - + if (flags & MAP_FIXED) return addr; - find_start_end(flags, &begin, &end); + find_start_end(flags, &begin, &end); if (len > end) return -ENOMEM; @@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; + mm->cached_hole_size = 0; mm->free_area_cache = begin; } addr = mm->free_area_cache; - if (addr < begin) - addr = begin; + if (addr < begin) + addr = begin; start_addr = addr; full_search: @@ -127,7 +129,7 @@ full_search: return addr; } if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; + mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; } @@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, vma = find_vma(mm, addr-len); if (!vma || addr <= vma->vm_start) /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr-len); + return mm->free_area_cache = addr-len; } if (mm->mmap_base < len) @@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, vma = find_vma(mm, addr); if (!vma || addr+len <= vma->vm_start) /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr); + return mm->free_area_cache = addr; /* remember the largest hole we saw so far */ if (addr + mm->cached_hole_size < vma->vm_start) @@ -224,13 +226,13 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user * name) +asmlinkage long sys_uname(struct new_utsname __user *name) { int err; down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof(*name)); up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); return err ? 
-EFAULT : 0; } diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 170d43c17487..de87d6008295 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -8,12 +8,12 @@ #define __NO_STUBS #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_64_UNISTD_H_ +#undef _ASM_X86_UNISTD_64_H #include <asm/unistd_64.h> #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_64_UNISTD_H_ +#undef _ASM_X86_UNISTD_64_H typedef void (*sys_call_ptr_t)(void); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff5562f5fd..d44395ff34c3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -326,3 +326,9 @@ ENTRY(sys_call_table) .long sys_fallocate .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime + .long sys_signalfd4 + .long sys_eventfd2 + .long sys_epoll_create1 + .long sys_dup3 /* 330 */ + .long sys_pipe2 + .long sys_inotify_init1 diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 059ca6ee59b4..77b400f06ea2 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -36,6 +36,7 @@ #include <asm/arch_hooks.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/timer.h> #include "do_timer.h" @@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && - in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + 4); + return *(unsigned long *)(regs->bp + sizeof(long)); #else unsigned long *sp = (unsigned long *)&regs->sp; @@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) do_timer_interrupt_hook(); +#ifdef CONFIG_MCA if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) u8 irq_v = inb_p( 0x61 ); /* read the current state */ outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } +#endif return IRQ_HANDLED; } @@ -129,6 +131,7 @@ void __init hpet_time_init(void) */ void __init time_init(void) { + pre_time_init_hook(); tsc_init(); late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e3d49c553af2..cb19d650c216 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/time.h> +#include <linux/mca.h> #include <asm/i8253.h> #include <asm/hpet.h> @@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs) /* Assume the lock function has either no stack frame or a copy of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses.
*/ - if (!user_mode(regs) && in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; +#endif } return pc; } EXPORT_SYMBOL(profile_pc); -static irqreturn_t timer_event_interrupt(int irq, void *dev_id) +irqreturn_t timer_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); global_clock_event->event_handler(global_clock_event); +#ifdef CONFIG_MCA + if (MCA_bus) { + u8 irq_v = inb_p(0x61); /* read the current state */ + outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ + } +#endif + return IRQ_HANDLED; } @@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void) } static struct irqaction irq0 = { - .handler = timer_event_interrupt, + .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, .mask = CPU_MASK_NONE, .name = "timer" @@ -111,16 +123,13 @@ void __init hpet_time_init(void) if (!hpet_enable()) setup_pit_timer(); + irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } void __init time_init(void) { tsc_init(); - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index fec1ecedc9b7..e00534b33534 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -241,3 +241,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } +void reset_lazy_tlbstate(void) +{ + int cpu = raw_smp_processor_id(); + + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} + diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d0fbb7712ab0..04431f34fd16 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -6,7 +6,7 @@ * This code is released under the GNU General Public License version 2 or * later. */ -#include <linux/mc146818rtc.h> +#include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/kernel.h> @@ -17,6 +17,7 @@ #include <asm/genapic.h> #include <asm/idle.h> #include <asm/tsc.h> +#include <asm/irq_vectors.h> #include <mach_apic.h> @@ -783,7 +784,7 @@ static int __init uv_bau_init(void) uv_init_blade(blade, node, cur_cpu); cur_cpu += uv_blade_nr_possible_cpus(blade); } - set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); + alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); uv_enable_timeouts(); return 0; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index ab6bf375a307..6bb7b8579e70 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -10,6 +10,7 @@ #include <asm/ldt.h> #include <asm/processor.h> #include <asm/proto.h> +#include <asm/syscalls.h> #include "tls.h" diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c index 8a768973c4f0..04d242ab0161 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps.c @@ -7,13 +7,11 @@ */ /* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. + * Handle hardware traps and faults. 
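Both profile_pc() variants now fetch the caller at regs->bp + sizeof(long) when CONFIG_FRAME_POINTER is enabled: with frame pointers, the word just above the saved frame pointer is the return address. A userspace illustration of the same stack layout (GCC extensions; build with -O0 or -fno-omit-frame-pointer):

    #include <stdio.h>

    /* With frame pointers each frame is laid out as: [saved bp][return address]. */
    static void __attribute__((noinline)) callee(void)
    {
        unsigned long *bp = __builtin_frame_address(0);
        unsigned long ret = *(bp + 1);      /* bp + sizeof(long) in byte terms */

        printf("frame walk: %#lx\n", ret);
        printf("builtin:    %p\n", __builtin_return_address(0));
    }

    int main(void)
    {
        callee();                           /* the two printed values should match */
        return 0;
    }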
*/ #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/spinlock.h> -#include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/utsname.h> @@ -32,6 +30,8 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/mm.h> +#include <linux/smp.h> +#include <linux/io.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -46,20 +46,31 @@ #include <linux/edac.h> #endif -#include <asm/arch_hooks.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/atomic.h> #include <asm/system.h> #include <asm/unwind.h> +#include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> + +#include <mach_traps.h> + +#ifdef CONFIG_X86_64 +#include <asm/pgalloc.h> +#include <asm/proto.h> +#include <asm/pda.h> +#else +#include <asm/processor-flags.h> +#include <asm/arch_hooks.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/io.h> +#include <asm/traps.h> -#include "mach_traps.h" +#include "cpu/mcheck/mce.h" DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -76,431 +87,104 @@ char ignore_fpu_irq; */ gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 24; -static unsigned int code_bytes = 64; -static int ignore_nmis; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0; - unsigned long symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%08lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? 
"); - - if (!modname) - modname = delim = ""; - printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%08lx>]\n", address); #endif -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size) -{ - void *t = tinfo; - return p > t && p <= t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 4) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movl %%ebp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - for (;;) { - struct thread_info *context; - - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data); - /* - * Should be after the line below, but somewhere - * in early boot context comes out corrupted and we - * can't reference it: - */ - if (ops->stack(data, "IRQ") < 0) - break; - stack = (unsigned long *)context->previous_esp; - if (!stack) - break; - touch_nmi_watchdog(); - } -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - printk("%s [<%08lx>] ", (char *)data, addr); - if (!reliable) - printk("? 
"); - print_symbol("%s\n", addr); - touch_nmi_watchdog(); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; +static int ignore_nmis; -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) +static inline void conditional_sti(struct pt_regs *regs) { - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); } -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) +static inline void preempt_conditional_sti(struct pt_regs *regs) { - show_trace_log_lvl(task, regs, stack, bp, ""); + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); } -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +static inline void preempt_conditional_cli(struct pt_regs *regs) { - unsigned long *stack; - int i; - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if (i && ((i % 8) == 0)) - printk("\n%s ", log_lvl); - printk("%08lx ", *stack++); - } - printk("\n%sCall Trace:\n", log_lvl); - - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + dec_preempt_count(); } -void show_stack(struct task_struct *task, unsigned long *sp) +#ifdef CONFIG_X86_32 +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) { - printk(" "); - show_stack_log_lvl(task, NULL, sp, 0, ""); + if (!user_mode_vm(regs)) + die(str, regs, err); } /* - * The architecture-independent dump_stack generator + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movl %%ebp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - show_trace(current, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) +static int lazy_iobitmap_copy(void) { - int i; + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; - print_modules(); - __show_registers(regs, 0); + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. 
- */ - if (!user_mode_vm(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at EIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad EIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - static struct { - raw_spinlock_t lock; - u32 lock_owner; - int lock_owner_depth; - } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, - .lock_owner = -1, - .lock_owner_depth = 0 - }; - unsigned long flags; - - oops_enter(); - - if (die.lock_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); - die.lock_owner = smp_processor_id(); - die.lock_owner_depth = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - - if (++die.lock_owner_depth < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - bust_spinlocks(0); - die.lock_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - 
panic("Fatal exception"); - - oops_exit(); - do_exit(SIGSEGV); -} + put_cpu(); -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); + return 0; } +#endif static void __kprobes -do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) { struct task_struct *tsk = current; +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { - if (vm86) + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < 6) goto vm86_trap; goto trap_signal; } +#endif if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_X86_32 trap_signal: +#endif /* * We want error_code and trap_no set for userspace faults and * kernelspace faults which result in die(), but not @@ -513,6 +197,18 @@ trap_signal: tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } +#endif + if (info) force_sig_info(signr, info, tsk); else @@ -527,120 +223,98 @@ kernel_trap: } return; +#ifdef CONFIG_X86_32 vm86_trap: if (handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) goto trap_signal; return; +#endif } #define DO_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ } -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs *regs, long error_code) \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -#ifndef CONFIG_KPROBES 
-DO_VM86_ERROR(3, SIGTRAP, "int3", int3) -#endif -DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) +#endif +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} -void __kprobes +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). */ + for (;;) + die(str, regs, error_code); +} +#endif + +dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = &current->thread; - - /* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS - * and we set the offset field correctly. Then we let the CPU to - * restart the faulting instruction. - */ - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access).
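The DO_ERROR()/DO_ERROR_INFO() macros reshuffled above stamp out one near-identical handler per exception vector; the separate VM86 variants disappear because do_trap() now makes that distinction at run time. The stamping idiom in miniature (the handlers and vector numbers here are hypothetical, userspace-only):

    #include <stdio.h>

    /* Generate a family of near-identical handlers from one template. */
    #define DO_ERROR(trapnr, str, name)                                 \
    static void do_##name(long error_code)                              \
    {                                                                   \
        printf("trap %d (%s), error %#lx\n", trapnr, str, error_code);  \
    }

    DO_ERROR(4,  "overflow",    overflow)
    DO_ERROR(5,  "bounds",      bounds)
    DO_ERROR(10, "invalid TSS", invalid_TSS)

    int main(void)
    {
        do_overflow(0);
        do_invalid_TSS(0x18);
        return 0;
    }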
- */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); + conditional_sti(regs); +#ifdef CONFIG_X86_32 + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ return; } - put_cpu(); if (regs->flags & X86_VM_MASK) goto gp_in_vm86; +#endif tsk = current; if (!user_mode(regs)) @@ -662,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; +#ifdef CONFIG_X86_32 gp_in_vm86: local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; +#endif gp_in_kernel: if (fixup_exception(regs)) @@ -702,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ - clear_mem_error(reason); + reason = (reason & 0xf) | 4; + outb(reason, 0x61); } static notrace __kprobes void @@ -728,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) return; #ifdef CONFIG_MCA /* @@ -751,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} - static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -824,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); +#ifdef CONFIG_X86_32 /* * Reassert NMI in case it became active meanwhile * as it's edge-triggered: */ reassert_nmi(); +#endif } -notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) { - int cpu; - nmi_enter(); - cpu = smp_processor_id(); - - ++nmi_count(cpu); +#ifdef CONFIG_X86_32 + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else + add_pda(__nmi_count, 1); +#endif if (!ignore_nmis) default_do_nmi(regs); @@ -859,21 +505,44 @@ void restart_nmi(void) acpi_nmi_enable(); } -#ifdef CONFIG_KPROBES -void __kprobes do_int3(struct pt_regs *regs, long error_code) +/* May run on IST stack. 
*/ +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { - trace_hardirqs_fixup(); - +#ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; - /* - * This is an interrupt gate, because kprobes wants interrupts - * disabled. Normal trap handlers don't. - */ - restore_interrupts(regs); +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif - do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); + preempt_conditional_sti(regs); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +#ifdef CONFIG_X86_64 +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; } #endif @@ -898,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) + * + * May run on IST stack. */ -void __kprobes do_debug(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned int condition; - - trace_hardirqs_fixup(); + unsigned long condition; + int si_code; get_debugreg(condition, 6); @@ -917,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; + /* It's safe to allow irq's after DR6 has been saved */ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + preempt_conditional_sti(regs); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -927,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) goto debug_vm86; +#endif /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; @@ -938,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) * kernel space (but re-enable TF when returning to user mode). */ if (condition & DR_STEP) { - /* - * We already checked v86 mode above, so we can - * check for kernel mode by just checking the CPL - * of CS. - */ if (!user_mode(regs)) goto clear_TF_reenable; } + si_code = get_si_code(condition); /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code); + send_sigtrap(tsk, regs, error_code, si_code); /* * Disable additional traps. 
They'll be re-enabled when @@ -956,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) */ clear_dr7: set_debugreg(0, 7); + preempt_conditional_cli(regs); return; +#ifdef CONFIG_X86_32 debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; +#endif clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; + preempt_conditional_cli(regs); return; } +#ifdef CONFIG_X86_64 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + if (fixup_exception(regs)) + return 1; + + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -1004,7 +691,9 @@ void math_error(void __user *ip) swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 return; +#endif default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ @@ -1032,9 +721,18 @@ void math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); + +#ifdef CONFIG_X86_32 ignore_fpu_irq = 1; +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; +#endif + math_error((void __user *)regs->ip); } @@ -1086,8 +784,12 @@ static void simd_math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); + +#ifdef CONFIG_X86_32 if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; @@ -1106,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) current->thread.error_code = error_code; die_if_kernel("cache flush denied", regs, error_code); force_sig(SIGSEGV, current); +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +#endif } -void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } +#ifdef CONFIG_X86_32 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); @@ -1134,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } +#else +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} +#endif /* * 'math_state_restore()' saves the current math information in the @@ -1166,14 +886,24 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 restore_fpu(tsk); +#else + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. 
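+ * (restore_fpu_checking() is expected to return non-zero when the + * FXRSTOR of the saved image faults, e.g. on a corrupted image; the + * unlikely() path below then sets TS again via stts() and raises + * SIGSEGV rather than retrying the restore.)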
+ */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } +#endif thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION - asmlinkage void math_emulate(long arg) { printk(KERN_EMERG @@ -1182,12 +912,46 @@ asmlinkage void math_emulate(long arg) force_sig(SIGFPE, current); schedule(); } - #endif /* CONFIG_MATH_EMULATION */ +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) +{ +#ifdef CONFIG_X86_32 + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +#else + math_state_restore(); +#endif +} + +#ifdef CONFIG_X86_32 +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); +} +#endif + void __init trap_init(void) { +#ifdef CONFIG_X86_32 int i; +#endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@ -1197,29 +961,40 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_trap_gate(0, ÷_error); - set_intr_gate(1, &debug); - set_intr_gate(2, &nmi); - set_system_intr_gate(3, &int3); /* int3 can be called from all */ - set_system_gate(4, &overflow); /* int4 can be called from all */ - set_trap_gate(5, &bounds); - set_trap_gate(6, &invalid_op); - set_trap_gate(7, &device_not_available); + set_intr_gate(0, ÷_error); + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); + set_intr_gate(5, &bounds); + set_intr_gate(6, &invalid_op); + set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9, &coprocessor_segment_overrun); - set_trap_gate(10, &invalid_TSS); - set_trap_gate(11, &segment_not_present); - set_trap_gate(12, &stack_segment); - set_trap_gate(13, &general_protection); +#else + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif + set_intr_gate(9, &coprocessor_segment_overrun); + set_intr_gate(10, &invalid_TSS); + set_intr_gate(11, &segment_not_present); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); - set_trap_gate(15, &spurious_interrupt_bug); - set_trap_gate(16, &coprocessor_error); - set_trap_gate(17, &alignment_check); + set_intr_gate(15, &spurious_interrupt_bug); + set_intr_gate(16, &coprocessor_error); + set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_trap_gate(18, &machine_check); + set_intr_gate_ist(18, &machine_check, MCE_STACK); #endif - set_trap_gate(19, &simd_coprocessor_error); + set_intr_gate(19, &simd_coprocessor_error); +#ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +#endif + +#ifdef CONFIG_X86_32 if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... 
"); set_in_cr4(X86_CR4_OSFXSR); @@ -1232,37 +1007,20 @@ void __init trap_init(void) printk("done.\n"); } - set_system_gate(SYSCALL_VECTOR, &system_call); + set_system_trap_gate(SYSCALL_VECTOR, &system_call); /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); set_bit(SYSCALL_VECTOR, used_vectors); - - init_thread_xstate(); +#endif /* * Should be a barrier for any external CPU state: */ cpu_init(); +#ifdef CONFIG_X86_32 trap_init_hook(); +#endif } - -static int __init kstack_setup(char *s) -{ - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - - return 1; -} -__setup("kstack=", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c deleted file mode 100644 index 2696a6837782..000000000000 --- a/arch/x86/kernel/traps_64.c +++ /dev/null @@ -1,1217 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. - */ -#include <linux/moduleparam.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/spinlock.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/utsname.h> -#include <linux/kdebug.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/unwind.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/kexec.h> -#include <linux/sched.h> -#include <linux/timer.h> -#include <linux/init.h> -#include <linux/bug.h> -#include <linux/nmi.h> -#include <linux/mm.h> - -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - -#include <asm/stacktrace.h> -#include <asm/processor.h> -#include <asm/debugreg.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/unwind.h> -#include <asm/desc.h> -#include <asm/i387.h> -#include <asm/nmi.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/pgalloc.h> -#include <asm/proto.h> -#include <asm/pda.h> - -#include <mach_traps.h> - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 12; -static unsigned int code_bytes = 64; -static int ignore_nmis; -static int die_counter; - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - inc_preempt_count(); - if (regs->flags 
& X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - /* Make sure to not schedule here because we could be running - on an exception stack. */ - dec_preempt_count(); -} - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); -} - -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" -#endif - }; - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. 
Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = ids[j]; - return (unsigned long *)end; - } -#endif - } - return NULL; -} - -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 8) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - const unsigned cpu = get_cpu(); - unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; - unsigned used = 0; - struct thread_info *tinfo; - - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task && task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - /* - * Print function call entries in all stacks, starting at the - * current stack address. 
If the stacks consist of nested - * exceptions - */ - tinfo = task_thread_info(task); - for (;;) { - char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - - if (estack_end) { - if (ops->stack(data, id) < 0) - break; - - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); - ops->stack(data, "<EOE>"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irqstack_end) { - unsigned long *irqstack; - irqstack = irqstack_end - - (IRQSTACKSIZE - 64) / sizeof(*irqstack); - - if (stack >= irqstack && stack < irqstack_end) { - if (ops->stack(data, "IRQ") < 0) - break; - bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irqstack_end[-1]); - irqstack_end = NULL; - ops->stack(data, "EOI"); - continue; - } - } - break; - } - - /* - * This handles the process stack: - */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); - put_cpu(); -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s\n", msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk(" <%s> ", name); - return 0; -} - -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - printk("\nCall Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); - printk("\n"); -} - -static void -_show_stack(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp) -{ - unsigned long *stack; - int i; - const int cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); - unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - - // debugging aid: "show_stack(NULL, NULL);" prints the - // back trace for this cpu. 
- - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" <EOI> "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n"); - printk(" %016lx", *stack++); - touch_nmi_watchdog(); - } - show_trace(task, regs, sp, bp); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - _show_stack(task, NULL, sp, 0); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movq %%rbp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - unsigned long sp; - const int cpu = smp_processor_id(); - struct task_struct *cur = cpu_pda(cpu)->pcurrent; - - sp = regs->sp; - printk("CPU %d ", cpu); - __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("Stack: "); - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); - printk("\n"); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at RIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. 
*/ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - - if (!user_mode(regs)) - goto kernel_trap; - - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. 
- */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; -} - -#define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) - -/* Runs on IST stack */ -asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). 
*/ - for (;;) - die(str, regs, error_code); -} - -asmlinkage void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - - conditional_sti(regs); - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - printk("NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - mdelay(2000); - reason &= ~8; - outb(reason, 0x61); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -/* Runs on IST stack. This code must keep interrupts off all the time. - Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. 
FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -} - -asmlinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - - add_pda(__nmi_count, 1); - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) -{ - trace_hardirqs_fixup(); - - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; - - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_debug(struct pt_regs * regs, - unsigned long error_code) -{ - struct task_struct *tsk = current; - unsigned long condition; - siginfo_t info; - - trace_hardirqs_fixup(); - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? 
(void __user *)regs->ip : NULL; - force_sig_info(SIGTRAP, &info, tsk); - -clear_dr7: - set_debugreg(0, 7); - preempt_conditional_cli(regs); - return; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); - return; -} - -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) -{ - void __user *ip = (void __user *)(regs->ip); - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) -{ - void __user *ip = (void __user *)(regs->ip); - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. 
- */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) -{ -} - -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - */ -asmlinkage void math_state_restore(void) -{ - struct task_struct *me = current; - - if (!used_math()) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(me)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - clts(); /* Allow maths ops (or we recurse) */ - restore_fpu_checking(&me->thread.xstate->fxsave); - task_thread_info(me)->status |= TS_USEDFPU; - me->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - -void __init trap_init(void) -{ - set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); - set_intr_gate_ist(2, &nmi, NMI_STACK); - set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ - set_system_gate(4, &overflow); /* int4 can be called from all */ - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); -#endif - set_intr_gate(19, &simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - /* - * initialize the per thread extended state: - */ - init_thread_xstate(); - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7603c0553909..161bb850fc47 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,7 +104,7 @@ __setup("notsc", notsc_setup); /* * Read 
TSC and the reference counters. Take care of SMI disturbance */ -static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) +static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; int i; @@ -112,9 +112,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) - *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else - *pm = acpi_pm_read_early(); + *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < SMI_TRESHOLD) return t2; @@ -122,80 +122,390 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) return ULLONG_MAX; } -/** - * native_calibrate_tsc - calibrate the tsc on boot +/* + * Calculate the TSC frequency from HPET reference */ -unsigned long native_calibrate_tsc(void) +static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { - unsigned long flags; - u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); - unsigned int tsc_khz_val = 0; + u64 tmp; - local_irq_save(flags); + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tmp, 1000000); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +/* + * Calculate the TSC frequency from PMTimer reference + */ +static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) +{ + u64 tmp; - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + if (!pm1 && !pm2) + return ULONG_MAX; + + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tmp = pm2 * 1000000000LL; + do_div(tmp, PMTMR_TICKS_PER_SEC); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +#define CAL_MS 10 +#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_PIT_LOOPS 1000 + +#define CAL2_MS 50 +#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_PIT_LOOPS 5000 + + +/* + * Try to calibrate the TSC against the Programmable + * Interrupt Timer and return the frequency of the TSC + * in kHz. + * + * Return ULONG_MAX on failure to calibrate. + */ +static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) +{ + u64 tsc, t1, t2, delta; + unsigned long tscmin, tscmax; + int pitcnt; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); + /* + * Setup CTC channel 2 for mode 0 (interrupt on terminal + * count mode), binary count. Set the latch register to the + * requested timeout (LSB then MSB) to begin the countdown. + */ outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles(); - while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles(); + outb(latch & 0xff, 0x42); + outb(latch >> 8, 0x42); + + tsc = t1 = t2 = get_cycles(); + + pitcnt = 0; + tscmax = 0; + tscmin = ULONG_MAX; + while ((inb(0x61) & 0x20) == 0) { + t2 = get_cycles(); + delta = t2 - tsc; + tsc = t2; + if ((unsigned long) delta < tscmin) + tscmin = (unsigned int) delta; + if ((unsigned long) delta > tscmax) + tscmax = (unsigned int) delta; + pitcnt++; + } + + /* + * Sanity checks: + * + * If we were not able to read the PIT more than loopmin + * times, then we have been hit by a massive SMI + * + * If the maximum is 10 times larger than the minimum, + * then we got hit by an SMI as well. + */ + if (pitcnt < loopmin || tscmax > 10 * tscmin) + return ULONG_MAX; + + /* Calculate the TSC rate in kHz: elapsed cycles per millisecond */ + delta = t2 - t1; + do_div(delta, ms); + return delta; +} - tsc2 = tsc_read_refs(&pm2, hpet ?
&hpet2 : NULL); +/* + * This reads the current MSB of the PIT counter, and + * checks if we are running on sufficiently fast and + * non-virtualized hardware. + * + * Our expectations are: + * + * - the PIT is running at roughly 1.19MHz + * + * - each IO is going to take about 1us on real hardware, + * but we allow it to be much faster (by a factor of 10) or + * _slightly_ slower (i.e. we allow up to a 2us read+counter + * update) - anything else implies an unacceptably slow CPU + * or PIT for the fast calibration to work. + * + * - with 256 PIT ticks to read the value, we have 214us to + * see the same MSB (and overhead like doing a single TSC + * read per MSB value etc). + * + * - We're doing 2 reads per loop (LSB, MSB), and we expect + * them each to take about a microsecond on real hardware. + * So we expect a count value of around 100. But we'll be + * generous, and accept anything over 50. + * + * - if the PIT is stuck, and we see *many* more reads, we + * return early (and the next caller of pit_expect_msb() + * will then consider it a failure when it doesn't see the + * next expected value). + * + * These expectations mean that we know that we have seen the + * transition from one expected value to another with a fairly + * high accuracy, and we didn't miss any events. We can thus + * use the TSC value at the transitions to calculate a pretty + * good value for the TSC frequency. + */ +static inline int pit_expect_msb(unsigned char val) +{ + int count = 0; + for (count = 0; count < 50000; count++) { + /* Ignore LSB */ + inb(0x42); + if (inb(0x42) != val) + break; + } + return count > 50; +} + +/* + * How many MSB values do we want to see? We aim for a + * 15ms calibration, which assuming a 2us counter read + * error should give us roughly 150 ppm precision for + * the calibration. + */ +#define QUICK_PIT_MS 15 +#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) + +static unsigned long quick_pit_calibrate(void) +{ + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Counter 2, mode 0 (one-shot), binary count + * + * NOTE! Mode 2 decrements by two (and then the + * output is flipped each time, giving the same + * final output frequency as a decrement-by-one), + * so mode 0 is much better when looking at the + * individual counts. + */ + outb(0xb0, 0x43); + + /* Start at 0xffff */ + outb(0xff, 0x42); + outb(0xff, 0x42); + + if (pit_expect_msb(0xff)) { + int i; + u64 t1, t2, delta; + unsigned char expect = 0xfe; + + t1 = get_cycles(); + for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { + if (!pit_expect_msb(expect)) + goto failed; + } + t2 = get_cycles(); + + /* + * Make sure we can rely on the second TSC timestamp: + */ + if (!pit_expect_msb(expect)) + goto failed; + + /* + * Ok, if we get here, then we've seen the + * MSB of the PIT decrement QUICK_PIT_ITERATIONS + * times, and each MSB had many hits, so we never + * had any sudden jumps. + * + * As a result, we can depend on there not being + * any odd delays anywhere, and the TSC reads are + * reliable.
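+ * + * (Worked example with assumed round numbers, not taken from the + * code: PIT_TICK_RATE is about 1193182 Hz, so QUICK_PIT_ITERATIONS + * works out to 15 * 1193182 / 1000 / 256 = 69 MSB steps of 256 PIT + * ticks (~214us) each, i.e. a ~14.8ms window; a 2us timestamp error + * over that window is ~135 ppm, in line with the ~150 ppm aim + * stated above.)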
+ * + * kHz = ticks / time-in-seconds / 1000; + * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000 + * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000) + */ + delta = (t2 - t1)*PIT_TICK_RATE; + do_div(delta, QUICK_PIT_ITERATIONS*256*1000); + printk("Fast TSC calibration using PIT\n"); + return delta; + } +failed: + return 0; +} + +/** + * native_calibrate_tsc - calibrate the tsc on boot + */ +unsigned long native_calibrate_tsc(void) +{ + u64 tsc1, tsc2, delta, ref1, ref2; + unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; + unsigned long flags, latch, ms, fast_calibrate; + int hpet = is_hpet_enabled(), i, loopmin; + + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); + if (fast_calibrate) + return fast_calibrate; /* - * Preset the result with the raw and inaccurate PIT - * calibration value + * Run 3 calibration loops to get the lowest frequency value + * (the best estimate). We use two different calibration modes + * here: + * + * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and + * load a timeout of 10ms (50ms for the retry). We read the time + * right after we started the timer and wait until the PIT count + * down reaches zero. In each wait loop iteration we read the TSC + * and check the delta to the previous read. We keep track of the + * min and max values of that delta. The delta is mostly defined + * by the IO time of the PIT access, so we can detect when an + * SMI/SMM disturbance happened between the two reads. If the + * maximum time is significantly larger than the minimum time, + * then we discard the result and have another try. + * + * 2) Reference counter. If available we use the HPET or the + * PMTIMER as a reference to check the sanity of that value. + * We use separate TSC readouts and check inside of the + * reference read for an SMI/SMM disturbance. We discard + * disturbed values here as well. We do that around the PIT + * calibration delay loop as we have to wait for a certain + * amount of time anyway. */ - delta = (tr2 - tr1); - do_div(delta, 50); - tsc_khz_val = delta; - - /* hpet or pmtimer available ? */ - if (!hpet && !pm1 && !pm2) { - printk(KERN_INFO "TSC calibrated against PIT\n"); - goto out; + + /* Preset PIT loop values */ + latch = CAL_LATCH; + ms = CAL_MS; + loopmin = CAL_PIT_LOOPS; + + for (i = 0; i < 3; i++) { + unsigned long tsc_pit_khz; + + /* + * Read the start value and the reference count of + * hpet/pmtimer when available. Then do the PIT + * calibration, which will take at least 10ms, and + * read the end value. + */ + local_irq_save(flags); + tsc1 = tsc_read_refs(&ref1, hpet); + tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); + tsc2 = tsc_read_refs(&ref2, hpet); + local_irq_restore(flags); + + /* Pick the lowest PIT TSC calibration so far */ + tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); + + /* hpet or pmtimer available ? */ + if (!hpet && !ref1 && !ref2) + continue; + + /* Check whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) + continue; + + tsc2 = (tsc2 - tsc1) * 1000000LL; + if (hpet) + tsc2 = calc_hpet_ref(tsc2, ref1, ref2); + else + tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); + + tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); + + /* Check the reference deviation */ + delta = ((u64) tsc_pit_min) * 100; + do_div(delta, tsc_ref_min); + + /* + * If both calibration results are inside a 10% window + * then we can be sure that the calibration + * succeeded. We break out of the loop right away.
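+ * (For instance, with assumed values tsc_pit_min = 2000000 kHz and + * tsc_ref_min = 1980000 kHz, delta = 2000000 * 100 / 1980000 = 101, + * which falls inside the 90..110 acceptance band.)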
We + * use the reference value, as it is more precise. + */ + if (delta >= 90 && delta <= 110) { + printk(KERN_INFO + "TSC: PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); + return tsc_ref_min; + } + + /* + * Check whether PIT failed more than once. This + * happens in virtualized environments. We need to + * give the virtual PC a slightly longer timeframe for + * the HPET/PMTIMER to make the result precise. + */ + if (i == 1 && tsc_pit_min == ULONG_MAX) { + latch = CAL2_LATCH; + ms = CAL2_MS; + loopmin = CAL2_PIT_LOOPS; + } } - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { - printk(KERN_WARNING "TSC calibration disturbed by SMI, " - "using PIT calibration result\n"); - goto out; + /* + * Now check the results. + */ + if (tsc_pit_min == ULONG_MAX) { + /* PIT gave no useful value */ + printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + + /* We don't have an alternative source, disable TSC */ + if (!hpet && !ref1 && !ref2) { + printk("TSC: No reference (HPET/PMTIMER) available\n"); + return 0; + } + + /* The alternative source failed as well, disable TSC */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " + "failed.\n"); + return 0; + } + + /* Use the alternative source */ + printk(KERN_INFO "TSC: using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); + + return tsc_ref_min; } - tsc2 = (tsc2 - tsc1) * 1000000LL; - - if (hpet) { - printk(KERN_INFO "TSC calibrated against HPET\n"); - if (hpet2 < hpet1) - hpet2 += 0x100000000ULL; - hpet2 -= hpet1; - tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); - do_div(tsc1, 1000000); - } else { - printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); - if (pm2 < pm1) - pm2 += (u64)ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = pm2 * 1000000000LL; - do_div(tsc1, PMTMR_TICKS_PER_SEC); + /* We don't have an alternative source, use the PIT calibration value */ + if (!hpet && !ref1 && !ref2) { + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - do_div(tsc2, tsc1); - tsc_khz_val = tsc2; + /* The alternative source failed, use the PIT calibration value */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " + "Using PIT calibration\n"); + return tsc_pit_min; + } -out: - return tsc_khz_val; + /* + * The calibration values differ too much. In doubt, we use + * the PIT value as we know that there are PMTIMERs around + * running at double speed. At least we let the user know: + */ + printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", + hpet ? 
"HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - #ifdef CONFIG_X86_32 /* Only called from the Powernow K7 cpu freq driver */ int recalibrate_cpu_khz(void) @@ -314,7 +624,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu); return 0; } @@ -325,6 +635,10 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { + if (!cpu_has_tsc) + return 0; + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0577825cf89b..9ffb01c31c40 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void) __raw_spin_unlock(&sync_lock); } } - if (!(now-start)) { - printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + WARN(!(now-start), + "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); - WARN_ON(1); - } } /* diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c new file mode 100644 index 000000000000..aeef529917e4 --- /dev/null +++ b/arch/x86/kernel/uv_irq.c @@ -0,0 +1,79 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ functions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#include <linux/module.h> +#include <linux/irq.h> + +#include <asm/apic.h> +#include <asm/uv/uv_irq.h> + +static void uv_noop(unsigned int irq) +{ +} + +static unsigned int uv_noop_ret(unsigned int irq) +{ + return 0; +} + +static void uv_ack_apic(unsigned int irq) +{ + ack_APIC_irq(); +} + +struct irq_chip uv_irq_chip = { + .name = "UV-CORE", + .startup = uv_noop_ret, + .shutdown = uv_noop, + .enable = uv_noop, + .disable = uv_noop, + .ack = uv_noop, + .mask = uv_noop, + .unmask = uv_noop, + .eoi = uv_ack_apic, + .end = uv_noop, +}; + +/* + * Set up a mapping of an available irq and vector, and enable the specified + * MMR that defines the MSI that is to be sent to the specified CPU when an + * interrupt is raised. + */ +int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + int irq; + int ret; + + irq = create_irq(); + if (irq <= 0) + return -EBUSY; + + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); + if (ret != irq) + destroy_irq(irq); + + return ret; +} +EXPORT_SYMBOL_GPL(uv_setup_irq); + +/* + * Tear down a mapping of an irq and vector, and disable the specified MMR that + * defined the MSI that was to be sent to the specified CPU when an interrupt + * was raised. + * + * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). + */ +void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +{ + arch_disable_uv_irq(mmr_blade, mmr_offset); + destroy_irq(irq); +} +EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c new file mode 100644 index 000000000000..67f9b9dbf800 --- /dev/null +++ b/arch/x86/kernel/uv_sysfs.c @@ -0,0 +1,72 @@ +/* + * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include <linux/sysdev.h> +#include <asm/uv/bios.h> + +struct kobject *sgi_uv_kobj; + +static ssize_t partition_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); +} + +static ssize_t coherence_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); +} + +static struct kobj_attribute partition_id_attr = + __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); + +static struct kobj_attribute coherence_id_attr = + __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); + + +static int __init sgi_uv_sysfs_init(void) +{ + unsigned long ret; + + if (!sgi_uv_kobj) + sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); + if (!sgi_uv_kobj) { + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + return -EINVAL; + } + + ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + return ret; + } + + ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + return ret; + } + + return 0; +} + +device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index e94bdb6add1d..0c9667f0752a 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -25,45 +25,31 @@ #include <asm/visws/cobalt.h> #include <asm/visws/piix4.h> #include <asm/arch_hooks.h> +#include <asm/io_apic.h> #include <asm/fixmap.h> #include <asm/reboot.h> #include <asm/setup.h> #include <asm/e820.h> -#include <asm/smp.h> #include <asm/io.h> #include <mach_ipi.h> #include "mach_apic.h" -#include <linux/init.h> -#include <linux/smp.h> - #include <linux/kernel_stat.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <asm/io.h> -#include <asm/apic.h> #include <asm/i8259.h> #include <asm/irq_vectors.h> -#include <asm/visws/cobalt.h> #include <asm/visws/lithium.h> -#include <asm/visws/piix4.h> #include <linux/sched.h> #include <linux/kernel.h> -#include <linux/init.h> #include <linux/pci.h> #include <linux/pci_ids.h> extern int no_broadcast; -#include <asm/io.h> #include <asm/apic.h> -#include <asm/arch_hooks.h> -#include <asm/visws/cobalt.h> -#include <asm/visws/lithium.h> char visws_board_type = -1; char visws_board_rev = -1; @@ -73,7 +59,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init_quirk(void) +static int __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -93,7 +79,7 @@ static 
int __init visws_time_init_quirk(void) return 0; } -static int __init visws_pre_intr_init_quirk(void) +static int __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); @@ -114,7 +100,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size); long long mem_size __initdata = 0; -static char * __init visws_memory_setup_quirk(void) +static char * __init visws_memory_setup(void) { long long gfx_mem_size = 8 * MB; @@ -176,7 +162,7 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config_quirk(unsigned int early) +static int __init visws_get_smp_config(unsigned int early) { /* * Prevent MP-table parsing by the generic code: @@ -184,15 +170,13 @@ static int __init visws_get_smp_config_quirk(unsigned int early) return 1; } -extern unsigned int __cpuinitdata maxcpus; - /* * The Visual Workstation is Intel MP compliant in the hardware * sense, but it doesn't have a BIOS(-configuration table). * No problem for Linux. */ -static void __init MP_processor_info (struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int ver, logical_apicid; physid_mask_t apic_cpus; @@ -232,7 +216,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) apic_version[m->mpc_apicid] = ver; } -int __init visws_find_smp_config_quirk(unsigned int reserve) +static int __init visws_find_smp_config(unsigned int reserve) { struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -244,8 +228,8 @@ int __init visws_find_smp_config_quirk(unsigned int reserve) ncpus = CO_CPU_MAX; } - if (ncpus > maxcpus) - ncpus = maxcpus; + if (ncpus > setup_max_cpus) + ncpus = setup_max_cpus; #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; @@ -258,7 +242,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve) return 1; } -extern int visws_trap_init_quirk(void); +static int visws_trap_init(void); + +static struct x86_quirks visws_x86_quirks __initdata = { + .arch_time_init = visws_time_init, + .arch_pre_intr_init = visws_pre_intr_init, + .arch_memory_setup = visws_memory_setup, + .arch_intr_init = NULL, + .arch_trap_init = visws_trap_init, + .mach_get_smp_config = visws_get_smp_config, + .mach_find_smp_config = visws_find_smp_config, +}; void __init visws_early_detect(void) { @@ -272,16 +266,10 @@ void __init visws_early_detect(void) /* * Install special quirks for timer, interrupt and memory setup: - */ - arch_time_init_quirk = visws_time_init_quirk; - arch_pre_intr_init_quirk = visws_pre_intr_init_quirk; - arch_memory_setup_quirk = visws_memory_setup_quirk; - - /* * Fall back to generic behavior for traps: + * Override generic MP-table parsing: */ - arch_intr_init_quirk = NULL; - arch_trap_init_quirk = visws_trap_init_quirk; + x86_quirks = &visws_x86_quirks; /* * Install reboot quirks: @@ -294,12 +282,6 @@ void __init visws_early_detect(void) */ no_broadcast = 0; - /* - * Override generic MP-table parsing: - */ - mach_get_smp_config_quirk = visws_get_smp_config_quirk; - mach_find_smp_config_quirk = visws_find_smp_config_quirk; - #ifdef CONFIG_X86_IO_APIC /* * Turn off IO-APIC detection and initialization: @@ -426,7 +408,7 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -int __init visws_trap_init_quirk(void) +static int __init visws_trap_init(void) { lithium_init(); cobalt_init(); @@ -502,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq) static unsigned int startup_cobalt_irq(unsigned int irq) { unsigned long flags; + 
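The conversion above gathers the old per-hook globals (arch_time_init_quirk and friends) into one x86_quirks table installed by a single assignment. A minimal sketch of the same pattern for a hypothetical platform; the foo_* names are illustrative, only the x86_quirks fields come from this commit:

static int __init foo_time_init(void)
{
	/* platform clock setup would go here */
	return 0;
}

static struct x86_quirks foo_x86_quirks __initdata = {
	.arch_time_init		= foo_time_init,
	/* hooks left NULL fall back to the generic code paths */
};

void __init foo_early_detect(void)
{
	x86_quirks = &foo_x86_quirks;
}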
struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); return 0; @@ -524,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq) static void end_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); } @@ -644,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) spin_unlock_irqrestore(&i8259A_lock, flags); - desc = irq_desc + realirq; + desc = irq_to_desc(realirq); /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_cpu(smp_processor_id()).irqs[realirq]++; + kstat_incr_irqs_this_cpu(realirq, desc); if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); @@ -680,27 +664,29 @@ void init_VISWS_APIC_irqs(void) int i; for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = 0; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = 0; + desc->depth = 1; if (i == 0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE1) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_8259) { - irq_desc[i].chip = &piix4_master_irq_type; + desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { - irq_desc[i].chip = &piix4_virtual_irq_type; + desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 38f566fa27d2..4eeb5cf9720d 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -46,6 +46,7 @@ #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/irq.h> +#include <asm/syscalls.h> /* * Known problems: diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b15346092b7b..8b6c393ab9fd 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -37,6 +37,7 @@ #include <asm/timer.h> #include <asm/vmi_time.h> #include <asm/kmap_types.h> +#include <asm/setup.h> /* Convenient for calling VMI functions indirectly in the ROM */ typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); @@ -234,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); + vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, @@ -392,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, 
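/* Note on the conversions above: the Cobalt/PIIX4 handlers now look their descriptor up once via irq_to_desc(irq) instead of indexing the irq_desc[] array directly, and reuse that pointer for the status/chip/action fields, so they keep working if the descriptors stop being a flat static array. */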
VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) { /* * This call comes in very early, before mem_map is setup. @@ -409,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } -static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) { vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } -static void vmi_release_pte(u32 pfn) +static void vmi_release_pte(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } -static void vmi_release_pmd(u32 pfn) +static void vmi_release_pmd(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); @@ -683,7 +684,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0); } /* @@ -904,9 +905,8 @@ static inline int __init activate_vmi(void) #endif #ifdef CONFIG_X86_LOCAL_APIC - para_fill(pv_apic_ops.apic_read, APICRead); - para_fill(pv_apic_ops.apic_write, APICWrite); - para_fill(pv_apic_ops.apic_write_atomic, APICWrite); + para_fill(apic_ops->read, APICRead); + para_fill(apic_ops->write, APICWrite); #endif /* diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 6953859fe289..254ee07f8635 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void) void __init vmi_time_init(void) { + unsigned int cpu; /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */ outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index cdb2363697d2..a9b8560adbc2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -140,10 +140,10 @@ SECTIONS *(.con_initcall.init) __con_initcall_end = .; } - .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { - __x86cpuvendor_start = .; - *(.x86cpuvendor.init) - __x86cpuvendor_end = .; + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; } SECURITY_INIT . = ALIGN(4); @@ -180,6 +180,7 @@ SECTIONS .
= ALIGN(PAGE_SIZE); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; + *(.data.percpu.page_aligned) *(.data.percpu) *(.data.percpu.shared_aligned) __per_cpu_end = .; @@ -209,3 +210,11 @@ SECTIONS DWARF_DEBUG } + +#ifdef CONFIG_KEXEC +/* Link time checks */ +#include <asm/kexec.h> + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 63e5c1a22e88..46e05447405b 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -168,12 +168,11 @@ SECTIONS *(.con_initcall.init) } __con_initcall_end = .; - . = ALIGN(16); - __x86cpuvendor_start = .; - .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { - *(.x86cpuvendor.init) + __x86_cpu_dev_start = .; + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + *(.x86_cpu_dev.init) } - __x86cpuvendor_end = .; + __x86_cpu_dev_end = .; SECURITY_INIT . = ALIGN(8); diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 0c029e8959c7..7766d36983fc 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -61,7 +61,7 @@ static void vsmp_irq_enable(void) native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } -static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { switch (type) { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c new file mode 100644 index 000000000000..b13acb75e822 --- /dev/null +++ b/arch/x86/kernel/xsave.c @@ -0,0 +1,345 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha <suresh.b.siddha@intel.com> + */ +#include <linux/bootmem.h> +#include <linux/compat.h> +#include <asm/i387.h> +#ifdef CONFIG_IA32_EMULATION +#include <asm/sigcontext32.h> +#endif +#include <asm/xcr.h> + +/* + * Supported feature mask by the CPU and the kernel. + */ +u64 pcntxt_mask; + +struct _fpx_sw_bytes fx_sw_reserved; +#ifdef CONFIG_IA32_EMULATION +struct _fpx_sw_bytes fx_sw_reserved_ia32; +#endif + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw_user) +{ + int min_xstate_size = sizeof(struct i387_fxsave_struct) + + sizeof(struct xsave_hdr_struct); + unsigned int magic2; + int err; + + err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], + sizeof(struct _fpx_sw_bytes)); + + if (err) + return err; + + /* + * First magic check failed. + */ + if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) + return -1; + + /* + * Check for error scenarios. + */ + if (fx_sw_user->xstate_size < min_xstate_size || + fx_sw_user->xstate_size > xstate_size || + fx_sw_user->xstate_size > fx_sw_user->extended_size) + return -1; + + err = __get_user(magic2, (__u32 *) (((void *)fpstate) + + fx_sw_user->extended_size - + FP_XSTATE_MAGIC2_SIZE)); + /* + * Check for the presence of the second magic word at the end of the + * memory layout. This detects the case where the user just copied the + * legacy fpstate layout without copying the extended state information + * in the memory layout. + */ + if (err || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * Signal frame handlers.
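A sketch of the user-stack frame that check_for_xstate() validates, reconstructed from the checks above (field placement only; widths not to scale):

 /*
  *  fpstate --> +------------------------------+
  *              | i387_fxsave_struct           |  legacy FP/SSE area;
  *              |   .sw_reserved[]             |  _fpx_sw_bytes carries
  *              |   (magic1, extended_size,    |  FP_XSTATE_MAGIC1 and
  *              |    xstate_size, xstate_bv)   |  the layout sizes
  *              +------------------------------+
  *              | xsave_hdr_struct             |  xstate_bv etc.
  *              +------------------------------+
  *              | extended state records       |
  *              +------------------------------+
  *              | FP_XSTATE_MAGIC2             |  sentinel checked at
  *              +------------------------------+  fpstate + extended_size
  *                                                - FP_XSTATE_MAGIC2_SIZE
  */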
+ */ + +int save_i387_xstate(void __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) + return -EACCES; + + BUG_ON(sig_xstate_size < xstate_size); + + if ((unsigned long)buf % 64) + printk(KERN_ERR "save_i387_xstate: bad fpstate %p\n", buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + /* + * Start with clearing the user buffer. This will present a + * clean context for the bytes not touched by the fxsave/xsave. + */ + err = __clear_user(buf, sig_xstate_size); + if (err) + return err; + + if (task_thread_info(tsk)->status & TS_XSAVE) + err = xsave_user(buf); + else + err = fxsave_user(buf); + + if (err) + return err; + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + xstate_size)) + return -1; + } + + if (task_thread_info(tsk)->status & TS_XSAVE) { + struct _fpstate __user *fx = buf; + struct _xstate __user *x = buf; + u64 xstate_bv; + + err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, + sizeof(struct _fpx_sw_bytes)); + + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *) (buf + sig_xstate_size + - FP_XSTATE_MAGIC2_SIZE)); + + /* + * Read the xstate_bv which we copied (directly from the CPU or + * from the state in the task struct) to the user buffer and + * set the FP/SSE bits. + */ + err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); + + /* + * For legacy compatibility, we always set the FP/SSE bits in the + * bit vector while saving the state to the user context. This + * lets us capture any changes (during sigreturn) to + * the FP/SSE bits by legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave-aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrstor as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; + + err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); + + if (err) + return err; + } + + return 1; +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE + * state. + */ +int restore_user_xstate(void __user *buf) +{ + struct _fpx_sw_bytes fx_sw_user; + u64 mask; + int err; + + if (((unsigned long)buf % 64) || + check_for_xstate(buf, buf, &fx_sw_user)) + goto fx_only; + + mask = fx_sw_user.xstate_bv; + + /* + * Restore the state passed by the user. + */ + err = xrestore_user(buf, mask); + if (err) + return err; + + /* + * Init the state skipped by the user. + */ + mask = pcntxt_mask & ~mask; + + xrstor_state(init_xstate_buf, mask); + + return 0; + +fx_only: + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); + return fxrstor_checking((__force struct i387_fxsave_struct *)buf); +} + +/* + * This restores directly out of user space. Exceptions are handled.
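A worked example of the masking in restore_user_xstate() above, using hypothetical mask values:

 /*
  *   pcntxt_mask          = 0x7   kernel enabled FP | SSE | one more state
  *   fx_sw_user.xstate_bv = 0x3   user frame carries FP | SSE only
  *
  *   xrestore_user(buf, 0x3);                     restore what the user passed
  *   xrstor_state(init_xstate_buf, 0x7 & ~0x3);   re-init the state it skipped
  */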
+ */ +int restore_i387_xstate(void __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + if (!buf) { + if (used_math()) + goto clear; + return 0; + } else + if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) + return -EACCES; + + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + + if (!(task_thread_info(current)->status & TS_USEDFPU)) { + clts(); + task_thread_info(current)->status |= TS_USEDFPU; + } + if (task_thread_info(tsk)->status & TS_XSAVE) + err = restore_user_xstate(buf); + else + err = fxrstor_checking((__force struct i387_fxsave_struct *) + buf); + if (unlikely(err)) { + /* + * Encountered an error while doing the restore from the + * user buffer, clear the fpu state. + */ +clear: + clear_fpu(tsk); + clear_used_math(); + } + return err; +} +#endif + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This will be saved whenever the FP and extended state context is + * saved on the user stack during signal delivery to the user. + */ +static void prepare_fx_sw_frame(void) +{ + int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + + FP_XSTATE_MAGIC2_SIZE; + + sig_xstate_size = sizeof(struct _fpstate) + size_extended; + +#ifdef CONFIG_IA32_EMULATION + sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended; +#endif + + memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved)); + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = sig_xstate_size; + fx_sw_reserved.xstate_bv = pcntxt_mask; + fx_sw_reserved.xstate_size = xstate_size; +#ifdef CONFIG_IA32_EMULATION + memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, + sizeof(struct _fpx_sw_bytes)); + fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size; +#endif +} + +/* + * Represents init state for the supported extended state. + */ +struct xsave_struct *init_xstate_buf; + +#ifdef CONFIG_X86_64 +unsigned int sig_xstate_size = sizeof(struct _fpstate); +#endif + +/* + * Enable the extended processor state save/restore feature + */ +void __cpuinit xsave_init(void) +{ + if (!cpu_has_xsave) + return; + + set_in_cr4(X86_CR4_OSXSAVE); + + /* + * Enable all the features that the HW is capable of + * and the Linux kernel is aware of. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); +} + +/* + * Set up the xstate image representing the init state + */ +static void __init setup_xstate_init(void) +{ + init_xstate_buf = alloc_bootmem(xstate_size); + init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; +} + +/* + * Enable and initialize the xsave feature.
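The sizing in prepare_fx_sw_frame() above as a worked example, with hypothetical numbers (real values come from CPUID leaf 0xD) and FP_XSTATE_MAGIC2_SIZE taken as 4:

 /*
  *   xstate_size                       = 832  (512 fxsave + 64 hdr + 256 ext)
  *   sizeof(struct i387_fxsave_struct) = 512
  *   size_extended   = (832 - 512) + 4 = 324
  *   sig_xstate_size = sizeof(struct _fpstate) + 324
  */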
+ */ +void __init xsave_cntxt_init(void) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + pcntxt_mask = eax + ((u64)edx << 32); + + if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pcntxt_mask); + BUG(); + } + + /* + * For now the OS knows only about FP/SSE + */ + pcntxt_mask = pcntxt_mask & XCNTXT_MASK; + xsave_init(); + + /* + * Recompute the context size for enabled features + */ + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + + prepare_fx_sw_frame(); + + setup_xstate_init(); + + printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " + "cntxt size 0x%x\n", + pcntxt_mask, xstate_size); +} diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8d45fabc5f3b..ce3251ce5504 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS + select MMU_NOTIFIER select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c97d35c218db..c02343594b4d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -2,10 +2,14 @@ # Makefile for Kernel-based Virtual Machine module # -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) +common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ + coalesced_mmio.o irq_comm.o) ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif +ifeq ($(CONFIG_DMAR),y) +common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 3829aa7b663f..11c6725fb798 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val) c->gate = val; } -int pit_get_gate(struct kvm *kvm, int channel) +static int pit_get_gate(struct kvm *kvm, int channel) { WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); @@ -193,23 +193,21 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -int __pit_timer_fn(struct kvm_kpit_state *ps) +static int __pit_timer_fn(struct kvm_kpit_state *ps) { struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; struct kvm_kpit_timer *pt = &ps->pit_timer; - atomic_inc(&pt->pending); - smp_mb__after_atomic_inc(); - if (vcpu0) { + if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - if (waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; - wake_up_interruptible(&vcpu0->wq); - } - } - pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); - pt->scheduled = ktime_to_ns(pt->timer.expires); + if (vcpu0 && waitqueue_active(&vcpu0->wq)) + wake_up_interruptible(&vcpu0->wq); + + hrtimer_add_expires_ns(&pt->timer, pt->period); + pt->scheduled = hrtimer_get_expires_ns(&pt->timer); + if (pt->period) + ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer); return (pt->period == 0 ?
0 : 1); } @@ -218,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) + if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; } +static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, + irq_ack_notifier); + spin_lock(&ps->inject_lock); + if (atomic_dec_return(&ps->pit_timer.pending) < 0) + atomic_inc(&ps->pit_timer.pending); + ps->irq_ack = 1; + spin_unlock(&ps->inject_lock); +} + static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps; @@ -249,7 +257,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) timer = &pit->pit_state.pit_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static void destroy_pit_timer(struct kvm_kpit_timer *pt) @@ -258,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt) hrtimer_cancel(&pt->timer); } -static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) +static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { + struct kvm_kpit_timer *pt = &ps->pit_timer; s64 interval; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -271,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) pt->period = (is_period == 0) ? 0 : interval; pt->timer.function = pit_timer_fn; atomic_set(&pt->pending, 0); + ps->irq_ack = 1; hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); @@ -305,10 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(&ps->pit_timer, val, 0); + create_pit_timer(ps, val, 0); break; case 2: - create_pit_timer(&ps->pit_timer, val, 1); + case 3: + create_pit_timer(ps, val, 1); break; default: destroy_pit_timer(&ps->pit_timer); @@ -459,7 +470,8 @@ static void pit_ioport_read(struct kvm_io_device *this, mutex_unlock(&pit_state->lock); } -static int pit_in_range(struct kvm_io_device *this, gpa_t addr) +static int pit_in_range(struct kvm_io_device *this, gpa_t addr, + int len, int is_write) { return ((addr >= KVM_PIT_BASE_ADDRESS) && (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); @@ -500,7 +512,8 @@ static void speaker_ioport_read(struct kvm_io_device *this, mutex_unlock(&pit_state->lock); } -static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) +static int speaker_in_range(struct kvm_io_device *this, gpa_t addr, + int len, int is_write) { return (addr == KVM_SPEAKER_BASE_ADDRESS); } @@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit) mutex_unlock(&pit->pit_state.lock); atomic_set(&pit->pit_state.pit_timer.pending, 0); - pit->pit_state.inject_pending = 1; + pit->pit_state.irq_ack = 1; } struct kvm_pit *kvm_create_pit(struct kvm *kvm) @@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); + spin_lock_init(&pit->pit_state.inject_lock); /* Initialize PIO device */ pit->dev.read = pit_ioport_read; @@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->pit = pit; hrtimer_init(&pit_state->pit_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->irq_ack_notifier.gsi = 0; + 
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; + kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -575,13 +592,11 @@ void kvm_free_pit(struct kvm *kvm) } } -void __inject_pit_timer_intr(struct kvm *kvm) +static void __inject_pit_timer_intr(struct kvm *kvm) { mutex_lock(&kvm->lock); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); + kvm_set_irq(kvm, 0, 1); + kvm_set_irq(kvm, 0, 0); mutex_unlock(&kvm->lock); } @@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm_kpit_state *ps; if (vcpu && pit) { + int inject = 0; ps = &pit->pit_state; - /* Try to inject pending interrupts when: - * 1. Pending exists - * 2. Last interrupt was accepted or waited for too long time*/ - if (atomic_read(&ps->pit_timer.pending) && - (ps->inject_pending || - (jiffies - ps->last_injected_time - >= KVM_MAX_PIT_INTR_INTERVAL))) { - ps->inject_pending = 0; - __inject_pit_timer_intr(kvm); - ps->last_injected_time = jiffies; - } - } -} - -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - struct kvm_arch *arch = &vcpu->kvm->arch; - struct kvm_kpit_state *ps; - - if (vcpu && arch->vpit) { - ps = &arch->vpit->pit_state; - if (atomic_read(&ps->pit_timer.pending) && - (((arch->vpic->pics[0].imr & 1) == 0 && - arch->vpic->pics[0].irq_base == vec) || - (arch->vioapic->redirtbl[0].fields.vector == vec && - arch->vioapic->redirtbl[0].fields.mask != 1))) { - ps->inject_pending = 1; - atomic_dec(&ps->pit_timer.pending); - ps->channels[0].count_load_time = ktime_get(); + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; } + spin_unlock(&ps->inject_lock); + if (inject) + __inject_pit_timer_intr(kvm); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index db25c2a6c8c4..e436d4983aa1 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -8,7 +8,6 @@ struct kvm_kpit_timer { int irq; s64 period; /* unit: ns */ s64 scheduled; - ktime_t last_update; atomic_t pending; }; @@ -34,8 +33,9 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - bool inject_pending; /* if inject pending interrupts */ - unsigned long last_injected_time; + spinlock_t inject_lock; + unsigned long irq_ack; + struct kvm_irq_ack_notifier irq_ack_notifier; }; struct kvm_pit { @@ -54,7 +54,6 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); struct kvm_pit *kvm_create_pit(struct kvm *kvm); void kvm_free_pit(struct kvm *kvm); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index ab29cf2def47..17e41e165f1a 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -30,6 +30,19 @@ #include <linux/kvm_host.h> +static void pic_clear_isr(struct kvm_kpic_state *s, int irq) +{ + s->isr &= ~(1 << irq); + s->isr_ack |= (1 << irq); +} + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; +} + /* * set irq level. 
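A minimal consumer sketch of the irq-ack-notifier mechanism the PIT code above now uses; the foo_* names are illustrative, only the notifier fields and the registration call are from this commit:

#include <linux/kvm_host.h>

struct foo_state {
	atomic_t pending;
	struct kvm_irq_ack_notifier irq_ack_notifier;
};

static void foo_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct foo_state *s =
		container_of(kian, struct foo_state, irq_ack_notifier);

	/* the guest acked the last interrupt; the next one may be injected */
	atomic_dec(&s->pending);
}

static void foo_register(struct kvm *kvm, struct foo_state *s)
{
	s->irq_ack_notifier.gsi = 0;
	s->irq_ack_notifier.irq_acked = foo_ack_irq;
	kvm_register_irq_ack_notifier(kvm, &s->irq_ack_notifier);
}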
If an edge is detected, then the IRR is set to 1 */ @@ -130,8 +143,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; - pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); - pic_update_irq(s); + if (irq >= 0 && irq < PIC_NUM_PINS) { + pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + pic_update_irq(s); + } } /* @@ -139,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) */ static inline void pic_intack(struct kvm_kpic_state *s, int irq) { + s->isr |= 1 << irq; if (s->auto_eoi) { if (s->rotate_on_auto_eoi) s->priority_add = (irq + 1) & 7; - } else - s->isr |= (1 << irq); + pic_clear_isr(s, irq); + } /* * We don't clear a level sensitive interrupt here */ @@ -151,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) s->irr &= ~(1 << irq); } -int kvm_pic_read_irq(struct kvm_pic *s) +int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; + struct kvm_pic *s = pic_irqchip(kvm); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { @@ -179,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); + kvm_notify_acked_irq(kvm, irq); return intno; } void kvm_pic_reset(struct kvm_kpic_state *s) { + int irq, irqbase; + struct kvm *kvm = s->pics_state->irq_request_opaque; + struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; + + if (s == &s->pics_state->pics[0]) + irqbase = 0; + else + irqbase = 8; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { + if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) + if (s->irr & (1 << irq) || s->isr & (1 << irq)) + kvm_notify_acked_irq(kvm, irq+irqbase); + } s->last_irr = 0; s->irr = 0; s->imr = 0; s->isr = 0; + s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -241,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) priority = get_priority(s, s->isr); if (priority != 8) { irq = (priority + s->priority_add) & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); if (cmd == 5) s->priority_add = (irq + 1) & 7; pic_update_irq(s->pics_state); @@ -249,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 3: irq = val & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; case 6: @@ -258,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 7: irq = val & 7; - s->isr &= ~(1 << irq); s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; default: @@ -301,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) s->pics_state->pics[0].irr &= ~(1 << 2); } s->irr &= ~(1 << ret); - s->isr &= ~(1 << ret); + pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); } else { @@ -346,7 +379,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) +static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, + int len, int is_write) { switch (addr) { case 0x20: @@ -419,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) { struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->vcpus[0]; + struct kvm_pic *s = pic_irqchip(kvm); + int irq = pic_get_irq(&s->pics[0]); - pic_irqchip(kvm)->output = level; - if (vcpu) + s->output = level; + if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { + s->pics[0].isr_ack &= ~(1 << irq); kvm_vcpu_kick(vcpu); + } } struct kvm_pic *kvm_create_pic(struct kvm *kvm) diff --git a/arch/x86/kvm/irq.c 
b/arch/x86/kvm/irq.c index 76d736b5f664..c019b8edcdb7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(s); + vector = kvm_pic_read_irq(v->kvm); } } return vector; @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { kvm_apic_timer_intr_post(vcpu, vec); - kvm_pit_timer_intr_post(vcpu, vec); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2a15be2275c0..f17c8f5bbf31 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -30,6 +30,8 @@ #include "ioapic.h" #include "lapic.h" +#define PIC_NUM_PINS 16 + struct kvm; struct kvm_vcpu; @@ -40,6 +42,7 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ + u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -61,12 +64,13 @@ struct kvm_pic { void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; + void (*ack_notifier)(void *opaque, int irq); }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); +int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); +void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h new file mode 100644 index 000000000000..1ff819dce7d3 --- /dev/null +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -0,0 +1,32 @@ +#ifndef ASM_KVM_CACHE_REGS_H +#define ASM_KVM_CACHE_REGS_H + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, reg); + + return vcpu->arch.regs[reg]; +} + +static inline void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + vcpu->arch.regs[reg] = val; + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RIP); +} + +static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RIP, val); +} + +#endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ebc03f5ae162..0fc3cab48943 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,6 +32,7 @@ #include <asm/current.h> #include <asm/apicdef.h> #include <asm/atomic.h> +#include "kvm_cache_regs.h" #include "irq.h" #define PRId64 "d" @@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - } + kvm_vcpu_kick(vcpu); result = (orig_irr == 0); break; @@ -356,8 +351,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case 
APIC_DM_SMI: printk(KERN_DEBUG "Ignoring guest SMI\n"); break; + case APIC_DM_NMI: - printk(KERN_DEBUG "Ignoring guest NMI\n"); + kvm_inject_nmi(vcpu); break; case APIC_DM_INIT: @@ -369,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { - printk(KERN_DEBUG - "Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); + apic_debug("Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); } - break; case APIC_DM_STARTUP: - printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); + apic_debug("SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); + kvm_vcpu_kick(vcpu); } break; @@ -437,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - + int trigger_mode; /* * Not every write EOI has a corresponding ISR, * one example is when the kernel checks the timer on setup_IO_APIC @@ -449,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) apic_update_ppr(apic); if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -557,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_run *run = vcpu->run; set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); - kvm_x86_ops->cache_regs(vcpu); - run->tpr_access.rip = vcpu->arch.rip; + run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } @@ -572,6 +567,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; + KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + if (offset >= LAPIC_MMIO_LENGTH) return 0; @@ -680,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this, * Refer SDM 8.4.1 */ if (len != 4 || alignment) { - if (printk_ratelimit()) - printk(KERN_ERR "apic write: bad size=%d %lx\n", - len, (long)address); + /* Don't shout loud, $infamous_os would cause only noise.
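kvm_rip_read() above comes from the kvm_cache_regs.h helpers added earlier in this commit. A usage sketch of the cache discipline, with an illustrative function name:

static void skip_two_byte_instruction(struct kvm_vcpu *vcpu)
{
	/* faults the value in through kvm_x86_ops->cache_reg() if not avail */
	unsigned long rip = kvm_rip_read(vcpu);

	/* sets the VCPU_REGS_RIP bit in both regs_dirty and regs_avail */
	kvm_rip_write(vcpu, rip + 2);
}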
*/ + apic_debug("apic write: bad size=%d %lx\n", + len, (long)address); return; } @@ -695,6 +692,8 @@ static void apic_mmio_write(struct kvm_io_device *this, offset &= 0xff0; + KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + switch (offset) { case APIC_ID: /* Local APIC ID */ apic_set_reg(apic, APIC_ID, val); @@ -780,7 +779,8 @@ static void apic_mmio_write(struct kvm_io_device *this, } -static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) +static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, + int len, int size) { struct kvm_lapic *apic = (struct kvm_lapic *)this->private; int ret = 0; @@ -939,17 +939,14 @@ static int __apic_timer_fn(struct kvm_lapic *apic) int result = 0; wait_queue_head_t *q = &apic->vcpu->wq; - atomic_inc(&apic->timer.pending); - set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); - if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (!atomic_inc_and_test(&apic->timer.pending)) + set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); + if (waitqueue_active(q)) wake_up_interruptible(q); - } + if (apic_lvtt_period(apic)) { result = 1; - apic->timer.dev.expires = ktime_add_ns( - apic->timer.dev.expires, - apic->timer.period); + hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); } return result; } @@ -1118,7 +1115,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &apic->timer.dev; if (hrtimer_cancel(timer)) - hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 676c396c9cee..81858881287e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); +u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7e7c3969f7a2..99c239c5c0ac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -66,9 +66,13 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} #endif #if defined(MMU_DEBUG) || defined(AUDIT) -static int dbg = 1; +static int dbg = 0; +module_param(dbg, bool, 0644); #endif +static int oos_shadow = 1; +module_param(oos_shadow, bool, 0644); + #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -134,18 +138,24 @@ static int dbg = 1; #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); -}; +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { u64 *shadow_ptes[RMAP_EXT]; struct kvm_rmap_desc *more; }; +struct kvm_shadow_walk { + int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, + u64 addr, u64 *spte, int level); +}; + +struct kvm_unsync_walk { + int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); +}; + +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -404,16
+414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr; + int ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return 0; + return ret; + down_read(&current->mm->mmap_sem); vma = find_vma(current->mm, addr); if (vma && is_vm_hugetlb_page(vma)) - return 1; + ret = 1; + up_read(&current->mm->mmap_sem); - return 0; + return ret; } static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -648,8 +661,88 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) if (write_protected) kvm_flush_remote_tlbs(kvm); +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int need_tlb_flush = 0; + + while ((spte = rmap_next(kvm, rmapp, NULL))) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + rmap_remove(kvm, spte); + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + need_tlb_flush = 1; + } + return need_tlb_flush; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp)) +{ + int i; + int retval = 0; - account_shadowed(kvm, gfn); + /* + * If mmap_sem isn't taken, we can walk the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. + */ + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + retval |= handler(kvm, &memslot->rmap[gfn_offset]); + retval |= handler(kvm, + &memslot->lpage_info[ + gfn_offset / + KVM_PAGES_PER_HPAGE].rmap_pde); + } + } + + return retval; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int young = 0; + + /* always return old for EPT */ + if (!shadow_accessed_mask) + return 0; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + int _young; + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + _young = _spte & PT_ACCESSED_MASK; + if (_young) { + young = 1; + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + } + spte = rmap_next(kvm, rmapp, spte); + } + return young; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); } #ifdef MMU_DEBUG @@ -776,6 +869,138 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + } +} + +static void kvm_mmu_update_unsync_bitmap(u64 *spte) +{ + unsigned int index; + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + index = spte - sp->spt; +
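kvm_handle_hva() above maps a notifier hva onto the per-slot rmap chains, so each mmu-notifier hook reduces to a small per-rmap handler. A hypothetical extra handler of the same shape, illustrative only and not part of this commit:

static int kvm_count_rmapp(struct kvm *kvm, unsigned long *rmapp)
{
	u64 *spte = NULL;
	int n = 0;

	/* visit every shadow pte chained on this rmap slot */
	while ((spte = rmap_next(kvm, rmapp, spte)))
		n++;
	return n;
}

/* would be called as: kvm_handle_hva(kvm, hva, kvm_count_rmapp); */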
__set_bit(index, sp->unsync_child_bitmap); + sp->unsync_children = 1; +} + +static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->parent_pte) + return; + + if (!sp->multimapped) { + kvm_mmu_update_unsync_bitmap(sp->parent_pte); + return; + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); + } +} + +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + sp->unsync_children = 1; + kvm_mmu_update_parents_unsync(sp); + return 1; +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + +#define for_each_unsync_children(bitmap, idx) \ + for (idx = find_first_bit(bitmap, 512); \ + idx < 512; \ + idx = find_next_bit(bitmap, 512, idx+1)) + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_unsync_walk *walker) +{ + int i, ret; + + if (!sp->unsync_children) + return 0; + + for_each_unsync_children(sp->unsync_child_bitmap, i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + ret = mmu_unsync_walk(child, walker); + if (ret) + return ret; + __clear_bit(i, sp->unsync_child_bitmap); + } + + if (child->unsync) { + ret = walker->entry(child, walker); + __clear_bit(i, sp->unsync_child_bitmap); + if (ret) + return ret; + } + } + } + + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) + sp->unsync_children = 0; + + return 0; +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -796,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + WARN_ON(!sp->unsync); + sp->unsync = 0; + --kvm->stat.mmu_unsync; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + if (sp->role.glevels != vcpu->arch.mmu.root_level) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + rmap_write_protect(vcpu->kvm, sp->gfn); + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + kvm_mmu_flush_tlb(vcpu); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return 0; +} + +struct sync_walker { + struct kvm_vcpu *vcpu; + struct kvm_unsync_walk walker; +}; + +static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct sync_walker *sync_walk = container_of(walk, struct sync_walker, + walker); + struct kvm_vcpu *vcpu = sync_walk->vcpu; + + kvm_sync_page(vcpu, sp); + return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + struct sync_walker walker = { + .walker = { .entry = 
mmu_sync_fn, }, + .vcpu = vcpu, + }; + + while (mmu_unsync_walk(sp, &walker.walker)) + cond_resched_lock(&vcpu->kvm->mmu_lock); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -809,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; + struct hlist_node *node, *tmp; role.word = 0; role.glevels = vcpu->arch.mmu.root_level; @@ -825,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } pgprintk("%s: found\n", __func__); return sp; } @@ -839,12 +1128,46 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) + if (!metaphysical) { rmap_write_protect(vcpu->kvm, gfn); - vcpu->arch.mmu.prefetch_page(vcpu, sp); + account_shadowed(vcpu->kvm, gfn); + } + if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) + vcpu->arch.mmu.prefetch_page(vcpu, sp); + else + nonpaging_prefetch_page(vcpu, sp); return sp; } +static int walk_shadow(struct kvm_shadow_walk *walker, + struct kvm_vcpu *vcpu, u64 addr) +{ + hpa_t shadow_addr; + int level; + int r; + u64 *sptep; + unsigned index; + + shadow_addr = vcpu->arch.mmu.root_hpa; + level = vcpu->arch.mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + while (level >= PT_PAGE_TABLE_LEVEL) { + index = SHADOW_PT_INDEX(addr, level); + sptep = ((u64 *)__va(shadow_addr)) + index; + r = walker->entry(walker, vcpu, addr, sptep, level); + if (r) + return r; + shadow_addr = *sptep & PT64_BASE_ADDR_MASK; + --level; + } + return 0; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -860,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, rmap_remove(kvm, &pt[i]); pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); return; } @@ -879,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); } static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) @@ -896,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) kvm->vcpus[i]->arch.last_pte_updated = NULL; } -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; - ++kvm->stat.mmu_shadow_zapped; while (sp->multimapped || sp->parent_pte) { if (!sp->multimapped) parent_pte = sp->parent_pte; @@ -915,18 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_put_page(sp, parent_pte); set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); } +} + +struct zap_walker { + struct kvm_unsync_walk walker; + 
struct kvm *kvm; + int zapped; +}; + +static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct zap_walker *zap_walk = container_of(walk, struct zap_walker, + walker); + kvm_mmu_zap_page(zap_walk->kvm, sp); + zap_walk->zapped = 1; + return 0; +} + +static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct zap_walker walker = { + .walker = { .entry = mmu_zap_fn, }, + .kvm = kvm, + .zapped = 0, + }; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + return 0; + mmu_unsync_walk(sp, &walker.walker); + return walker.zapped; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + int ret; + ++kvm->stat.mmu_shadow_zapped; + ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); + kvm_mmu_unlink_parents(kvm, sp); + kvm_flush_remote_tlbs(kvm); + if (!sp->role.invalid && !sp->role.metaphysical) + unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) + kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { - if (!sp->role.metaphysical) - unaccount_shadowed(kvm, sp->gfn); hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); sp->role.invalid = 1; + list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); + return ret; } /* @@ -979,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) if (sp->gfn == gfn && !sp->role.metaphysical) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + n = bucket->first; } return r; } @@ -1003,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) __set_bit(slot, &sp->slot_bitmap); } +static void mmu_convert_notrap(struct kvm_mmu_page *sp) +{ + int i; + u64 *pt = sp->spt; + + if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] == shadow_notrap_nonpresent_pte) + set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + } +} + struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; @@ -1012,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) if (gpa == UNMAPPED_GVA) return NULL; - down_read(&current->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(&current->mm->mmap_sem); return page; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - u64 spte; - int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); + index = kvm_page_table_hashfn(sp->gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != sp->gfn || s->role.metaphysical) + continue; + if (s->role.word != sp->role.word) + return 1; + } + kvm_mmu_mark_parents_unsync(vcpu, sp); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + mmu_convert_notrap(sp); + return 0;
+} - if (is_rmap_pte(*shadow_pte)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. - */ - if (largepage && !is_large_pte(*shadow_pte)) { - struct kvm_mmu_page *child; - u64 pte = *shadow_pte; +static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) +{ + struct kvm_mmu_page *shadow; - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { - pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + if (shadow->unsync) + return 0; + if (can_unsync && oos_shadow) + return kvm_unsync_page(vcpu, shadow); + return 1; } + return 0; +} +static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pte_access, int user_fault, + int write_fault, int dirty, int largepage, + gfn_t gfn, pfn_t pfn, bool speculative, + bool can_unsync) +{ + u64 spte; + int ret = 0; /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -1064,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, */ spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) - pte_access |= PT_ACCESSED_MASK; + spte |= shadow_accessed_mask; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (pte_access & ACC_EXEC_MASK) @@ -1080,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; + + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + ret = 1; + spte = shadow_trap_nonpresent_pte; + goto set_pte; + } spte |= PT_WRITABLE_MASK; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow || - (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); + ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { + if (is_writeble_pte(spte)) spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; } } if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); - pgprintk("%s: setting spte %llx\n", __func__, spte); - pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", - (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", - (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); +set_pte: set_shadow_pte(shadow_pte, spte); - if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) - && (spte & PT_PRESENT_MASK)) + return ret; +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) +{ + int was_rmapped = 0; + int was_writeble = is_writeble_pte(*shadow_pte); + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __func__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + if (is_rmap_pte(*shadow_pte)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. 
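The write-protect decision in mmu_need_write_protect() above, summarized; oos_shadow is the module parameter added at the top of this file:

 /*
  *   no shadow page for gfn                 -> 0, writable mapping is fine
  *   shadow page is not a last-level table  -> 1, write-protect
  *   shadow page already unsync             -> 0, writes already tracked
  *   can_unsync && oos_shadow               -> try kvm_unsync_page()
  *   otherwise                              -> 1, write-protect
  */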
+ */ + if (largepage && !is_large_pte(*shadow_pte)) { + struct kvm_mmu_page *child; + u64 pte = *shadow_pte; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, shadow_pte); + } else if (pfn != spte_to_pfn(*shadow_pte)) { + pgprintk("hfn old %lx new %lx\n", + spte_to_pfn(*shadow_pte), pfn); + rmap_remove(vcpu->kvm, shadow_pte); + } else { + if (largepage) + was_rmapped = is_large_pte(*shadow_pte); + else + was_rmapped = 1; + } + } + if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, + dirty, largepage, gfn, pfn, speculative, true)) { + if (write_fault) + *ptwrite = 1; + kvm_x86_ops->tlb_flush(vcpu); + } + + pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + is_large_pte(*shadow_pte)? "2MB" : "4kB", + is_present_pte(*shadow_pte)?"RW":"R", gfn, + *shadow_pte, shadow_pte); + if (!was_rmapped && is_large_pte(*shadow_pte)) ++vcpu->kvm->stat.lpages; page_header_update_slot(vcpu->kvm, shadow_pte, gfn); @@ -1122,61 +1554,77 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, else kvm_release_pfn_clean(pfn); } - if (!ptwrite || !*ptwrite) + if (speculative) { vcpu->arch.last_pte_updated = shadow_pte; + vcpu->arch.last_pte_gfn = gfn; + } } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn, - int level) -{ - hpa_t table_addr = vcpu->arch.mmu.root_hpa; - int pt_write = 0; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; - - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); +struct direct_shadow_walk { + struct kvm_shadow_walk walker; + pfn_t pfn; + int write; + int largepage; + int pt_write; +}; - if (level == 1) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 0, gfn, pfn, false); - return pt_write; - } +static int direct_map_entry(struct kvm_shadow_walk *_walk, + struct kvm_vcpu *vcpu, + u64 addr, u64 *sptep, int level) +{ + struct direct_shadow_walk *walk = + container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_mmu_page *sp; + gfn_t pseudo_gfn; + gfn_t gfn = addr >> PAGE_SHIFT; + + if (level == PT_PAGE_TABLE_LEVEL + || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, + 0, walk->write, 1, &walk->pt_write, + walk->largepage, gfn, walk->pfn, false); + ++vcpu->stat.pf_fixed; + return 1; + } - if (largepage && level == 2) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 1, gfn, pfn, false); - return pt_write; + if (*sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, + 1, ACC_ALL, sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(walk->pfn); + return -ENOMEM; } - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index]); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } - - table[index] = __pa(new_table->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask; - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; + set_shadow_pte(sptep, + __pa(sp->spt) + | 
PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } + return 0; +} + +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) +{ + int r; + struct direct_shadow_walk walker = { + .walker = { .entry = direct_map_entry, }, + .pfn = pfn, + .largepage = largepage, + .write = write, + .pt_write = 0, + }; + + r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); + if (r < 0) + return r; + return walker.pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) @@ -1184,15 +1632,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int r; int largepage = 0; pfn_t pfn; + unsigned long mmu_seq; - down_read(&current->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(&current->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -1201,25 +1650,22 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn, - PT32E_ROOT_LEVEL); + r = __direct_map(vcpu, v, write, largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; -} - - -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = shadow_trap_nonpresent_pte; +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } + static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; @@ -1303,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; } @@ -1335,6 +1812,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, int r; int largepage = 0; gfn_t gfn = gpa >> PAGE_SHIFT; + unsigned long mmu_seq; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1343,24 +1821,31 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - down_read(&current->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(&current->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return 1; } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, -
largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } static void nonpaging_free(struct kvm_vcpu *vcpu) @@ -1377,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1424,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1445,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1464,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; @@ -1535,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); @@ -1655,13 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(&current->mm->mmap_sem); if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } + vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(&current->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); @@ -1671,6 +2165,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.update_pte.pfn = pfn; } +static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + if (spte + && vcpu->arch.last_pte_gfn == gfn + && shadow_accessed_mask + && !(*spte & shadow_accessed_mask) + && is_shadow_present_pte(*spte)) + set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); +} + void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { @@ -1694,6 +2200,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); @@ -1710,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) + if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -1728,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); + if (kvm_mmu_zap_page(vcpu->kvm, sp)) + n = bucket->first; ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -1791,6 +2299,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { @@ -1841,12 +2350,28 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + spin_lock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.invlpg(vcpu, gva); + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_flush_tlb(vcpu); + ++vcpu->stat.invlpg; +} +EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_enable_tdp(void) { tdp_enabled = true; } EXPORT_SYMBOL_GPL(kvm_enable_tdp); +void kvm_disable_tdp(void) +{ + tdp_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_tdp); + static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; @@ -1921,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; @@ -1934,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) @@ -1942,13 +2470,15 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + node = container_of(kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) +static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) { struct kvm_mmu_page *page; @@ -1968,6 +2498,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages; + if (!down_read_trylock(&kvm->slots_lock)) + continue; spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -1980,6 +2512,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); + up_read(&kvm->slots_lock); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -2154,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret) { int r; - struct kvm_pv_mmu_op_buffer buffer; + struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; - buffer.ptr = buffer.buf; - buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); - buffer.processed = 0; + buffer->ptr = buffer->buf; + buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); + buffer->processed = 0; - r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); + r = kvm_read_guest(vcpu->kvm, 
addr, buffer->buf, buffer->len); if (r) goto out; - while (buffer.len) { - r = kvm_pv_mmu_op_one(vcpu, &buffer); + while (buffer->len) { + r = kvm_pv_mmu_op_one(vcpu, buffer); if (r < 0) goto out; if (r == 0) @@ -2174,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, r = 1; out: - *ret = buffer.processed; + *ret = buffer->processed; return r; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 1730757bbc7a..258e5d56298e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -15,7 +15,8 @@ #define PT_USER_MASK (1ULL << 2) #define PT_PWT_MASK (1ULL << 3) #define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_ACCESSED_SHIFT 5 +#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) #define PT_DIRTY_MASK (1ULL << 6) #define PT_PAGE_SIZE_MASK (1ULL << 7) #define PT_PAT_MASK (1ULL << 7) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 934c7b619396..613ec9aa674a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,11 +25,11 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 + #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 @@ -42,11 +42,11 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 + #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 @@ -73,6 +73,17 @@ struct guest_walker { u32 error_code; }; +struct shadow_walker { + struct kvm_shadow_walk walker; + struct guest_walker *guest_walker; + int user_fault; + int write_fault; + int largepage; + int *ptwrite; + pfn_t pfn; + u64 *sptep; +}; + static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, pt_element_t *table; struct page *page; - down_read(&current->mm->mmap_sem); page = gfn_to_page(kvm, table_gfn); - up_read(&current->mm->mmap_sem); table = kmap_atomic(page, KM_USER0); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); kvm_release_page_dirty(page); @@ -263,6 +270,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pfn = vcpu->arch.update_pte.pfn; if (is_error_pfn(pfn)) return; + if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) + return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), @@ -272,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, /* * Fetch a shadow pte for a specific level in the paging hierarchy.
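 * (the walk is driven by walk_shadow(), which invokes FNAME(shadow_walk_entry) once per level)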
*/ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int largepage, - int *ptwrite, pfn_t pfn) +static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, u64 addr, + u64 *sptep, int level) { - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; - - if (!is_present_pte(walker->ptes[walker->level - 1])) - return NULL; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; + struct shadow_walker *sw = + container_of(_sw, struct shadow_walker, walker); + struct guest_walker *gw = sw->guest_walker; + unsigned access = gw->pt_access; + struct kvm_mmu_page *shadow_page; + u64 spte; + int metaphysical; + gfn_t table_gfn; + int r; + pt_element_t curr_pte; + + if (level == PT_PAGE_TABLE_LEVEL + || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, + sw->user_fault, sw->write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, + false); + sw->sptep = sptep; + return 1; } - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (level == PT_PAGE_TABLE_LEVEL) - break; - - if (largepage && level == PT_DIRECTORY_LEVEL) - break; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return 0; - if (is_shadow_present_pte(*shadow_ent) - && !is_large_pte(*shadow_ent)) { - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - continue; - } + if (is_large_pte(*sptep)) { + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + rmap_remove(vcpu->kvm, sptep); + } - if (is_large_pte(*shadow_ent)) - rmap_remove(vcpu->kvm, shadow_ent); - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent); - if (!metaphysical) { - int r; - pt_element_t curr_pte; - r = kvm_read_guest_atomic(vcpu->kvm, - walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) { - kvm_release_pfn_clean(pfn); - return NULL; - } + if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, + metaphysical, access, sptep); + if (!metaphysical) { + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_release_pfn_clean(sw->pfn); + sw->sptep = NULL; + return 1; } - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *shadow_ent = shadow_pte; } - mmu_set_spte(vcpu, 
shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, walker->gfn, pfn, false); + spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + return 0; +} + +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *guest_walker, + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_walk_entry), }, + .guest_walker = guest_walker, + .user_fault = user_fault, + .write_fault = write_fault, + .largepage = largepage, + .ptwrite = ptwrite, + .pfn = pfn, + }; + + if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) + return NULL; - return shadow_ent; + walk_shadow(&walker.walker, vcpu, addr); + + return walker.sptep; } /* @@ -380,6 +392,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int r; pfn_t pfn; int largepage = 0; + unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); @@ -404,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - down_read(&current->mm->mmap_sem); if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); @@ -413,8 +425,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, largepage = 1; } } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); - up_read(&current->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -424,6 +437,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, largepage, &write_pt, pfn); @@ -439,6 +454,36 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; +} + +static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, u64 addr, + u64 *sptep, int level) +{ + + if (level == PT_PAGE_TABLE_LEVEL) { + if (is_shadow_present_pte(*sptep)) + rmap_remove(vcpu->kvm, sptep); + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + return 1; + } + if (!is_shadow_present_pte(*sptep)) + return 1; + return 0; +} + +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_invlpg_entry), }, + }; + + walk_shadow(&walker.walker, vcpu, gva); } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -460,8 +505,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - int i, offset = 0, r = 0; - pt_element_t pt; + int i, j, offset, r; + pt_element_t pt[256 / sizeof(pt_element_t)]; + gpa_t pte_gpa; if (sp->role.metaphysical || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { @@ -469,28 +515,83 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, return; } + pte_gpa = gfn_to_gpa(sp->gfn); + if (PTTYPE == 32) { + offset = sp->role.quadrant << PT64_LEVEL_BITS; + pte_gpa += offset * sizeof(pt_element_t); + } + + for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) { + r =
kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); + pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); + for (j = 0; j < ARRAY_SIZE(pt); ++j) + if (r || is_present_pte(pt[j])) + sp->spt[i+j] = shadow_trap_nonpresent_pte; + else + sp->spt[i+j] = shadow_notrap_nonpresent_pte; + } +} + +/* + * Using the cached information from sp->gfns is safe because: + * - The spte has a reference to the struct page, so the pfn for a given gfn + * can't change unless all sptes pointing to it are nuked first. + * - Alias changes zap the entire shadow cache. + */ +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int i, offset, nr_present; + + offset = nr_present = 0; + if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - gpa_t pte_gpa = gfn_to_gpa(sp->gfn); + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn = sp->gfns[i]; + + if (!is_shadow_present_pte(sp->spt[i])) + continue; + + pte_gpa = gfn_to_gpa(sp->gfn); pte_gpa += (i+offset) * sizeof(pt_element_t); - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, - sizeof(pt_element_t)); - if (r || is_present_pte(pt)) - sp->spt[i] = shadow_trap_nonpresent_pte; - else - sp->spt[i] = shadow_notrap_nonpresent_pte; + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -EINVAL; + + if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + !(gpte & PT_ACCESSED_MASK)) { + u64 nonpresent; + + rmap_remove(vcpu->kvm, &sp->spt[i]); + if (is_present_pte(gpte)) + nonpresent = shadow_trap_nonpresent_pte; + else + nonpresent = shadow_notrap_nonpresent_pte; + set_shadow_pte(&sp->spt[i], nonpresent); + continue; + } + + nr_present++; + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + is_dirty_pte(gpte), 0, gfn, + spte_to_pfn(sp->spt[i]), true, false); } + + return !nr_present; } #undef pt_element_t #undef guest_walker +#undef shadow_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_LEVEL_BITS diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6b0d5fa5bab3..9c4ce657d963 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -18,6 +18,7 @@ #include "kvm_svm.h" #include "irq.h" #include "mmu.h" +#include "kvm_cache_regs.h" #include <linux/module.h> #include <linux/kernel.h> @@ -27,16 +28,14 @@ #include <asm/desc.h> +#define __ex(x) __kvm_handle_fault_on_reboot(x) + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -#define DB_VECTOR 1 -#define UD_VECTOR 6 -#define GP_VECTOR 13 - #define DR7_GD_MASK (1 << 13) #define DR6_BD_MASK (1 << 13) @@ -45,7 +44,7 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) -#define SVM_DEATURE_SVML (1 << 2) +#define SVM_FEATURE_SVML (1 << 2) #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) @@ -60,6 +59,7 @@ static int npt = 1; module_param(npt, int, S_IRUGO); static void kvm_reput_irq(struct vcpu_svm *svm); +static void svm_flush_tlb(struct kvm_vcpu *vcpu); static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { @@ -129,17 +129,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) static inline void clgi(void) { - asm volatile (SVM_CLGI); + asm volatile (__ex(SVM_CLGI)); } static inline void stgi(void) { - asm volatile (SVM_STGI); + asm volatile (__ex(SVM_STGI)); } static inline 
void invlpga(unsigned long addr, u32 asid) { - asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); + asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); } static inline unsigned long kvm_read_cr2(void) @@ -233,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) printk(KERN_DEBUG "%s: NOP\n", __func__); return; } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", - __func__, - svm->vmcb->save.rip, - svm->next_rip); + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); - vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; + kvm_rip_write(vcpu, svm->next_rip); svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; vcpu->arch.interrupt_window_open = 1; @@ -270,19 +268,11 @@ static int has_svm(void) static void svm_hardware_disable(void *garbage) { - struct svm_cpu_data *svm_data - = per_cpu(svm_data, raw_smp_processor_id()); - - if (svm_data) { - uint64_t efer; + uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); - per_cpu(svm_data, raw_smp_processor_id()) = NULL; - __free_page(svm_data->save_area); - kfree(svm_data); - } + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); } static void svm_hardware_enable(void *garbage) @@ -321,6 +311,19 @@ static void svm_hardware_enable(void *garbage) page_to_pfn(svm_data->save_area) << PAGE_SHIFT); } +static void svm_cpu_uninit(int cpu) +{ + struct svm_cpu_data *svm_data + = per_cpu(svm_data, raw_smp_processor_id()); + + if (!svm_data) + return; + + per_cpu(svm_data, raw_smp_processor_id()) = NULL; + __free_page(svm_data->save_area); + kfree(svm_data); +} + static int svm_cpu_init(int cpu) { struct svm_cpu_data *svm_data; @@ -446,7 +449,8 @@ static __init int svm_hardware_setup(void) if (npt_enabled) { printk(KERN_INFO "kvm: Nested Paging enabled\n"); kvm_enable_tdp(); - } + } else + kvm_disable_tdp(); return 0; @@ -458,6 +462,11 @@ err: static __exit void svm_hardware_unsetup(void) { + int cpu; + + for_each_online_cpu(cpu) + svm_cpu_uninit(cpu); + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); iopm_base = 0; } @@ -516,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPG) | (1ULL << INTERCEPT_INVLPGA) | (1ULL << INTERCEPT_IOIO_PROT) | (1ULL << INTERCEPT_MSR_PROT) | @@ -567,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; + svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* * cr0 val on cpu init should be 0x60000010, we enable cpu @@ -579,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); + control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| INTERCEPT_CR3_MASK); @@ -601,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); if (vcpu->vcpu_id != 0) { - svm->vmcb->save.rip = 0; + kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } + vcpu->arch.regs_avail = 
~0; + vcpu->arch.regs_dirty = ~0; return 0; } @@ -707,27 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) rdtscll(vcpu->arch.host_tsc); } -static void svm_vcpu_decache(struct kvm_vcpu *vcpu) -{ -} - -static void svm_cache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.rip = svm->vmcb->save.rip; -} - -static void svm_decache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.rip; -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; } @@ -869,6 +862,10 @@ set: static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + + if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) + force_new_asid(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@ -949,7 +946,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) { - return to_svm(vcpu)->db_regs[dr]; + unsigned long val = to_svm(vcpu)->db_regs[dr]; + KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); + return val; } static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, @@ -997,13 +996,35 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; + bool event_injection = false; if (!irqchip_in_kernel(kvm) && - is_external_interrupt(exit_int_info)) + is_external_interrupt(exit_int_info)) { + event_injection = true; push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + } fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; + + if (!npt_enabled) + KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, + (u32)fault_address, (u32)(fault_address >> 32), + handler); + else + KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, + (u32)fault_address, (u32)(fault_address >> 32), + handler); + /* + * FIXME: This shouldn't be necessary here, but there is a flush + missing in the MMU code.
Until we find this bug, flush the + * complete TLB here on an NPF + */ + if (npt_enabled) + svm_flush_tlb(&svm->vcpu); + + if (!npt_enabled && event_injection) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } @@ -1081,6 +1102,19 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); } +static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + KVMTRACE_0D(NMI, &svm->vcpu, handler); + return 1; +} + +static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + ++svm->vcpu.stat.irq_exits; + KVMTRACE_0D(INTR, &svm->vcpu, handler); + return 1; +} + static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { return 1; @@ -1088,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 3; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); kvm_emulate_hypercall(&svm->vcpu); return 1; @@ -1127,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm, static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; kvm_emulate_cpuid(&svm->vcpu); return 1; } +static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); + return 1; +} + static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1219,9 +1260,12 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (svm_get_msr(&svm->vcpu, ecx, &data)) kvm_inject_gp(&svm->vcpu, 0); else { - svm->vmcb->save.rax = data & 0xffffffff; + KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, + (u32)(data >> 32), handler); + + svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } return 1; @@ -1284,16 +1328,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: /* - * only support writing 0 to the performance counters for now - * to make Windows happy. Should be replaced by a real - * performance counter emulation later. 
+ * Just discard all writes to the performance counters; this + * should keep both older linux and windows 64-bit guests + * happy */ - if (data != 0) - goto unhandled; + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); + break; default: - unhandled: return kvm_set_msr_common(vcpu, ecx, data); } return 0; @@ -1302,9 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data = (svm->vmcb->save.rax & -1u) + u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - svm->next_rip = svm->vmcb->save.rip + 2; + + KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), + handler); + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) kvm_inject_gp(&svm->vcpu, 0); else @@ -1323,6 +1374,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -1364,8 +1417,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, - [SVM_EXIT_INTR] = nop_on_interception, - [SVM_EXIT_NMI] = nop_on_interception, + [SVM_EXIT_INTR] = intr_interception, + [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, @@ -1373,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, - [SVM_EXIT_INVLPG] = emulate_on_interception, + [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invalid_op_interception, [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, @@ -1397,6 +1450,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; + KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, + (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -1470,6 +1526,9 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; + KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + + ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -1648,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +#ifdef CONFIG_X86_64 +#define R "r" +#else +#define R "e" +#endif + static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1655,14 +1720,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 gs_selector; u16 ldt_selector; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + 
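+ /* With svm_cache_regs()/svm_decache_regs() gone, vcpu->arch.regs is the authoritative register cache: rax, rsp and rip are copied into the VMCB here and read back out after VMRUN returns below. */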
pre_svm_run(svm); sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = read_fs(); - gs_selector = read_gs(); - ldt_selector = read_ldt(); + fs_selector = kvm_read_fs(); + gs_selector = kvm_read_gs(); + ldt_selector = kvm_read_ldt(); svm->host_cr2 = kvm_read_cr2(); svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); @@ -1682,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_enable(); asm volatile ( + "push %%"R"bp; \n\t" + "mov %c[rbx](%[svm]), %%"R"bx \n\t" + "mov %c[rcx](%[svm]), %%"R"cx \n\t" + "mov %c[rdx](%[svm]), %%"R"dx \n\t" + "mov %c[rsi](%[svm]), %%"R"si \n\t" + "mov %c[rdi](%[svm]), %%"R"di \n\t" + "mov %c[rbp](%[svm]), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "push %%rbp; \n\t" -#else - "push %%ebp; \n\t" -#endif - -#ifdef CONFIG_X86_64 - "mov %c[rbx](%[svm]), %%rbx \n\t" - "mov %c[rcx](%[svm]), %%rcx \n\t" - "mov %c[rdx](%[svm]), %%rdx \n\t" - "mov %c[rsi](%[svm]), %%rsi \n\t" - "mov %c[rdi](%[svm]), %%rdi \n\t" - "mov %c[rbp](%[svm]), %%rbp \n\t" "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" "mov %c[r10](%[svm]), %%r10 \n\t" @@ -1703,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%[svm]), %%r13 \n\t" "mov %c[r14](%[svm]), %%r14 \n\t" "mov %c[r15](%[svm]), %%r15 \n\t" -#else - "mov %c[rbx](%[svm]), %%ebx \n\t" - "mov %c[rcx](%[svm]), %%ecx \n\t" - "mov %c[rdx](%[svm]), %%edx \n\t" - "mov %c[rsi](%[svm]), %%esi \n\t" - "mov %c[rdi](%[svm]), %%edi \n\t" - "mov %c[rbp](%[svm]), %%ebp \n\t" #endif -#ifdef CONFIG_X86_64 - /* Enter guest mode */ - "push %%rax \n\t" - "mov %c[vmcb](%[svm]), %%rax \n\t" - SVM_VMLOAD "\n\t" - SVM_VMRUN "\n\t" - SVM_VMSAVE "\n\t" - "pop %%rax \n\t" -#else /* Enter guest mode */ - "push %%eax \n\t" - "mov %c[vmcb](%[svm]), %%eax \n\t" - SVM_VMLOAD "\n\t" - SVM_VMRUN "\n\t" - SVM_VMSAVE "\n\t" - "pop %%eax \n\t" -#endif + "push %%"R"ax \n\t" + "mov %c[vmcb](%[svm]), %%"R"ax \n\t" + __ex(SVM_VMLOAD) "\n\t" + __ex(SVM_VMRUN) "\n\t" + __ex(SVM_VMSAVE) "\n\t" + "pop %%"R"ax \n\t" /* Save guest registers, load host registers */ + "mov %%"R"bx, %c[rbx](%[svm]) \n\t" + "mov %%"R"cx, %c[rcx](%[svm]) \n\t" + "mov %%"R"dx, %c[rdx](%[svm]) \n\t" + "mov %%"R"si, %c[rsi](%[svm]) \n\t" + "mov %%"R"di, %c[rdi](%[svm]) \n\t" + "mov %%"R"bp, %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 - "mov %%rbx, %c[rbx](%[svm]) \n\t" - "mov %%rcx, %c[rcx](%[svm]) \n\t" - "mov %%rdx, %c[rdx](%[svm]) \n\t" - "mov %%rsi, %c[rsi](%[svm]) \n\t" - "mov %%rdi, %c[rdi](%[svm]) \n\t" - "mov %%rbp, %c[rbp](%[svm]) \n\t" "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" "mov %%r10, %c[r10](%[svm]) \n\t" @@ -1746,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" - - "pop %%rbp; \n\t" -#else - "mov %%ebx, %c[rbx](%[svm]) \n\t" - "mov %%ecx, %c[rcx](%[svm]) \n\t" - "mov %%edx, %c[rdx](%[svm]) \n\t" - "mov %%esi, %c[rsi](%[svm]) \n\t" - "mov %%edi, %c[rdi](%[svm]) \n\t" - "mov %%ebp, %c[rbp](%[svm]) \n\t" - - "pop %%ebp; \n\t" #endif + "pop %%"R"bp" : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -1778,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" + , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , 
"r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx" , "esi", "edi" #endif ); @@ -1790,14 +1825,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) load_db_regs(svm->host_db_regs); vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; write_dr6(svm->host_dr6); write_dr7(svm->host_dr7); kvm_write_cr2(svm->host_cr2); - load_fs(fs_selector); - load_gs(gs_selector); - load_ldt(ldt_selector); + kvm_load_fs(fs_selector); + kvm_load_gs(gs_selector); + kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); reload_tss(vcpu); @@ -1811,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; } +#undef R + static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1889,7 +1929,6 @@ static struct kvm_x86_ops svm_x86_ops = { .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .vcpu_decache = svm_vcpu_decache, .set_guest_debug = svm_guest_debug, .get_msr = svm_get_msr, @@ -1910,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, - .cache_regs = svm_cache_regs, - .decache_regs = svm_decache_regs, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 10ce6ee4c491..2643b430d83a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,10 +26,14 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/moduleparam.h> +#include "kvm_cache_regs.h" +#include "x86.h" #include <asm/io.h> #include <asm/desc.h> +#define __ex(x) __kvm_handle_fault_on_reboot(x) + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -45,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); static int enable_ept = 1; module_param(enable_ept, bool, 0); +static int emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -53,6 +60,8 @@ struct vmcs { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct list_head local_vcpus_link; + unsigned long host_rsp; int launched; u8 fail; u32 idt_vectoring_info; @@ -80,6 +89,7 @@ struct vcpu_vmx { } irq; } rmode; int vpid; + bool emulation_required; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -88,9 +98,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) } static int init_rmode(struct kvm *kvm); +static u64 construct_eptp(unsigned long root_hpa); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; @@ -260,6 +272,11 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID); } +static inline int cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -278,7 +295,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 gva; } operand = { vpid, 0, gva }; - asm volatile (ASM_VMX_INVVPID + asm volatile (__ex(ASM_VMX_INVVPID) /* CF==1 or ZF==1 --> rc = -1 */ "; ja 1f ; ud2 ; 1:" : : "a"(&operand), "c"(ext) : "cc", "memory"); @@ -290,7 +307,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) u64 
eptp, gpa; } operand = {eptp, gpa}; - asm volatile (ASM_VMX_INVEPT + asm volatile (__ex(ASM_VMX_INVEPT) /* CF==1 or ZF==1 --> rc = -1 */ "; ja 1f ; ud2 ; 1:\n" : : "a" (&operand), "c" (ext) : "cc", "memory"); @@ -311,7 +328,7 @@ static void vmcs_clear(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); u8 error; - asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory"); if (error) @@ -329,6 +346,9 @@ static void __vcpu_clear(void *arg) if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; rdtscll(vmx->vcpu.arch.host_tsc); + list_del(&vmx->local_vcpus_link); + vmx->vcpu.cpu = -1; + vmx->launched = 0; } static void vcpu_clear(struct vcpu_vmx *vmx) @@ -336,7 +356,6 @@ static void vcpu_clear(struct vcpu_vmx *vmx) if (vmx->vcpu.cpu == -1) return; smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); - vmx->launched = 0; } static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) @@ -378,7 +397,7 @@ static unsigned long vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (ASM_VMX_VMREAD_RDX_RAX + asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) : "=a"(value) : "d"(field) : "cc"); return value; } @@ -413,7 +432,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) { u8 error; - asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" : "=q"(error) : "a"(value), "d"(field) : "cc"); if (unlikely(error)) vmwrite_error(field, value); @@ -431,10 +450,8 @@ static void vmcs_write32(unsigned long field, u32 value) static void vmcs_write64(unsigned long field, u64 value) { -#ifdef CONFIG_X86_64 - vmcs_writel(field, value); -#else vmcs_writel(field, value); +#ifndef CONFIG_X86_64 asm volatile (""); vmcs_writel(field+1, value >> 32); #endif @@ -458,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) - eb |= 1u << 1; + eb |= 1u << DB_VECTOR; if (vcpu->arch.rmode.active) eb = ~0; if (vm_need_ept()) @@ -474,7 +491,7 @@ static void reload_tss(void) struct descriptor_table gdt; struct desc_struct *descs; - get_gdt(&gdt); + kvm_get_gdt(&gdt); descs = (void *)gdt.base; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); @@ -530,9 +547,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. 
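 * (a selector's low three bits are its RPL and TI flag, which is why the "& 7" tests below accept only RPL-0 GDT selectors)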
*/ - vmx->host_state.ldt_sel = read_ldt(); + vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = read_fs(); + vmx->host_state.fs_sel = kvm_read_fs(); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -540,7 +557,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = read_gs(); + vmx->host_state.gs_sel = kvm_read_gs(); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -576,15 +593,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - load_fs(vmx->host_state.fs_sel); + kvm_load_fs(vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { - load_ldt(vmx->host_state.ldt_sel); + kvm_load_ldt(vmx->host_state.ldt_sel); /* * If we have to reload gs, we must take care to * preserve our gs base. */ local_irq_save(flags); - load_gs(vmx->host_state.gs_sel); + kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif @@ -617,13 +634,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu_clear(vmx); kvm_migrate_timers(vcpu); vpid_sync_vcpu_all(vmx); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { u8 error; per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc"); if (error) @@ -640,8 +661,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Linux uses per-cpu TSS and GDT, so set these when switching * processors. */ - vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ - get_gdt(&dt); + vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ + kvm_get_gdt(&dt); vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); @@ -684,11 +705,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); } -static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) -{ - vcpu_clear(to_vmx(vcpu)); -} - static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { return vmcs_readl(GUEST_RFLAGS); @@ -706,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) unsigned long rip; u32 interruptibility; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs_writel(GUEST_RIP, rip); + kvm_rip_write(vcpu, rip); /* * We emulated an instruction, so temporary interrupt blocking @@ -724,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = nr; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (nr == BP_VECTOR) + vmx->rmode.irq.rip++; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + nr | INTR_TYPE_SOFT_INTR + | (has_error_code ? 
INTR_INFO_DELIVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); } static bool vmx_exception_injected(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + return false; } /* @@ -913,6 +945,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_TIME_STAMP_COUNTER: guest_write_tsc(data); break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + /* + * Just discard all writes to the performance counters; this + * should keep both older linux and windows 64-bit guests + * happy + */ + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); + + break; default: vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); @@ -926,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) return ret; } -/* - * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. - */ -static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); -} - -/* - * Syncs rsp and rip back into the vmcs. Should be called after possible - * modification. - */ -static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + default: + break; + } } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -986,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field; - - idtv_info_field = vmx->idt_vectoring_info; - if (idtv_info_field & INTR_INFO_VALID_MASK) { - if (is_external_interrupt(idtv_info_field)) - return idtv_info_field & VECTORING_INFO_VECTOR_MASK; - else - printk(KERN_DEBUG "pending exception: not handled yet\n"); - } - return -1; + if (!vcpu->arch.interrupt.pending) + return -1; + return vcpu->arch.interrupt.nr; } static __init int cpu_has_kvm_support(void) @@ -1010,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - == MSR_IA32_FEATURE_CONTROL_LOCKED; + return (msr & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + == FEATURE_CONTROL_LOCKED; /* locked but not enabled */ } @@ -1022,23 +1053,36 @@ static void hardware_enable(void *garbage) u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old; + INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - != 
(MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + if ((old & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + != (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); } +static void vmclear_local_vcpus(void) +{ + int cpu = raw_smp_processor_id(); + struct vcpu_vmx *vmx, *n; + + list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), + local_vcpus_link) + __vcpu_clear(vmx); +} + static void hardware_disable(void *garbage) { - asm volatile (ASM_VMX_VMXOFF : : : "cc"); + vmclear_local_vcpus(); + asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); write_cr4(read_cr4() & ~X86_CR4_VMXE); } @@ -1072,7 +1116,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmentry_control = 0; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = 0; + opt = PIN_BASED_VIRTUAL_NMIS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -1086,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING; + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -1115,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses don't need to cause VM Exits when EPT enabled */ + /* CR3 accesses and invlpg don't need to cause VM Exits when EPT + enabled */ min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -1254,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); @@ -1271,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + return; + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); @@ -1311,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); @@ -1333,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + goto continue_rmode; + vmcs_write16(GUEST_SS_SELECTOR, 
vmcs_readl(GUEST_SS_BASE) >> 4); vmcs_write32(GUEST_SS_LIMIT, 0xffff); vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); @@ -1348,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); +continue_rmode: kvm_mmu_reset_context(vcpu); init_rmode(vcpu->kvm); } @@ -1389,6 +1447,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_vcpu_all(to_vmx(vcpu)); + if (vm_need_ept()) + ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) @@ -1420,7 +1480,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_config.cpu_based_exec_ctrl | + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; @@ -1430,7 +1490,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_config.cpu_based_exec_ctrl & + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; @@ -1679,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SELECTOR_RPL_MASK; + + if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.dpl > cs_rpl) + return false; + } else if (cs.type & AR_TYPE_CODE_MASK) { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SELECTOR_RPL_MASK; + + if ((ss.type != 3) && (ss.type != 7)) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SELECTOR_RPL_MASK; + + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.selector &
SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if ((tr.type != 3) && (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SELECTOR_RPL_MASK) == + (ss.selector & SELECTOR_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +static bool guest_state_valid(struct kvm_vcpu *vcpu) +{ + /* real mode guest state checks */ + if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; @@ -1690,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm) if (r < 0) goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(u16)); if (r < 0) goto out; r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); @@ -1753,7 +1994,7 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0x93); + vmcs_write32(sf->ar_bytes, 0xf3); } static int alloc_apic_access_page(struct kvm *kvm) @@ -1772,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - down_read(&current->mm->mmap_sem); kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); - up_read(&current->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; @@ -1796,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) if (r) goto out; - down_read(&current->mm->mmap_sem); kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); - up_read(&current->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; @@ -1821,7 +2058,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) +static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) { void *va;
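In the real-mode case, the validity helpers added above all reduce to one invariant: a vm86 segment must mirror its selector before VMX will accept the guest state. A minimal standalone sketch of that invariant (not part of the patch; the struct and function names below are hypothetical stand-ins for the struct kvm_segment fields that rmode_segment_valid() checks):

/* Sketch only: the vm86 segment invariant encoded by rmode_segment_valid().
 * 'struct seg' is a hypothetical stand-in for struct kvm_segment. */
struct seg {
	unsigned short selector;
	unsigned long base;
	unsigned int limit;
	unsigned int ar;	/* access-rights byte */
};

static int vm86_seg_ok(const struct seg *s)
{
	return s->base == ((unsigned long)s->selector << 4)	/* base = selector * 16 */
		&& s->limit == 0xffff				/* 64 KiB limit */
		&& s->ar == 0xf3;	/* present, DPL 3, accessed, writable data */
}

If any segment fails this test (or the protected-mode checks fail), guest_state_valid() returns false and the patch falls back to instruction emulation via handle_invalid_guest_state() below.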
@@ -1881,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } if (!vm_need_ept()) exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING; + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); if (cpu_has_secondary_exec_ctrls()) { @@ -1907,8 +2145,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); @@ -1922,7 +2160,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - get_idt(&dt); + kvm_get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); @@ -1983,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); down_read(&vcpu->kvm->slots_lock); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; @@ -2000,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) fx_init(&vmx->vcpu); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. @@ -2011,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); @@ -2036,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RFLAGS, 0x02); if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); + kvm_rip_write(vcpu, 0xfff0); else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); + kvm_rip_write(vcpu, 0); + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); @@ -2089,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ret = 0; + /* HACK: Don't enable emulation on guest boot/reset */ + vmx->emulation_required = 0; + out: up_read(&vcpu->kvm->slots_lock); return ret; @@ -2100,20 +2341,27 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + ++vcpu->stat.irq_injections; if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); + vmx->rmode.irq.rip = kvm_rip_read(vcpu); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +{ + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); +} + static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { int 
word_index = __ffs(vcpu->arch.irq_summary); @@ -2123,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); if (!vcpu->arch.irq_pending[word_index]) clear_bit(word_index, &vcpu->arch.irq_summary); - vmx_inject_irq(vcpu, irq); + kvm_queue_interrupt(vcpu, irq); } @@ -2137,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && - !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ + vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) kvm_do_inject_irq(vcpu); + if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) @@ -2194,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->arch.rmode.active) - return 0; - /* * Instruction with address size override prefix opcode 0x67 * Causes the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) return 1; + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework.
+ */ + switch (vec) { + case DE_VECTOR: + case DB_VECTOR: + case BP_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + kvm_queue_exception(vcpu, vec); + return 1; + } return 0; } @@ -2245,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } error_code = 0; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -2255,6 +2518,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); + if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) + kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } @@ -2341,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), + handler); switch (cr) { case 0: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 3: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 4: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; @@ -2370,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) }; break; case 2: /* clts */ - vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); @@ -2381,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 1: /*mov from cr*/ switch (cr) { case 3: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, vcpu->arch.cr3); KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), handler); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], handler); + (u32)kvm_register_read(vcpu, reg), handler); skip_emulated_instruction(vcpu); return 1; } @@ -2427,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & 7; reg = (exit_qualification >> 8) & 15; - vcpu_load_rsp_rip(vcpu); if (exit_qualification & 16) { /* mov from dr */ switch (dr) { @@ -2440,12 
+2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->arch.regs[reg] = val; + kvm_register_write(vcpu, reg, val); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { /* mov to dr */ } - vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; } @@ -2538,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); @@ -2554,8 +2819,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_read64(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; - KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler); - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); if (er != EMULATE_DONE) { @@ -2639,6 +2902,56 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 cpu_based_vm_exec_control; + + /* clear pending NMI */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + ++vcpu->stat.nmi_window_exits; + + return 1; +} + +static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int err; + + preempt_enable(); + local_irq_enable(); + + while (!guest_state_valid(vcpu)) { + err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + switch (err) { + case EMULATE_DONE: + break; + case EMULATE_DO_MMIO: + kvm_report_emulation_failure(vcpu, "mmio"); + /* TODO: Handle MMIO */ + return; + default: + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; + } + + if (signal_pending(current)) + break; + if (need_resched()) + schedule(); + } + + local_irq_disable(); + preempt_disable(); + + /* Guest state should be valid now, no more emulation should be needed */ + vmx->emulation_required = 0; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -2649,6 +2962,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, @@ -2657,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, @@ -2678,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), - (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); + KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), + (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); /* Accesses to CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ @@ -2736,64 +3051,128 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } -static void vmx_intr_assist(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field; - int has_ext_irq, interrupt_window_open; - int vector; - - update_tpr_threshold(vcpu); + u32 cpu_based_vm_exec_control; - has_ext_irq = kvm_cpu_has_interrupt(vcpu); - intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmx->idt_vectoring_info; - if (intr_info_field & INTR_INFO_VALID_MASK) { - if (idtv_info_field & INTR_INFO_VALID_MASK) { - /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); - } - if (has_ext_irq) - enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis()) return; - } - if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; - vmx_inject_irq(vcpu, vect); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); - return; - } + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} - KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); +static int vmx_nmi_enabled(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + return !(guest_intr & (GUEST_INTR_STATE_NMI | + GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_STI)); +} - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); +static int vmx_irq_enabled(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_STI)) && + (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); +} - if (unlikely(idtv_info_field &
INTR_INFO_DELIVER_CODE_MASK)) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - if (unlikely(has_ext_irq)) +static void enable_intr_window(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); +} + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info; + u32 idt_vectoring_info; + bool unblock_nmi; + u8 vector; + int type; + bool idtv_info_valid; + u32 error; + + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 25.7.1.2 + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + */ + if (unblock_nmi && vector != DF_VECTOR) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } + + idt_vectoring_info = vmx->idt_vectoring_info; + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + if (vmx->vcpu.arch.nmi_injected) { + /* + * SDM 3: 25.7.1.2 + * Clear bit "block by NMI" before VM entry if an NMI delivery + * faulted. + */ + if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->vcpu.arch.nmi_injected = false; + } + kvm_clear_exception_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + error = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, error); + } else + kvm_queue_exception(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } + kvm_clear_interrupt_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { + kvm_queue_interrupt(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } +} + +static void vmx_intr_assist(struct kvm_vcpu *vcpu) +{ + update_tpr_threshold(vcpu); + + if (cpu_has_virtual_nmis()) { + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vmx_nmi_enabled(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_intr_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + enable_intr_window(vcpu); + return; + } + } + if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { + if (vmx_irq_enabled(vcpu)) + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); + else enable_irq_window(vcpu); - return; } - if (!has_ext_irq) - return; - interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - if (interrupt_window_open) { - vector = kvm_cpu_get_interrupt(vcpu); - vmx_inject_irq(vcpu, vector); - kvm_timer_intr_post(vcpu, vector); - } else - enable_irq_window(vcpu); + if (vcpu->arch.interrupt.pending) { + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + } } /* @@ -2805,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) static void fixup_rmode_irq(struct vcpu_vmx *vmx) { vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); if
(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; @@ -2819,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | vmx->rmode.irq.vector; } +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Handle invalid guest state instead of entering VMX */ + if (vmx->emulation_required && emulate_invalid_guest_state) { + handle_invalid_guest_state(vcpu, kvm_run); + return; + } + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* * Loading guest fpu may have cleared host cr0.ts */ @@ -2831,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm( /* Store host registers */ -#ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" - "push %%rcx \n\t" -#else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif - ASM_VMX_VMWRITE_RSP_RDX "\n\t" + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" + __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" /* Check if vmlaunch or vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" - "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" "mov %c[r10](%0), %%r10 \n\t" @@ -2859,34 +3256,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%0), %%r13 \n\t" "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ -#else - "mov %c[cr2](%0), %%eax \n\t" - "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ #endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + /* Enter guest mode */ "jne .Llaunched \n\t" - ASM_VMX_VMLAUNCH "\n\t" + __ex(ASM_VMX_VMLAUNCH) "\n\t" "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " ASM_VMX_VMRESUME "\n\t" + ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp);
popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" "mov %%r10, %c[r10](%0) \n\t" @@ -2895,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" - "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" - - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" -#else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" - "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" - - "pop %%ebp; pop %%ebp; pop %%edx \n\t" #endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), @@ -2936,20 +3312,22 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" + , R"bx", R"di", R"si" #ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" #endif ); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_dirty = 0; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0; asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; @@ -2957,18 +3335,24 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && + (intr_info & INTR_INFO_VALID_MASK)) { KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); } + + vmx_complete_interrupts(vmx); } +#undef R +#undef Q + static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 1); + vcpu_clear(vmx); free_vmcs(vmx->vmcs); vmx->vmcs = NULL; } @@ -2999,15 +3383,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return ERR_PTR(-ENOMEM); allocate_vpid(vmx); - if (id == 0 && vm_need_ept()) { - kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | - VMX_EPT_WRITABLE_MASK | - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); - kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, - VMX_EPT_FAKE_DIRTY_MASK, 0ull, - VMX_EPT_EXECUTABLE_MASK); - kvm_enable_tdp(); - } err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -3095,7 +3470,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .prepare_guest_switch = vmx_save_host_state, .vcpu_load = 
vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .vcpu_decache = vmx_vcpu_decache, .set_guest_debug = set_guest_debug, .guest_debug_pre = kvm_guest_debug_pre, @@ -3115,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .cache_regs = vcpu_load_rsp_rip, - .decache_regs = vcpu_put_rsp_rip, + .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -3187,8 +3560,16 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); - if (cpu_has_vmx_ept()) + if (vm_need_ept()) { bypass_guest_pf = 0; + kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | + VMX_EPT_WRITABLE_MASK | + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, + VMX_EPT_EXECUTABLE_MASK); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 79d94c610dfe..3e010d21fdd7 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -40,6 +40,7 @@ #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 @@ -216,7 +217,7 @@ enum vmcs_field { #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_PENDING_INTERRUPT 7 - +#define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 #define EXIT_REASON_HLT 12 @@ -251,7 +252,9 @@ enum vmcs_field { #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ #define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */ #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ +#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000 #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK @@ -259,9 +262,16 @@ enum vmcs_field { #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +/* GUEST_INTERRUPTIBILITY_INFO flags. 
*/ +#define GUEST_INTR_STATE_STI 0x00000001 +#define GUEST_INTR_STATE_MOV_SS 0x00000002 +#define GUEST_INTR_STATE_SMI 0x00000004 +#define GUEST_INTR_STATE_NMI 0x00000008 + /* * Exit Qualifications for MOV for Control Register Access */ @@ -321,24 +331,6 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define MSR_IA32_VMX_BASIC 0x480 -#define MSR_IA32_VMX_PINBASED_CTLS 0x481 -#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 -#define MSR_IA32_VMX_EXIT_CTLS 0x483 -#define MSR_IA32_VMX_ENTRY_CTLS 0x484 -#define MSR_IA32_VMX_MISC 0x485 -#define MSR_IA32_VMX_CR0_FIXED0 0x486 -#define MSR_IA32_VMX_CR0_FIXED1 0x487 -#define MSR_IA32_VMX_CR4_FIXED0 0x488 -#define MSR_IA32_VMX_CR4_FIXED1 0x489 -#define MSR_IA32_VMX_VMCS_ENUM 0x48a -#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b -#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c - -#define MSR_IA32_FEATURE_CONTROL 0x3a -#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 -#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 - #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 @@ -360,8 +352,6 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62) -#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0faa2546b1cd..4f0677d1eae8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4,10 +4,14 @@ * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. 
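The GUEST_INTR_STATE_* interruptibility flags introduced in the vmx.h hunk above are exactly what vmx_nmi_enabled() and vmx_irq_enabled() earlier in this patch test before injecting an event. A minimal sketch of that gating logic (standalone, not part of the patch; the helper names and plain u32 parameters are hypothetical stand-ins for vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) and the guest RFLAGS value):

/* Sketch only: event-injection gating on the guest interruptibility state. */
static int nmi_injectable(u32 intr_state)
{
	/* NMIs stay blocked while a prior NMI is in service or while the
	 * guest sits in an STI/MOV-SS interrupt shadow. */
	return !(intr_state & (GUEST_INTR_STATE_NMI |
			       GUEST_INTR_STATE_MOV_SS |
			       GUEST_INTR_STATE_STI));
}

static int irq_injectable(u32 intr_state, u32 rflags)
{
	/* External interrupts additionally require RFLAGS.IF, but are not
	 * blocked by an in-service NMI. */
	return (rflags & X86_EFLAGS_IF) &&
	       !(intr_state & (GUEST_INTR_STATE_MOV_SS |
			       GUEST_INTR_STATE_STI));
}

When the relevant predicate fails, vmx_intr_assist() arms the corresponding window exit (enable_nmi_window()/enable_irq_window()) and retries the injection on the next VM exit.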
@@ -19,14 +23,18 @@ #include "mmu.h" #include "i8254.h" #include "tss.h" +#include "kvm_cache_regs.h" +#include "x86.h" #include <linux/clocksource.h> +#include <linux/interrupt.h> #include <linux/kvm.h> #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/highmem.h> +#include <linux/intel-iommu.h> #include <asm/uaccess.h> #include <asm/msr.h> @@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); struct kvm_x86_ops *kvm_x86_ops; +EXPORT_SYMBOL_GPL(kvm_x86_ops); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -72,6 +81,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmio_exits", VCPU_STAT(mmio_exits) }, { "signal_exits", VCPU_STAT(signal_exits) }, { "irq_window", VCPU_STAT(irq_window_exits) }, + { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, @@ -82,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "fpu_reload", VCPU_STAT(fpu_reload) }, { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "irq_injections", VCPU_STAT(irq_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -89,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } }; - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -173,6 +184,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } +void kvm_inject_nmi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.nmi_pending = 1; +} +EXPORT_SYMBOL_GPL(kvm_inject_nmi); + void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { WARN_ON(vcpu->arch.exception.pending); @@ -345,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); return; } @@ -557,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + __func__, tsc_khz, hv_clock->tsc_shift, hv_clock->tsc_to_system_mul); } @@ -604,6 +622,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static bool msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return true; + case 0x2f8: + return true; + } + return false; +} + +static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (!msr_mtrr_valid(msr)) + return 1; + + vcpu->arch.mtrr[msr - 0x200] = data; + return 0; +} int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { @@ -623,10 +673,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", __func__, data); break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* Values other than LBR and BTF are vendor-specific, + thus reserved and should throw a #GP */ + return 1; + } + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: - case 0x200 ... 0x2ff: /* MTRRs */ break; + case 0x200 ... 0x2ff: + return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: kvm_set_apic_base(vcpu, data); break; @@ -652,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - down_read(&current->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - up_read(&current->mm->mmap_sem); if (is_error_page(vcpu->arch.time_page)) { kvm_release_page_clean(vcpu->arch.time_page); @@ -684,6 +745,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } +static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + if (!msr_mtrr_valid(msr)) + return 1; + + *pdata = vcpu->arch.mtrr[msr - 0x200]; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -703,13 +773,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+8: case MSR_IA32_MC0_MISC+12: case MSR_IA32_MC0_MISC+16: + case MSR_IA32_MC0_MISC+20: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: - /* MTRR registers */ - case 0xfe: - case 0x200 ... 0x2ff: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: data = 0; break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return get_msr_mtrr(vcpu, msr, pdata); case 0xcd: /* fsb frequency */ data = 3; break; @@ -817,41 +895,6 @@ out: return r; } -/* - * Make sure that a cpu that is being hot-unplugged does not have any vcpus - * cached on it. - */ -void decache_vcpus_on_cpu(int cpu) -{ - struct kvm *vm; - struct kvm_vcpu *vcpu; - int i; - - spin_lock(&kvm_lock); - list_for_each_entry(vm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = vm->vcpus[i]; - if (!vcpu) - continue; - /* - * If the vcpu is locked, then it is running on some - * other cpu and therefore it is not cached on the - * cpu in question. - * - * If it's not locked, check the last cpu it executed - * on.
- */ - if (mutex_trylock(&vcpu->mutex)) { - if (vcpu->cpu == cpu) { - kvm_x86_ops->vcpu_decache(vcpu); - vcpu->cpu = -1; - } - mutex_unlock(&vcpu->mutex); - } - } - spin_unlock(&kvm_lock); -} - int kvm_dev_ioctl_check_extension(long ext) { int r; @@ -867,8 +910,12 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: + case KVM_CAP_SYNC_MMU: r = 1; break; + case KVM_CAP_COALESCED_MMIO: + r = KVM_COALESCED_MMIO_PAGE_OFFSET; + break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; @@ -881,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PV_MMU: r = !tdp_enabled; break; + case KVM_CAP_IOMMU: + r = intel_iommu_found(); + break; default: r = 0; break; @@ -1283,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; + struct kvm_lapic_state *lapic = NULL; switch (ioctl) { case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; + lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + r = -ENOMEM; + if (!lapic) + goto out; + r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) + if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; } case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; - + lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + r = -ENOMEM; + if (!lapic) + goto out; r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) + if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); if (r) goto out; r = 0; @@ -1402,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: + if (lapic) + kfree(lapic); return r; } @@ -1476,6 +1533,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, goto out; down_write(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); p = &kvm->arch.aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; @@ -1487,6 +1545,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, break; kvm->arch.naliases = n; + spin_unlock(&kvm->mmu_lock); kvm_mmu_zap_all(kvm); up_write(&kvm->slots_lock); @@ -1608,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -EINVAL; + /* + * This union makes it completely explicit to gcc-3.x + * that these two variables' stack usage should be + * combined, not added together. 
+ */ + union { + struct kvm_pit_state ps; + struct kvm_memory_alias alias; + } u; switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -1639,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; - + case KVM_SET_MEMORY_ALIAS: r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) + if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); if (r) goto out; break; - } case KVM_CREATE_IRQCHIP: r = -ENOMEM; kvm->arch.vpic = kvm_create_pic(kvm); @@ -1677,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->arch.vioapic, - irq_event.irq, - irq_event.level); + kvm_set_irq(kvm, irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); r = 0; } @@ -1691,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto get_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + goto get_irqchip_out; + r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) - goto out; + goto get_irqchip_out; r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; + if (copy_to_user(argp, chip, sizeof *chip)) + goto get_irqchip_out; r = 0; + get_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto set_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + goto set_irqchip_out; + r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) - goto out; + goto set_irqchip_out; r = 0; + set_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_GET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_get_pit(kvm, &ps); + r = kvm_vm_ioctl_get_pit(kvm, &u.ps); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &ps, sizeof ps)) + if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) goto out; r = 0; break; } case KVM_SET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof u.ps)) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_set_pit(kvm, &ps); + r = kvm_vm_ioctl_set_pit(kvm, &u.ps); if (r) goto out; r = 0; @@ -1781,13 +1852,14 @@ static void kvm_init_msr_list(void) * Only apic needs an MMIO device hook, so shortcut now..
*/ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr) + gpa_t addr, int len, + int is_write) { struct kvm_io_device *dev; if (vcpu->arch.apic) { dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr)) + if (dev->in_range(dev, addr, len, is_write)) return dev; } return NULL; } @@ -1795,13 +1867,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) + gpa_t addr, int len, + int is_write) { struct kvm_io_device *dev; - dev = vcpu_find_pervcpu_dev(vcpu, addr); + dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); + dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, + is_write); return dev; } @@ -1869,7 +1943,7 @@ mmio: * Is this MMIO handled locally? */ mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); if (mmio_dev) { kvm_iodevice_read(mmio_dev, gpa, bytes, val); mutex_unlock(&vcpu->kvm->lock); @@ -1924,7 +1998,7 @@ mmio: * Is this MMIO handled locally? */ mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); if (mmio_dev) { kvm_iodevice_write(mmio_dev, gpa, bytes, val); mutex_unlock(&vcpu->kvm->lock); @@ -1993,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, val = *(u64 *)new; - down_read(&current->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(&current->mm->mmap_sem); kaddr = kmap_atomic(page, KM_USER0); set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); @@ -2015,11 +2087,13 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) { + kvm_mmu_invlpg(vcpu, address); return X86EMUL_CONTINUE; } int emulate_clts(struct kvm_vcpu *vcpu) { + KVMTRACE_0D(CLTS, vcpu, handler); kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); return X86EMUL_CONTINUE; } @@ -2053,21 +2127,19 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { - static int reported; u8 opcodes[4]; - unsigned long rip = vcpu->arch.rip; + unsigned long rip = kvm_rip_read(vcpu); unsigned long rip_linear; - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - if (reported) + if (!printk_ratelimit()) return; + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); + emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); - reported = 1; } EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); @@ -2078,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = { .cmpxchg_emulated = emulator_cmpxchg_emulated, }; +static void cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; +} + int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, @@ -2087,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int r; struct decode_cache *c; + kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; - kvm_x86_ops->cache_regs(vcpu); + /* + * TODO: fix x86_emulate.c to use guest_read/write_register + * instead of direct ->regs
accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); vcpu->mmio_is_write = 0; vcpu->arch.pio.string = 0; @@ -2105,27 +2192,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { - vcpu->arch.emulate_ctxt.cs_base = 0; - vcpu->arch.emulate_ctxt.ds_base = 0; - vcpu->arch.emulate_ctxt.es_base = 0; - vcpu->arch.emulate_ctxt.ss_base = 0; - } else { - vcpu->arch.emulate_ctxt.cs_base = - get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.emulate_ctxt.ds_base = - get_segment_base(vcpu, VCPU_SREG_DS); - vcpu->arch.emulate_ctxt.es_base = - get_segment_base(vcpu, VCPU_SREG_ES); - vcpu->arch.emulate_ctxt.ss_base = - get_segment_base(vcpu, VCPU_SREG_SS); - } - - vcpu->arch.emulate_ctxt.gs_base = - get_segment_base(vcpu, VCPU_SREG_GS); - vcpu->arch.emulate_ctxt.fs_base = - get_segment_base(vcpu, VCPU_SREG_FS); - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); /* Reject the instructions other than VMCALL/VMMCALL when @@ -2169,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (vcpu->mmio_is_write) { @@ -2222,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu) struct kvm_pio_request *io = &vcpu->arch.pio; long delta; int r; - - kvm_x86_ops->cache_regs(vcpu); + unsigned long val; if (!io->string) { - if (io->in) - memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, - io->size); + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } } else { if (io->in) { r = pio_copy_data(vcpu); - if (r) { - kvm_x86_ops->cache_regs(vcpu); + if (r) return r; - } } delta = 1; @@ -2245,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu) * The size of the register should really depend on * current address size. */ - vcpu->arch.regs[VCPU_REGS_RCX] -= delta; + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); } if (io->down) delta = -delta; delta *= io->size; - if (io->in) - vcpu->arch.regs[VCPU_REGS_RDI] += delta; - else - vcpu->arch.regs[VCPU_REGS_RSI] += delta; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } } - kvm_x86_ops->decache_regs(vcpu); - io->count -= io->cur_count; io->cur_count = 0; @@ -2300,15 +2369,17 @@ static void pio_string_write(struct kvm_io_device *pio_dev, } static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) + gpa_t addr, int len, + int is_write) { - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); } int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { struct kvm_io_device *pio_dev; + unsigned long val; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? 
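complete_pio() above now goes through the accessors, and for a non-string IN it deliberately reads RAX before overwriting only the low io->size bytes: an 8-bit or 16-bit IN must leave the upper register bits intact. A worked example with illustrative values:

	/* IN AL with RAX = 0x1122334455667788 and port data 0x9a */
	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
	memcpy(&val, vcpu->arch.pio_data, 1);		/* low byte only */
	kvm_register_write(vcpu, VCPU_REGS_RAX, val);	/* 0x112233445566779a */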
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -2329,13 +2400,12 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, handler); - kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); - kvm_x86_ops->decache_regs(vcpu); + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); kvm_x86_ops->skip_emulated_instruction(vcpu); - pio_dev = vcpu_find_pio_dev(vcpu, port); + pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); if (pio_dev) { kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); complete_pio(vcpu); @@ -2417,7 +2487,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } } - pio_dev = vcpu_find_pio_dev(vcpu, port); + pio_dev = vcpu_find_pio_dev(vcpu, port, + vcpu->arch.pio.cur_count, + !vcpu->arch.pio.in); if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); @@ -2487,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - up_read(&vcpu->kvm->slots_lock); - kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - return -EINTR; return 1; } else { vcpu->run->exit_reason = KVM_EXIT_HLT; @@ -2514,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int r = 1; - kvm_x86_ops->cache_regs(vcpu); - - nr = vcpu->arch.regs[VCPU_REGS_RAX]; - a0 = vcpu->arch.regs[VCPU_REGS_RBX]; - a1 = vcpu->arch.regs[VCPU_REGS_RCX]; - a2 = vcpu->arch.regs[VCPU_REGS_RDX]; - a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); @@ -2543,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } - vcpu->arch.regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; } @@ -2554,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; int ret = 0; + unsigned long rip = kvm_rip_read(vcpu); /* @@ -2563,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) */ kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->cache_regs(vcpu); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) + if (emulator_write_emulated(rip, instruction, 3, vcpu) != X86EMUL_CONTINUE) ret = -EFAULT; @@ -2600,27 +2664,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { + unsigned long value; + kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: - return vcpu->arch.cr0; + value = vcpu->arch.cr0; + break; case 2: - return vcpu->arch.cr2; + value = vcpu->arch.cr2; + break; case 3: - return vcpu->arch.cr3; + value = vcpu->arch.cr3; + break; case 4: - return vcpu->arch.cr4; + value = vcpu->arch.cr4; + break; case 8: - return kvm_get_cr8(vcpu); + value = kvm_get_cr8(vcpu); + break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); return 0; } + KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, + (u32)((u64)value >> 32), handler); + + return value; } void realmode_set_cr(struct kvm_vcpu 
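kvm_emulate_hypercall() above is the host half of the paravirtual ABI: the call number arrives in RAX, arguments in RBX, RCX, RDX and RSI, and the result goes back in RAX. A guest-side sketch of a two-argument call; real guests use the kvm_para.h wrappers, and kvm_fix_hypercall() patches the vmcall opcode to vmmcall on AMD, so the exact byte sequence below is only one of the two valid forms:

static inline long kvm_hypercall2(unsigned int nr,
				  unsigned long p1, unsigned long p2)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1), "c"(p2)
		     : "memory");
	return ret;
}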
*vcpu, int cr, unsigned long val, unsigned long *rflags) { + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, + (u32)((u64)val >> 32), handler); + switch (cr) { case 0: kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); @@ -2681,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) u32 function, index; struct kvm_cpuid_entry2 *e, *best; - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->arch.regs[VCPU_REGS_RAX]; - index = vcpu->arch.regs[VCPU_REGS_RCX]; - vcpu->arch.regs[VCPU_REGS_RAX] = 0; - vcpu->arch.regs[VCPU_REGS_RBX] = 0; - vcpu->arch.regs[VCPU_REGS_RCX] = 0; - vcpu->arch.regs[VCPU_REGS_RDX] = 0; + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { e = &vcpu->arch.cpuid_entries[i]; @@ -2705,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) best = e; } if (best) { - vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; - vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; - vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); KVMTRACE_5D(CPUID, vcpu, function, - (u32)vcpu->arch.regs[VCPU_REGS_RAX], - (u32)vcpu->arch.regs[VCPU_REGS_RBX], - (u32)vcpu->arch.regs[VCPU_REGS_RCX], - (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); + (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -2757,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; - down_read(&current->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&current->mm->mmap_sem); vcpu->arch.apic->vapic_page = page; } @@ -2771,32 +2845,16 @@ static void vapic_exit(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; + down_read(&vcpu->kvm->slots_lock); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + up_read(&vcpu->kvm->slots_lock); } -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - - down_read(&vcpu->kvm->slots_lock); - vapic_enter(vcpu); - -preempted: - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - -again: if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) kvm_mmu_unload(vcpu); @@ -2808,6 +2866,8 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); if
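The KVM_REQ_MMU_SYNC handling added above follows the usual deferred-work pattern: any context may set a bit in vcpu->requests, and vcpu_enter_guest() consumes it with test_and_clear_bit() at a point where acting on it is safe. The producer side is just a set_bit() plus a kick, for example:

	/* ask a vCPU to resync its shadow page-table roots */
	set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
	/* then force a guest exit (IPI/wakeup) so the bit is seen promptly */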
(test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, @@ -2833,21 +2893,15 @@ again: local_irq_disable(); - if (vcpu->requests || need_resched()) { + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); vcpu->guest_mode = 1; /* @@ -2896,8 +2950,8 @@ again: * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) { - kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); } if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) @@ -2906,31 +2960,66 @@ again: kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(kvm_run, vcpu); +out: + return r; +} - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) - goto again; +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_x86_ops->vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -out: - up_read(&vcpu->kvm->slots_lock); - if (r > 0) { - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - goto preempted; + down_read(&vcpu->kvm->slots_lock); + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu, kvm_run); + else { + up_read(&vcpu->kvm->slots_lock); + kvm_vcpu_block(vcpu); + down_read(&vcpu->kvm->slots_lock); + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) + r = -EINTR; + } + + if (r > 0) { + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); + } + } } + up_read(&vcpu->kvm->slots_lock); post_kvm_run_save(vcpu, kvm_run); - down_read(&vcpu->kvm->slots_lock); vapic_exit(vcpu); - up_read(&vcpu->kvm->slots_lock); return r; } @@ -2942,15 +3031,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load(vcpu); + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); - vcpu_put(vcpu); - return -EAGAIN; + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + r = -EAGAIN; + goto out; } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - /* re-sync apic's tpr */ if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); @@ -2980,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct 
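The rewritten __vcpu_run() above replaces the old preempted:/again: goto flow with a single loop. Condensed to its skeleton, with locking, the irq-window request and other error paths elided, it reads:

	while (r > 0) {
		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
			r = vcpu_enter_guest(vcpu, kvm_run);
		else {
			kvm_vcpu_block(vcpu);	/* halted: sleep for an event */
			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
				vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
		}
		if (signal_pending(current)) {
			r = -EINTR;		/* bounce out to userspace */
			kvm_run->exit_reason = KVM_EXIT_INTR;
		}
	}

This is also why kvm_emulate_halt() earlier in the patch can simply set KVM_MP_STATE_HALTED and return: the blocking moved out of the emulation path and into this loop.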
kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } #endif - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { - kvm_x86_ops->cache_regs(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; - kvm_x86_ops->decache_regs(vcpu); - } + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, + kvm_run->hypercall.ret); r = __vcpu_run(vcpu, kvm_run); @@ -3000,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - kvm_x86_ops->cache_regs(vcpu); - - regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); #ifdef CONFIG_X86_64 - regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; - regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; - regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; - regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; - regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; - regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; - regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; - regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); #endif - regs->rip = vcpu->arch.rip; + regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* @@ -3039,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; - vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); #ifdef CONFIG_X86_64 - vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; - vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; - vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; - vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; - vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; - vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; - vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; - 
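kvm_arch_vcpu_ioctl_get_regs() and ..._set_regs() above back the KVM_GET_REGS/KVM_SET_REGS vcpu ioctls. A minimal userspace sketch; the vcpu fd is assumed to come from the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence, which is not shown here:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <err.h>
#include <stdio.h>

static void dump_rip(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
		err(1, "KVM_GET_REGS");
	printf("rip=%llx rsp=%llx rflags=%llx\n",
	       regs.rip, regs.rsp, regs.rflags);
}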
vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); + #endif - vcpu->arch.rip = regs->rip; + kvm_rip_write(vcpu, regs->rip); kvm_x86_ops->set_rflags(vcpu, regs->rflags); - kvm_x86_ops->decache_regs(vcpu); vcpu->arch.exception.pending = false; @@ -3070,8 +3156,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -static void get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) { kvm_x86_ops->get_segment(vcpu, var, seg); } @@ -3080,7 +3166,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { struct kvm_segment cs; - get_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); *db = cs.db; *l = cs.l; } @@ -3094,15 +3180,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - get_segment(vcpu, &sregs->es, VCPU_SREG_ES); - get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); kvm_x86_ops->get_idt(vcpu, &dt); sregs->idt.limit = dt.limit; @@ -3154,7 +3240,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -static void set_segment(struct kvm_vcpu *vcpu, +static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { kvm_x86_ops->set_segment(vcpu, var, seg); @@ -3168,6 +3254,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, kvm_desct->base |= seg_desc->base2 << 24; kvm_desct->limit = seg_desc->limit0; kvm_desct->limit |= seg_desc->limit << 16; + if (seg_desc->g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } kvm_desct->selector = selector; kvm_desct->type = seg_desc->type; kvm_desct->present = seg_desc->p; @@ -3191,7 +3281,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, if (selector & 1 << 2) { struct kvm_segment kvm_seg; - get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); if (kvm_seg.unusable) dtable->limit = 0; @@ -3207,6 +3297,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { + gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3216,13 +3307,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 
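The seg_desct_to_kvm_desct() change above is a genuine fix, not churn: a descriptor's 20-bit limit field counts 4 KiB pages when the granularity bit is set, so the byte-granular limit is (limit << 12) | 0xfff. For the common flat segment:

	/* G=1, raw 20-bit limit 0xfffff */
	u32 limit = 0xfffff;
	limit = (limit << 12) | 0xfff;	/* 0xffffffff: a full 4 GiB segment */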
1; } - return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); + gpa += index * 8; + return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); } /* allowed just for 8 bytes segments */ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { + gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3230,7 +3324,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); + gpa += index * 8; + return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); } static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, @@ -3242,62 +3338,14 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, base_addr |= (seg_desc->base1 << 16); base_addr |= (seg_desc->base2 << 24); - return base_addr; -} - -static int load_tss_segment32(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_32 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_read_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_32)); -} - -static int save_tss_segment32(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_32 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_write_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_32)); -} - -static int load_tss_segment16(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_16 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_read_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_16)); -} - -static int save_tss_segment16(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_16 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_write_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_16)); + return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment kvm_seg; - get_segment(vcpu, &kvm_seg, seg); + kvm_get_segment(vcpu, &kvm_seg, seg); return kvm_seg.selector; } @@ -3313,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } -static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return 0; +} + +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + int type_bits, int seg) { struct kvm_segment kvm_seg; + if (!(vcpu->arch.cr0 & X86_CR0_PE)) + return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; kvm_seg.type |= type_bits; @@ -3327,7 +3397,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (!kvm_seg.s) kvm_seg.unusable = 1; - set_segment(vcpu, &kvm_seg, seg); + kvm_set_segment(vcpu, &kvm_seg, seg); return 0; } @@ -3335,17 +3405,16 @@ static void 
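The descriptor-table fix above addresses a unit mismatch: dtable.base is a guest virtual address, while kvm_read_guest()/kvm_write_guest() take guest physical addresses, so the GDT/LDT entry has to be translated through the active paging mode first. A generic helper in the same spirit; the UNMAPPED_GVA check is an addition here, the sketch ignores reads that cross a page boundary, and as the FIXME later in this patch notes, the real code does not yet inject a fault on failure:

static int read_guest_virt(struct kvm_vcpu *vcpu, gva_t gva,
			   void *data, unsigned int len)
{
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);

	if (gpa == UNMAPPED_GVA)
		return -EFAULT;	/* a #PF injection would go here */
	return kvm_read_guest(vcpu->kvm, gpa, data, len);
}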
save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { tss->cr3 = vcpu->arch.cr3; - tss->eip = vcpu->arch.rip; + tss->eip = kvm_rip_read(vcpu); tss->eflags = kvm_x86_ops->get_rflags(vcpu); - tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; - tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; - + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); @@ -3361,37 +3430,37 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, { kvm_set_cr3(vcpu, tss->cr3); - vcpu->arch.rip = tss->eip; + kvm_rip_write(vcpu, tss->eip); kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) return 1; - if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) return 1; - if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) return 1; - if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) return 1; - if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) return 1; - if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) return 1; - if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) return 1; return 0; } @@ -3399,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, static void save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - tss->ip = vcpu->arch.rip; + tss->ip = kvm_rip_read(vcpu); tss->flag = kvm_x86_ops->get_rflags(vcpu); - tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->dx = 
vcpu->arch.regs[VCPU_REGS_RDX]; - tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); @@ -3421,49 +3490,55 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, static int load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - vcpu->arch.rip = tss->ip; + kvm_rip_write(vcpu, tss->ip); kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; - - if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); + + if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) return 1; - if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) return 1; - if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) return 1; - if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) return 1; - if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) return 1; return 0; } -int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - struct desc_struct *cseg_desc, +static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, + u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_16 tss_segment_16; int ret = 0; - if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) goto out; save_state_to_tss16(vcpu, &tss_segment_16); - save_tss_segment16(vcpu, cseg_desc, &tss_segment_16); - if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_16, sizeof tss_segment_16)) + goto out; + if (load_state_from_tss16(vcpu, &tss_segment_16)) goto out; @@ -3472,21 +3547,27 @@ out: return ret; } -int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - struct desc_struct *cseg_desc, 
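kvm_task_switch_16() and ..._32() above now spell out the hardware ordering instead of hiding it behind the removed load/save_tss_segment helpers. Stripped of error handling, the 32-bit flavour is:

	/* condensed from kvm_task_switch_32() above */
	kvm_read_guest(vcpu->kvm, old_tss_base, &tss, sizeof tss);	/* old image */
	save_state_to_tss32(vcpu, &tss);				/* refresh it */
	kvm_write_guest(vcpu->kvm, old_tss_base, &tss, sizeof tss);	/* store back */
	kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
		       &tss, sizeof tss);				/* new image */
	load_state_from_tss32(vcpu, &tss);				/* switch */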
+static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, + u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_32 tss_segment_32; int ret = 0; - if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) goto out; save_state_to_tss32(vcpu, &tss_segment_32); - save_tss_segment32(vcpu, cseg_desc, &tss_segment_32); - if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_32, sizeof tss_segment_32)) + goto out; + if (load_state_from_tss32(vcpu, &tss_segment_32)) goto out; @@ -3501,16 +3582,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) struct desc_struct cseg_desc; struct desc_struct nseg_desc; int ret = 0; + u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); + u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - get_segment(vcpu, &tr_seg, VCPU_SREG_TR); + old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + /* FIXME: Handle errors. Failure to read either TSS or their + * descriptors should generate a pagefault. + */ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) goto out; - if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) + if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) goto out; - if (reason != TASK_SWITCH_IRET) { int cpl; @@ -3528,8 +3613,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { cseg_desc.type &= ~(1 << 1); //clear the B flag - save_guest_segment_descriptor(vcpu, tr_seg.selector, - &cseg_desc); + save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); } if (reason == TASK_SWITCH_IRET) { @@ -3538,13 +3622,12 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } kvm_x86_ops->skip_emulated_instruction(vcpu); - kvm_x86_ops->cache_regs(vcpu); if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, &nseg_desc); else - ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, &nseg_desc); if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { @@ -3561,9 +3644,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; - set_segment(vcpu, &tr_seg, VCPU_SREG_TR); + kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); out: - kvm_x86_ops->decache_regs(vcpu); return ret; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -3626,17 +3708,24 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, pr_debug("Set back pending irq %d\n", pending_vec); } + kvm_pic_clear_isr_ack(vcpu->kvm); } - set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + 
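kvm_task_switch() above also keys the descriptor bookkeeping off old_tss_sel instead of re-reading TR mid-switch: for IRET and JMP switches the outgoing TSS descriptor's busy bit (bit 1 of the type field) is cleared and the descriptor written back, so the old task can be switched to again later. That half in isolation:

	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
		cseg_desc.type &= ~(1 << 1);	/* old task: no longer busy */
		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
	}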
kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + /* Older userspace won't unhalt the vcpu on reset. */ + if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !(vcpu->arch.cr0 & X86_CR0_PE)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); @@ -3751,14 +3840,14 @@ void fx_init(struct kvm_vcpu *vcpu) * allocate ram with GFP_KERNEL. */ if (!used_math()) - fx_save(&vcpu->arch.host_fx_image); + kvm_fx_save(&vcpu->arch.host_fx_image); /* Initialize guest FPU by resetting ours and saving into guest's */ preempt_disable(); - fx_save(&vcpu->arch.host_fx_image); - fx_finit(); - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_finit(); + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); preempt_enable(); vcpu->arch.cr0 |= X86_CR0_ET; @@ -3775,8 +3864,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 1; - fx_save(&vcpu->arch.host_fx_image); - fx_restore(&vcpu->arch.guest_fx_image); + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_restore(&vcpu->arch.guest_fx_image); } EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); @@ -3786,8 +3875,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; } EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); @@ -3922,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); return kvm; } @@ -3954,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_iommu_unmap_guest(kvm); + kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -3979,16 +4071,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, */ if (!user_alloc) { if (npages && !old.rmap) { + unsigned long userspace_addr; + down_write(&current->mm->mmap_sem); - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); + userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + 0); up_write(&current->mm->mmap_sem); - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); + + /* set userspace_addr atomically for kvm_hva_to_rmapp */ + spin_lock(&kvm->mmu_lock); + memslot->userspace_addr = userspace_addr; + spin_unlock(&kvm->mmu_lock); } else { if (!old.user_alloc && old.rmap) { int ret; @@ -4016,6 +4115,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm, return 0; } +void kvm_arch_flush_shadow(struct kvm *kvm) +{ + kvm_mmu_zap_all(kvm); +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h new file mode 100644 index 000000000000..6a4be78a7384 --- /dev/null +++ b/arch/x86/kvm/x86.h @@
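The fx_save()/fx_restore() family above is renamed with a kvm_ prefix to keep the names out of the generic namespace; underneath they remain thin wrappers around fxsave/fxrstor on a 16-byte-aligned image. Roughly, as a sketch rather than the verbatim header code:

static inline void kvm_fx_save(struct i387_fxsave_struct *image)
{
	asm("fxsave (%0)" : : "r"(image) : "memory");
}

static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
{
	asm("fxrstor (%0)" : : "r"(image));
}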
-0,0 +1,22 @@ +#ifndef ARCH_X86_KVM_X86_H +#define ARCH_X86_KVM_X86_H + +#include <linux/kvm_host.h> + +static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.exception.pending = false; +} + +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) +{ + vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.nr = vector; +} + +static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.interrupt.pending = false; +} + +#endif diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 932f216d890c..ea051173b0da 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -26,6 +26,7 @@ #define DPRINTF(_f, _a ...) printf(_f , ## _a) #else #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" #define DPRINTF(x...) do {} while (0) #endif #include <linux/module.h> @@ -46,25 +47,26 @@ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ -#define DstMask (3<<1) +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstMask (7<<1) /* Source operand type. */ -#define SrcNone (0<<3) /* No source operand. */ -#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<3) /* Register operand. */ -#define SrcMem (2<<3) /* Memory operand. */ -#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ -#define SrcImm (5<<3) /* Immediate operand. */ -#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ -#define SrcMask (7<<3) +#define SrcNone (0<<4) /* No source operand. */ +#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<4) /* Register operand. */ +#define SrcMem (2<<4) /* Memory operand. */ +#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ +#define SrcImm (5<<4) /* Immediate operand. */ +#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<4) /* Generic ModRM decode. */ -#define ModRM (1<<6) +#define ModRM (1<<7) /* Destination is only written; never read. 
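In the x86_emulate.c flag rework above, adding DstAcc costs a third destination bit, so DstMask grows from (3<<1) to (7<<1) and the whole Src field shifts up one position (SrcMask moves from (7<<3) to (7<<4)). Extraction code is unaffected because it only ever masks:

	unsigned int dst = c->d & DstMask;	/* now bits 1-3 */
	unsigned int src = c->d & SrcMask;	/* now bits 4-6 */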
*/ -#define Mov (1<<7) -#define BitOp (1<<8) -#define MemAbs (1<<9) /* Memory operand is absolute displacement */ -#define String (1<<10) /* String instruction (rep capable) */ -#define Stack (1<<11) /* Stack instruction (push/pop) */ +#define Mov (1<<8) +#define BitOp (1<<9) +#define MemAbs (1<<10) /* Memory operand is absolute displacement */ +#define String (1<<12) /* String instruction (rep capable) */ +#define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ @@ -94,7 +96,7 @@ static u16 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, + DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -106,7 +108,8 @@ static u16 opcode_table[256] = { /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + 0, 0, /* 0x40 - 0x47 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x48 - 0x4F */ @@ -121,7 +124,7 @@ static u16 opcode_table[256] = { 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 0, 0, 0, 0, /* 0x68 - 0x6F */ - 0, 0, ImplicitOps | Mov | Stack, 0, + SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ @@ -138,9 +141,11 @@ static u16 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, Group | Group1A, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, + DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, + DstReg | SrcMem | ModRM | Mov, Group | Group1A, + /* 0x90 - 0x97 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x98 - 0x9F */ 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, @@ -151,8 +156,16 @@ static u16 opcode_table[256] = { 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + /* 0xB8 - 0xBF */ + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, /* 0xC0 - 0xC7 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, ImplicitOps | Stack, 0, 0, @@ -166,16 +179,20 @@ static u16 opcode_table[256] = { /* 0xD8 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE8 
- 0xEF */ - ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + /* 0xE8 - 0xEF */ + ImplicitOps | Stack, SrcImm | ImplicitOps, + ImplicitOps, SrcImmByte | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, Group | Group4, Group | Group5, + ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; static u16 twobyte_table[256] = { @@ -215,7 +232,7 @@ static u16 twobyte_table[256] = { /* 0xA0 - 0xA7 */ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, /* 0xB0 - 0xB7 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, DstMem | SrcReg | ModRM | BitOp, @@ -264,15 +281,16 @@ static u16 group_table[] = { ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM | SrcImm, 0, - DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + DstMem | SrcImm | ModRM, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, - SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem, ModRM | SrcMem, SrcNone | ModRM | DstMem | Mov, 0, @@ -518,6 +536,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel) register_address_increment(c, &c->eip, rel); } +static void set_seg_override(struct decode_cache *c, int seg) +{ + c->has_seg_override = true; + c->seg_override = seg; +} + +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +{ + if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) + return 0; + + return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); +} + +static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct decode_cache *c) +{ + if (!c->has_seg_override) + return 0; + + return seg_base(ctxt, c->seg_override); +} + +static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_ES); +} + +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_SS); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long linear, u8 *dest) @@ -660,7 +711,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; u8 sib; - int index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int index_reg = 0, base_reg = 0, scale; int rc = 0; if (c->rex_prefix) { @@ -731,47 +782,28 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->override_base) - c->override_base = &ctxt->ss_base; + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_SS); c->modrm_ea = (u16)c->modrm_ea; } else { /* 32/64-bit ModR/M decode. 
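The new set_seg_override()/seg_base() helpers above replace the old pointer-to-base fields with a segment index plus a has_seg_override flag, resolving the base lazily; note that seg_base() forces a zero base in long mode for everything below FS, which matches how 64-bit linear addresses are formed. The effective-to-linear step is then simply:

	/* linear address of a decoded memory operand, per the helpers above */
	unsigned long linear = seg_override_base(ctxt, c) + c->modrm_ea;
	/* in long mode only FS/GS contribute; CS/DS/ES/SS bases read as 0 */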
*/ - switch (c->modrm_rm) { - case 4: - case 12: + if ((c->modrm_rm & 7) == 4) { sib = insn_fetch(u8, 1, c->eip); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; - switch (base_reg) { - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[base_reg]; - else - c->modrm_ea += - insn_fetch(s32, 4, c->eip); - break; - default: + if ((base_reg & 7) == 5 && c->modrm_mod == 0) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + else c->modrm_ea += c->regs[base_reg]; - } - switch (index_reg) { - case 4: - break; - default: + if (index_reg != 4) c->modrm_ea += c->regs[index_reg] << scale; - } - break; - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[c->modrm_rm]; - else if (ctxt->mode == X86EMUL_MODE_PROT64) - rip_relative = 1; - break; - default: + } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { + if (ctxt->mode == X86EMUL_MODE_PROT64) + c->rip_relative = 1; + } else c->modrm_ea += c->regs[c->modrm_rm]; - break; - } switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) @@ -785,22 +817,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, break; } } - if (rip_relative) { - c->modrm_ea += c->eip; - switch (c->d & SrcMask) { - case SrcImmByte: - c->modrm_ea += 1; - break; - case SrcImm: - if (c->d & ByteOp) - c->modrm_ea += 1; - else - if (c->op_bytes == 8) - c->modrm_ea += 4; - else - c->modrm_ea += c->op_bytes; - } - } done: return rc; } @@ -837,7 +853,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); + ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { @@ -876,23 +893,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* switch between 2/4 bytes */ c->ad_bytes = def_ad_bytes ^ 6; break; + case 0x26: /* ES override */ case 0x2e: /* CS override */ - c->override_base = &ctxt->cs_base; - break; + case 0x36: /* SS override */ case 0x3e: /* DS override */ - c->override_base = &ctxt->ds_base; - break; - case 0x26: /* ES override */ - c->override_base = &ctxt->es_base; + set_seg_override(c, (c->b >> 3) & 3); break; case 0x64: /* FS override */ - c->override_base = &ctxt->fs_base; - break; case 0x65: /* GS override */ - c->override_base = &ctxt->gs_base; - break; - case 0x36: /* SS override */ - c->override_base = &ctxt->ss_base; + set_seg_override(c, c->b & 7); break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -964,15 +973,11 @@ done_prefixes: if (rc) goto done; - if (!c->override_base) - c->override_base = &ctxt->ds_base; - if (mode == X86EMUL_MODE_PROT64 && - c->override_base != &ctxt->fs_base && - c->override_base != &ctxt->gs_base) - c->override_base = NULL; + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); - if (c->override_base) - c->modrm_ea += *c->override_base; + if (!(!c->twobyte && c->b == 0x8d)) + c->modrm_ea += seg_override_base(ctxt, c); if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; @@ -1049,6 +1054,7 @@ done_prefixes: break; case DstMem: if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.bytes = (c->d & ByteOp) ? 
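The rewritten 32/64-bit ModRM path above collapses the old case 4:/case 12: duplication by masking the register number with 7, so the REX-extended encodings share one branch. The SIB byte fields it consumes are:

	u8 sib = insn_fetch(u8, 1, c->eip);
	int scale = sib >> 6;		/* bits 7:6: scale index by 1/2/4/8 */
	int index = (sib >> 3) & 7;	/* bits 5:3: index 4 means no index */
	int base = sib & 7;		/* bits 2:0: base 5 with mod=00 is disp32 */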
1 : c->op_bytes; c->dst.type = OP_REG; c->dst.val = c->dst.orig_val = c->modrm_val; c->dst.ptr = c->modrm_ptr; @@ -1056,8 +1062,28 @@ done_prefixes: } c->dst.type = OP_MEM; break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->op_bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; } + if (c->rip_relative) + c->modrm_ea += c->eip; + done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } @@ -1070,7 +1096,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ctxt->ss_base, + c->dst.ptr = (void *) register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); } @@ -1080,7 +1106,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_std(register_address(c, ctxt->ss_base, + rc = ops->read_std(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), &c->dst.val, c->dst.bytes, ctxt->vcpu); if (rc != 0) @@ -1156,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, case 1: /* dec */ emulate_1op("dec", c->dst, ctxt->eflags); break; + case 2: /* call near abs */ { + long int old_eip; + old_eip = c->eip; + c->eip = c->src.val; + c->src.val = old_eip; + emulate_push(ctxt); + break; + } case 4: /* jmp abs */ c->eip = c->src.val; break; @@ -1256,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) u64 msr_data; unsigned long saved_eip = 0; struct decode_cache *c = &ctxt->decode; + unsigned int port; + int io_dir_in; int rc = 0; /* Shadow copy of register state. Committed on successful emulation. @@ -1272,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (c->regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } /* The second termination condition only applies for REPE @@ -1286,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) (c->b == 0xae) || (c->b == 0xaf)) { if ((c->rep_prefix == REPE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == 0)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } if ((c->rep_prefix == REPNE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } } c->regs[VCPU_REGS_RCX]--; - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } if (c->src.type == OP_MEM) { @@ -1356,27 +1392,10 @@ special_insn: sbb: /* sbb */ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; - case 0x20 ... 0x23: + case 0x20 ... 
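The REP handling visible above checks up to three termination conditions per iteration. Pulled out into one predicate for clarity (a restatement of the logic above; the EFLG_ZF tests apply only to cmps/scas):

static int rep_done(struct decode_cache *c, unsigned long eflags)
{
	if (c->regs[VCPU_REGS_RCX] == 0)
		return 1;		/* count register exhausted */
	if (c->rep_prefix == REPE_PREFIX && !(eflags & EFLG_ZF))
		return 1;		/* repe/repz stops on ZF = 0 */
	if (c->rep_prefix == REPNE_PREFIX && (eflags & EFLG_ZF))
		return 1;		/* repne/repnz stops on ZF = 1 */
	return 0;
}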
0x25: and: /* and */ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); break; - case 0x24: /* and al imm8 */ - c->dst.type = OP_REG; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - c->dst.val = *(u8 *)c->dst.ptr; - c->dst.bytes = 1; - c->dst.orig_val = c->dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - if (c->op_bytes == 2) - c->dst.val = *(u16 *)c->dst.ptr; - else - c->dst.val = *(u32 *)c->dst.ptr; - c->dst.orig_val = c->dst.val; - goto and; case 0x28 ... 0x2d: sub: /* sub */ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); @@ -1402,11 +1421,11 @@ special_insn: register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); c->dst.ptr = (void *) register_address( - c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); + c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - if ((rc = ops->read_std(register_address(c, ctxt->ss_base, + if ((rc = ops->read_std(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), c->dst.ptr, c->op_bytes, ctxt->vcpu)) != 0) goto done; @@ -1420,9 +1439,8 @@ special_insn: goto cannot_emulate; c->dst.val = (s32) c->src.val; break; + case 0x68: /* push imm */ case 0x6a: /* push imm8 */ - c->src.val = 0L; - c->src.val = insn_fetch(s8, 1, c->eip); emulate_push(ctxt); break; case 0x6c: /* insb */ @@ -1433,7 +1451,7 @@ special_insn: c->rep_prefix ? address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, (ctxt->eflags & EFLG_DF), - register_address(c, ctxt->es_base, + register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]), c->rep_prefix, c->regs[VCPU_REGS_RDX]) == 0) { @@ -1449,9 +1467,8 @@ special_insn: c->rep_prefix ? address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, (ctxt->eflags & EFLG_DF), - register_address(c, c->override_base ? - *c->override_base : - ctxt->ds_base, + register_address(c, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]), c->rep_prefix, c->regs[VCPU_REGS_RDX]) == 0) { @@ -1490,6 +1507,7 @@ special_insn: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 0x86 ... 0x87: /* xchg */ + xchg: /* Write back the register source. */ switch (c->dst.bytes) { case 1: @@ -1514,14 +1532,60 @@ special_insn: break; case 0x88 ... 0x8b: /* mov */ goto mov; + case 0x8c: { /* mov r/m, sreg */ + struct kvm_segment segreg; + + if (c->modrm_reg <= 5) + kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); + else { + printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + c->dst.val = segreg.selector; + break; + } case 0x8d: /* lea r16/r32, m */ c->dst.val = c->modrm_ea; break; + case 0x8e: { /* mov seg, r/m16 */ + uint16_t sel; + int type_bits; + int err; + + sel = c->src.val; + if (c->modrm_reg <= 5) { + type_bits = (c->modrm_reg == 1) ? 9 : 1; + err = kvm_load_segment_descriptor(ctxt->vcpu, sel, + type_bits, c->modrm_reg); + } else { + printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + + if (err < 0) + goto cannot_emulate; + + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); if (rc != 0) goto done; break; + case 0x90: /* nop / xchg r8,rax */ + if (!(c->rex_prefix & 1)) { /* nop */ + c->dst.type = OP_NONE; + break; + } + case 0x91 ... 
0x97: /* xchg reg,rax */ + c->src.type = c->dst.type = OP_REG; + c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; + c->src.val = *(c->src.ptr); + goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; emulate_push(ctxt); @@ -1540,11 +1604,10 @@ special_insn: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)register_address(c, - ctxt->es_base, + es_base(ctxt), c->regs[VCPU_REGS_RDI]); if ((rc = ops->read_emulated(register_address(c, - c->override_base ? *c->override_base : - ctxt->ds_base, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]), &c->dst.val, c->dst.bytes, ctxt->vcpu)) != 0) @@ -1560,8 +1623,7 @@ special_insn: c->src.type = OP_NONE; /* Disable writeback. */ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.ptr = (unsigned long *)register_address(c, - c->override_base ? *c->override_base : - ctxt->ds_base, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); if ((rc = ops->read_emulated((unsigned long)c->src.ptr, &c->src.val, @@ -1572,7 +1634,7 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)register_address(c, - ctxt->es_base, + es_base(ctxt), c->regs[VCPU_REGS_RDI]); if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, @@ -1596,7 +1658,7 @@ special_insn: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)register_address(c, - ctxt->es_base, + es_base(ctxt), c->regs[VCPU_REGS_RDI]); c->dst.val = c->regs[VCPU_REGS_RAX]; register_address_increment(c, &c->regs[VCPU_REGS_RDI], @@ -1608,8 +1670,7 @@ special_insn: c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; if ((rc = ops->read_emulated(register_address(c, - c->override_base ? *c->override_base : - ctxt->ds_base, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]), &c->dst.val, c->dst.bytes, @@ -1622,6 +1683,8 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; + case 0xb0 ... 0xbf: /* mov r, imm */ + goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; @@ -1640,6 +1703,16 @@ special_insn: c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe4: /* inb */ + case 0xe5: /* in */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 1; + goto do_io; + case 0xe6: /* outb */ + case 0xe7: /* out */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 0; + goto do_io; case 0xe8: /* call (near) */ { long int rel; switch (c->op_bytes) { @@ -1660,13 +1733,55 @@ special_insn: break; } case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ + goto jmp; + case 0xea: /* jmp far */ { + uint32_t eip; + uint16_t sel; + + switch (c->op_bytes) { + case 2: + eip = insn_fetch(u16, 2, c->eip); + break; + case 4: + eip = insn_fetch(u32, 4, c->eip); + break; + default: + DPRINTF("jmp far: Invalid op_bytes\n"); + goto cannot_emulate; + } + sel = insn_fetch(u16, 2, c->eip); + if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) { + DPRINTF("jmp far: Failed to load CS descriptor\n"); + goto cannot_emulate; + } + + c->eip = eip; + break; + } + case 0xeb: + jmp: /* jmp rel short */ jmp_rel(c, c->src.val); c->dst.type = OP_NONE; /* Disable writeback. 
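Editorial note: the newly added port-I/O cases (0xe4-0xe7 above, with the DX-based forms 0xec-0xef completed just below) all reduce an opcode to a (port, direction) pair before jumping to one shared do_io tail. A minimal stand-alone sketch of that dispatch shape, with hypothetical names (do_pio stands in for kvm_emulate_pio):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for kvm_emulate_pio(): perform one port access. */
static int do_pio(int in, unsigned int size, uint16_t port)
{
	printf("%s port 0x%x, %u byte(s)\n", in ? "in" : "out",
	       (unsigned int)port, size);
	return 0;
}

/* All eight in/out opcode forms reduce to (port, direction) before one
 * shared tail, mirroring the emulator's goto do_io dispatch. */
static int dispatch_io(uint8_t opcode, uint8_t imm8, uint16_t dx,
		       unsigned int op_bytes)
{
	uint16_t port;
	int io_dir_in;

	switch (opcode) {
	case 0xe4: case 0xe5: port = imm8; io_dir_in = 1; break; /* in  imm8 */
	case 0xe6: case 0xe7: port = imm8; io_dir_in = 0; break; /* out imm8 */
	case 0xec: case 0xed: port = dx;   io_dir_in = 1; break; /* in  dx   */
	case 0xee: case 0xef: port = dx;   io_dir_in = 0; break; /* out dx   */
	default: return -1;
	}
	/* even opcodes are the byte forms, odd ones use the operand size */
	return do_pio(io_dir_in, (opcode & 1) ? op_bytes : 1, port);
}

int main(void)
{
	return dispatch_io(0xe4, 0x60, 0, 4) | dispatch_io(0xef, 0, 0x3f8, 2);
}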
*/ break; + case 0xec: /* in al,dx */ + case 0xed: /* in (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 1; + goto do_io; + case 0xee: /* out al,dx */ + case 0xef: /* out (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 0; + do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + (c->d & ByteOp) ? 1 : c->op_bytes, + port) != 0) { + c->eip = saved_eip; + goto cannot_emulate; + } + return 0; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; - goto done; + break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; @@ -1689,6 +1804,14 @@ special_insn: ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ break; + case 0xfc: /* cld */ + ctxt->eflags &= ~EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfd: /* std */ + ctxt->eflags |= EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; case 0xfe ... 0xff: /* Grp4/Grp5 */ rc = emulate_grp45(ctxt, ops); if (rc != 0) @@ -1703,7 +1826,7 @@ writeback: /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); done: if (rc == X86EMUL_UNHANDLEABLE) { @@ -1728,7 +1851,7 @@ twobyte_insn: goto done; /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -1824,7 +1947,7 @@ twobyte_insn: rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; @@ -1834,7 +1957,7 @@ twobyte_insn: rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RDX] = msr_data >> 32; @@ -1882,6 +2005,8 @@ twobyte_insn: c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; + case 0xae: /* clflush */ + break; case 0xb0 ... 0xb1: /* cmpxchg */ /* * Save real source value, then compare EAX against diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 50dad44fb542..48ee4f9435f4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -55,6 +55,7 @@ #include <linux/lguest_launcher.h> #include <linux/virtio_console.h> #include <linux/pm.h> +#include <asm/apic.h> #include <asm/lguest.h> #include <asm/paravirt.h> #include <asm/param.h> @@ -581,7 +582,7 @@ static void __init lguest_init_IRQ(void) for (i = 0; i < LGUEST_IRQS; i++) { int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[i]); + set_intr_gate(vector, interrupt[vector]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); @@ -783,14 +784,44 @@ static void lguest_wbinvd(void) * code qualifies for Advanced. It will also never interrupt anything. It * does, however, allow us to get through the Linux boot code. 
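Editorial note: lguest's "Advanced in name only" APIC is a good illustration of the apic_ops indirection introduced by this series: every operation is satisfied by a harmless stub behind a table of function pointers. A toy, self-contained model of the pattern (names here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct toy_apic_ops {
	uint32_t (*read)(uint32_t reg);
	void (*write)(uint32_t reg, uint32_t v);
};

static uint32_t stub_read(uint32_t reg) { (void)reg; return 0; }
static void stub_write(uint32_t reg, uint32_t v) { (void)reg; (void)v; }

/* A guest with no real local APIC answers every call with a no-op. */
static const struct toy_apic_ops noop_apic = {
	.read = stub_read,
	.write = stub_write,
};

int main(void)
{
	noop_apic.write(0x80, 0); /* e.g. a TPR write: silently ignored */
	printf("read -> %u\n", (unsigned int)noop_apic.read(0x20));
	return 0;
}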
*/ #ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(unsigned long reg, u32 v) +static void lguest_apic_write(u32 reg, u32 v) { } -static u32 lguest_apic_read(unsigned long reg) +static u32 lguest_apic_read(u32 reg) { return 0; } + +static u64 lguest_apic_icr_read(void) +{ + return 0; +} + +static void lguest_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static void lguest_apic_wait_icr_idle(void) +{ + return; +} + +static u32 lguest_apic_safe_wait_icr_idle(void) +{ + return 0; +} + +static struct apic_ops lguest_basic_apic_ops = { + .read = lguest_apic_read, + .write = lguest_apic_write, + .icr_read = lguest_apic_icr_read, + .icr_write = lguest_apic_icr_write, + .wait_icr_idle = lguest_apic_wait_icr_idle, + .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle, +}; #endif /* STOP! Until an interrupt comes in. */ @@ -990,9 +1021,7 @@ __init void lguest_init(void) #ifdef CONFIG_X86_LOCAL_APIC /* apic read/write intercepts */ - pv_apic_ops.apic_write = lguest_apic_write; - pv_apic_ops.apic_write_atomic = lguest_apic_write; - pv_apic_ops.apic_read = lguest_apic_read; + apic_ops = &lguest_basic_apic_ops; #endif /* time operations */ @@ -1015,6 +1044,9 @@ __init void lguest_init(void) init_pg_tables_start = __pa(pg0); init_pg_tables_end = __pa(pg0); + /* As described in head_32.S, we map the first 128M of memory. */ + max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; + /* Load the %fs segment register (the per-cpu segment register) with * the normal data segment to get through booting. */ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index aa3fa4119424..55e11aa6d66c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y) lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else obj-y += io_64.o iomap_copy_64.o - - CFLAGS_csum-partial_64.o := -funroll-loops - lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dfdf428975c0..f118c110af32 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -52,7 +52,7 @@ jnz 100b 102: .section .fixup,"ax" -103: addl %r8d,%edx /* ecx is zerorest also */ +103: addl %ecx,%edx /* ecx is zerorest also */ jmp copy_user_handle_tail .previous diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 40e0e309d27e..cb0c112386fb 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -32,7 +32,7 @@ jnz 100b 102: .section .fixup,"ax" -103: addl %r8d,%edx /* ecx is zerorest also */ +103: addl %ecx,%edx /* ecx is zerorest also */ jmp copy_user_handle_tail .previous @@ -108,7 +108,6 @@ ENTRY(__copy_user_nocache) jmp 60f 50: movl %ecx,%edx 60: sfence - movl %r8d,%ecx jmp copy_user_handle_tail .previous diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index d5a2b39f882b..321cf720dbb6 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c @@ -16,36 +16,46 @@ static void __rdmsr_on_cpu(void *info) rdmsr(rv->msr_no, rv->l, rv->h); } -static void __rdmsr_safe_on_cpu(void *info) +static void __wrmsr_on_cpu(void *info) { struct msr_info *rv = info; - rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); + wrmsr(rv->msr_no, rv->l, rv->h); } -static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, 
int safe) +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { - int err = 0; + int err; struct msr_info rv; rv.msr_no = msr_no; - if (safe) { - smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - err = rv.err; - } else { - smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); - } + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *l = rv.l; *h = rv.h; return err; } -static void __wrmsr_on_cpu(void *info) +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + rv.msr_no = msr_no; + rv.l = l; + rv.h = h; + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} + +/* These "safe" variants are slower and should be used when the target MSR + may not actually exist. */ +static void __rdmsr_safe_on_cpu(void *info) { struct msr_info *rv = info; - wrmsr(rv->msr_no, rv->l, rv->h); + rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); } static void __wrmsr_safe_on_cpu(void *info) @@ -55,44 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info) rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); } -static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { - int err = 0; + int err; struct msr_info rv; rv.msr_no = msr_no; - rv.l = l; - rv.h = h; - if (safe) { - smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); - err = rv.err; - } else { - smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); - } - - return err; -} - -void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - _wrmsr_on_cpu(cpu, msr_no, l, h, 0); -} + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *l = rv.l; + *h = rv.h; -void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - _rdmsr_on_cpu(cpu, msr_no, l, h, 0); + return err ? err : rv.err; } -/* These "safe" variants are slower and should be used when the target MSR - may not actually exist. */ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { - return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); -} + int err; + struct msr_info rv; -int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); + rv.msr_no = msr_no; + rv.l = l; + rv.h = h; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? 
err : rv.err; } EXPORT_SYMBOL(rdmsr_on_cpu); diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 94972e7c094d..82004d2bf05e 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src) "testb %%al,%%al\n\t" "jne 1b" : "=&S" (d0), "=&D" (d1), "=&a" (d2) - :"0" (src), "1" (dest) : "memory"); + : "0" (src), "1" (dest) : "memory"); return dest; } EXPORT_SYMBOL(strcpy); @@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count) "stosb\n" "2:" : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) - :"0" (src), "1" (dest), "2" (count) : "memory"); + : "0" (src), "1" (dest), "2" (count) : "memory"); return dest; } EXPORT_SYMBOL(strncpy); @@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src) "testb %%al,%%al\n\t" "jne 1b" : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) - : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory"); return dest; } EXPORT_SYMBOL(strcat); @@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct) "2:\tsbbl %%eax,%%eax\n\t" "orb $1,%%al\n" "3:" - :"=a" (res), "=&S" (d0), "=&D" (d1) - :"1" (cs), "2" (ct) - :"memory"); + : "=a" (res), "=&S" (d0), "=&D" (d1) + : "1" (cs), "2" (ct) + : "memory"); return res; } EXPORT_SYMBOL(strcmp); @@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count) "3:\tsbbl %%eax,%%eax\n\t" "orb $1,%%al\n" "4:" - :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) - :"1" (cs), "2" (ct), "3" (count) - :"memory"); + : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + : "1" (cs), "2" (ct), "3" (count) + : "memory"); return res; } EXPORT_SYMBOL(strncmp); @@ -152,9 +152,9 @@ char *strchr(const char *s, int c) "movl $1,%1\n" "2:\tmovl %1,%0\n\t" "decl %0" - :"=a" (res), "=&S" (d0) - :"1" (s), "0" (c) - :"memory"); + : "=a" (res), "=&S" (d0) + : "1" (s), "0" (c) + : "memory"); return res; } EXPORT_SYMBOL(strchr); @@ -169,9 +169,9 @@ size_t strlen(const char *s) "scasb\n\t" "notl %0\n\t" "decl %0" - :"=c" (res), "=&D" (d0) - :"1" (s), "a" (0), "0" (0xffffffffu) - :"memory"); + : "=c" (res), "=&D" (d0) + : "1" (s), "a" (0), "0" (0xffffffffu) + : "memory"); return res; } EXPORT_SYMBOL(strlen); @@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count) "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" - :"=D" (res), "=&c" (d0) - :"a" (c), "0" (cs), "1" (count) - :"memory"); + : "=D" (res), "=&c" (d0) + : "a" (c), "0" (cs), "1" (count) + : "memory"); return res; } EXPORT_SYMBOL(memchr); @@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count) "cmpl $-1,%1\n\t" "jne 1b\n" "3:\tsubl %2,%0" - :"=a" (res), "=&d" (d0) - :"c" (s), "1" (count) - :"memory"); + : "=a" (res), "=&d" (d0) + : "c" (s), "1" (count) + : "memory"); return res; } EXPORT_SYMBOL(strnlen); diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index 42e8a50303f3..8e2d55f754bf 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c @@ -23,9 +23,9 @@ __asm__ __volatile__( "jne 1b\n\t" "xorl %%eax,%%eax\n\t" "2:" - :"=a" (__res), "=&c" (d0), "=&S" (d1) - :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) - :"dx", "di"); + : "=a" (__res), "=&c" (d0), "=&S" (d1) + : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) + : "dx", "di"); return __res; } diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 24e60944971a..9e68075544f6 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -14,6 +14,13 @@ #include <asm/uaccess.h> #include <asm/mmx.h> 
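Editorial note: the msr-on-cpu rework a little above drops the _rdmsr_on_cpu/_wrmsr_on_cpu multiplexers in favour of four straight-line functions sharing one calling convention: pack the arguments into struct msr_info, run the helper on the target CPU, and let the cross-call error take precedence over the MSR access's own error (err ? err : rv.err). A stand-alone model of that convention, with run_on_cpu as a hypothetical stand-in for smp_call_function_single:

#include <stdint.h>
#include <stdio.h>

struct msr_info {
	uint32_t msr_no;
	uint32_t l, h;
	int err;
};

/* Runs on the "target CPU": a #GP-safe read reporting its own error. */
static void remote_rdmsr_safe(void *info)
{
	struct msr_info *rv = info;

	rv->l = 0xfee00800;	/* pretend MSR contents (low)  */
	rv->h = 0;		/* pretend MSR contents (high) */
	rv->err = 0;		/* the access itself succeeded */
}

/* Stand-in for smp_call_function_single(): nonzero if the target CPU
 * could not be reached at all (e.g. offline). */
static int run_on_cpu(int cpu, void (*fn)(void *), void *info)
{
	(void)cpu;
	fn(info);
	return 0;
}

static int toy_rdmsr_safe_on_cpu(int cpu, uint32_t msr,
				 uint32_t *l, uint32_t *h)
{
	struct msr_info rv = { .msr_no = msr };
	int err = run_on_cpu(cpu, remote_rdmsr_safe, &rv);

	*l = rv.l;
	*h = rv.h;
	return err ? err : rv.err;	/* transport error wins */
}

int main(void)
{
	uint32_t l, h;
	int err = toy_rdmsr_safe_on_cpu(1, 0x1b, &l, &h);

	printf("err=%d l=0x%x h=0x%x\n", err, (unsigned int)l, (unsigned int)h);
	return 0;
}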
+#ifdef CONFIG_X86_INTEL_USERCOPY +/* + * Alignment at which movsl is preferred for bulk memory copies. + */ +struct movsl_mask movsl_mask __read_mostly; +#endif + static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) { #ifdef CONFIG_X86_INTEL_USERCOPY diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 48278fa7d3de..37b9ae4d44c5 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -10,13 +10,7 @@ #include <asm/e820.h> #include <asm/setup.h> -/* - * Any quirks to be performed to initialize timers/irqs/etc? - */ -int (*arch_time_init_quirk)(void); -int (*arch_pre_intr_init_quirk)(void); -int (*arch_intr_init_quirk)(void); -int (*arch_trap_init_quirk)(void); +#include <mach_ipi.h> #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) @@ -24,7 +18,7 @@ int (*arch_trap_init_quirk)(void); #define DEFAULT_SEND_IPI (0) #endif -int no_broadcast=DEFAULT_SEND_IPI; +int no_broadcast = DEFAULT_SEND_IPI; /** * pre_intr_init_hook - initialisation prior to setting up interrupt vectors @@ -37,22 +31,13 @@ int no_broadcast=DEFAULT_SEND_IPI; **/ void __init pre_intr_init_hook(void) { - if (arch_pre_intr_init_quirk) { - if (arch_pre_intr_init_quirk()) + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) return; } init_ISA_irqs(); } -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; - /** * intr_init_hook - post gate setup interrupt initialisation * @@ -64,16 +49,10 @@ static struct irqaction irq2 = { **/ void __init intr_init_hook(void) { - if (arch_intr_init_quirk) { - if (arch_intr_init_quirk()) + if (x86_quirks->arch_intr_init) { + if (x86_quirks->arch_intr_init()) return; } -#ifdef CONFIG_X86_LOCAL_APIC - apic_intr_init(); -#endif - - if (!acpi_ioapic) - setup_irq(2, &irq2); } /** @@ -97,8 +76,8 @@ void __init pre_setup_arch_hook(void) **/ void __init trap_init_hook(void) { - if (arch_trap_init_quirk) { - if (arch_trap_init_quirk()) + if (x86_quirks->arch_trap_init) { + if (x86_quirks->arch_trap_init()) return; } } @@ -111,6 +90,16 @@ static struct irqaction irq0 = { }; /** + * pre_time_init_hook - do any specific initialisations before. + * + **/ +void __init pre_time_init_hook(void) +{ + if (x86_quirks->arch_pre_time_init) + x86_quirks->arch_pre_time_init(); +} + +/** * time_init_hook - do any specific initialisations for the system timer. * * Description: @@ -119,13 +108,13 @@ static struct irqaction irq0 = { **/ void __init time_init_hook(void) { - if (arch_time_init_quirk) { + if (x86_quirks->arch_time_init) { /* * A nonzero return code does not mean failure, it means * that the architecture quirk does not want any * generic (timer) setup to be performed after this: */ - if (arch_time_init_quirk()) + if (x86_quirks->arch_time_init()) return; } diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile deleted file mode 100644 index 3ef8b43b62fc..000000000000 --- a/arch/x86/mach-es7000/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the linux kernel. 
-# - -obj-$(CONFIG_X86_ES7000) := es7000plat.o diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h deleted file mode 100644 index c8d5aa132fa0..000000000000 --- a/arch/x86/mach-es7000/es7000.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - * Natalie Protasevich, Unisys Corporation - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -/* - * ES7000 chipsets - */ - -#define NON_UNISYS 0 -#define ES7000_CLASSIC 1 -#define ES7000_ZORRO 2 - - -#define MIP_REG 1 -#define MIP_PSAI_REG 4 - -#define MIP_BUSY 1 -#define MIP_SPIN 0xf0000 -#define MIP_VALID 0x0100000000000000ULL -#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) - -#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) - -struct mip_reg_info { - unsigned long long mip_info; - unsigned long long delivery_info; - unsigned long long host_reg; - unsigned long long mip_reg; -}; - -struct part_info { - unsigned char type; - unsigned char length; - unsigned char part_id; - unsigned char apic_mode; - unsigned long snum; - char ptype[16]; - char sname[64]; - char pname[64]; -}; - -struct psai { - unsigned long long entry_type; - unsigned long long addr; - unsigned long long bep_addr; -}; - -struct es7000_mem_info { - unsigned char type; - unsigned char length; - unsigned char resv[6]; - unsigned long long start; - unsigned long long size; -}; - -struct es7000_oem_table { - unsigned long long hdr; - struct mip_reg_info mip; - struct part_info pif; - struct es7000_mem_info shm; - struct psai psai; -}; - -#ifdef CONFIG_ACPI - -struct oem_table { - struct acpi_table_header Header; - u32 OEMTableAddr; - u32 OEMTableSize; -}; - -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); -#endif - -struct mip_reg { - unsigned long long off_0; - unsigned long long off_8; - unsigned long long off_10; - unsigned long long off_18; - unsigned long long off_20; - unsigned long long off_28; - unsigned long long off_30; - unsigned long long off_38; -}; - -#define MIP_SW_APIC 0x1020b -#define MIP_FUNC(VALUE) (VALUE & 0xff) - -extern int parse_unisys_oem (char *oemptr); -extern void setup_unisys(void); -extern int es7000_start_cpu(int cpu, unsigned long eip); -extern void es7000_sw_apic(void); diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile index 0dbd7803a1d5..6730f4e7c744 100644 --- a/arch/x86/mach-generic/Makefile +++ b/arch/x86/mach-generic/Makefile @@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT) += summit.o obj-$(CONFIG_X86_BIGSMP) += bigsmp.o obj-$(CONFIG_X86_ES7000) += es7000.o -obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/ diff --git 
a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 59d771714559..3c3b471ea496 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -5,18 +5,17 @@ #define APIC_DEFINITION 1 #include <linux/threads.h> #include <linux/cpumask.h> -#include <asm/smp.h> #include <asm/mpspec.h> #include <asm/genapic.h> #include <asm/fixmap.h> #include <asm/apicdef.h> #include <linux/kernel.h> -#include <linux/smp.h> #include <linux/init.h> #include <linux/dmi.h> -#include <asm/mach-bigsmp/mach_apic.h> -#include <asm/mach-bigsmp/mach_apicdef.h> -#include <asm/mach-bigsmp/mach_ipi.h> +#include <asm/bigsmp/apicdef.h> +#include <linux/smp.h> +#include <asm/bigsmp/apic.h> +#include <asm/bigsmp/ipi.h> #include <asm/mach-default/mach_mpparse.h> static int dmi_bigsmp; /* can be set by dmi scanners */ @@ -42,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } }; +static cpumask_t vector_allocation_domain(int cpu) +{ + return cpumask_of_cpu(cpu); +} static int probe_bigsmp(void) { diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 4742626f08c4..28459cab3ddb 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -4,20 +4,19 @@ #define APIC_DEFINITION 1 #include <linux/threads.h> #include <linux/cpumask.h> -#include <asm/smp.h> #include <asm/mpspec.h> #include <asm/genapic.h> #include <asm/fixmap.h> #include <asm/apicdef.h> #include <linux/kernel.h> #include <linux/string.h> -#include <linux/smp.h> #include <linux/init.h> -#include <asm/mach-es7000/mach_apicdef.h> -#include <asm/mach-es7000/mach_apic.h> -#include <asm/mach-es7000/mach_ipi.h> -#include <asm/mach-es7000/mach_mpparse.h> -#include <asm/mach-es7000/mach_wakecpu.h> +#include <asm/es7000/apicdef.h> +#include <linux/smp.h> +#include <asm/es7000/apic.h> +#include <asm/es7000/ipi.h> +#include <asm/es7000/mpparse.h> +#include <asm/es7000/wakecpu.h> static int probe_es7000(void) { @@ -48,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem, /* Hook from generic ACPI tables.c */ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - unsigned long oem_addr; + unsigned long oem_addr = 0; + int check_dsdt; + int ret = 0; + + /* check dsdt at first to avoid clear fix_map for oem_addr */ + check_dsdt = es7000_check_dsdt(); + if (!find_unisys_acpi_oem_table(&oem_addr)) { - if (es7000_check_dsdt()) - return parse_unisys_oem((char *)oem_addr); + if (check_dsdt) + ret = parse_unisys_oem((char *)oem_addr); else { setup_unisys(); - return 1; + ret = 1; } + /* + * we need to unmap it + */ + unmap_unisys_acpi_oem_table(oem_addr); } - return 0; + return ret; } #else static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -66,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt destination.
+ */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 8091e68764c4..71a309b122e6 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -4,7 +4,6 @@ #define APIC_DEFINITION 1 #include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/smp.h> #include <asm/mpspec.h> #include <asm/genapic.h> #include <asm/fixmap.h> @@ -12,11 +11,12 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> -#include <asm/mach-numaq/mach_apic.h> -#include <asm/mach-numaq/mach_apicdef.h> -#include <asm/mach-numaq/mach_ipi.h> -#include <asm/mach-numaq/mach_mpparse.h> -#include <asm/mach-numaq/mach_wakecpu.h> +#include <asm/numaq/apicdef.h> +#include <linux/smp.h> +#include <asm/numaq/apic.h> +#include <asm/numaq/ipi.h> +#include <asm/numaq/mpparse.h> +#include <asm/numaq/wakecpu.h> #include <asm/numaq.h> static int mps_oem_check(struct mp_config_table *mpc, char *oem, @@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt destination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index a97ea0f35b1e..6272b5e69da6 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -4,19 +4,18 @@ #define APIC_DEFINITION 1 #include <linux/threads.h> #include <linux/cpumask.h> -#include <asm/smp.h> #include <asm/mpspec.h> #include <asm/genapic.h> #include <asm/fixmap.h> #include <asm/apicdef.h> #include <linux/kernel.h> #include <linux/string.h> -#include <linux/smp.h> #include <linux/init.h> -#include <asm/mach-summit/mach_apic.h> -#include <asm/mach-summit/mach_apicdef.h> -#include <asm/mach-summit/mach_ipi.h> -#include <asm/mach-summit/mach_mpparse.h> +#include <asm/summit/apicdef.h> +#include <linux/smp.h> +#include <asm/summit/apic.h> +#include <asm/summit/ipi.h> +#include <asm/summit/mpparse.h> static int probe_summit(void) { @@ -24,4 +23,18 @@ static int probe_summit(void) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt destination.
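Editorial note: the same vector_allocation_domain comment is added to es7000, numaq and summit, while bigsmp (earlier in this hunk) pins a vector's domain to a single CPU with cpumask_of_cpu(cpu). The two policies, modelled as a stand-alone toy (illustrative types, not the kernel's cpumask API):

#include <stdio.h>

#define TOY_NR_CPUS 8

typedef struct { unsigned long bits; } toy_cpumask_t;	/* toy cpumask */

/* bigsmp-style: physical destination mode, one CPU per vector domain. */
static toy_cpumask_t domain_one_cpu(int cpu)
{
	toy_cpumask_t m = { 1UL << cpu };
	return m;
}

/* es7000/numaq/summit-style: offer every CPU, since lowest-priority
 * delivery may not honor a narrower destination anyway. */
static toy_cpumask_t domain_all_cpus(int cpu)
{
	toy_cpumask_t m = { (1UL << TOY_NR_CPUS) - 1 };
	(void)cpu;
	return m;
}

int main(void)
{
	printf("one: 0x%lx all: 0x%lx\n",
	       domain_one_cpu(2).bits, domain_all_cpus(2).bits);
	return 0;
}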
+ */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_summit = APIC_INIT("summit", probe_summit); diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c index a037041817c7..4f4e50c3ad3b 100644 --- a/arch/x86/mach-rdc321x/platform.c +++ b/arch/x86/mach-rdc321x/platform.c @@ -25,7 +25,6 @@ #include <linux/list.h> #include <linux/device.h> #include <linux/platform_device.h> -#include <linux/version.h> #include <linux/leds.h> #include <asm/gpio.h> diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index ee0fba092157..0f6e8a6523ae 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -448,6 +448,8 @@ static void __init start_secondary(void *unused) VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); + notify_cpu_starting(cpuid); + /* enable interrupts */ local_irq_enable(); @@ -1481,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq) * the interrupt off to another CPU */ static void before_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); __u8 cpu = smp_processor_id(); _raw_spin_lock(&vic_irq_lock); @@ -1516,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq) /* Finish the VIC interrupt: basically mask */ static void after_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); _raw_spin_lock(&vic_irq_lock); { diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 9873716e9f76..59f89b434b45 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o + pat.o pgtable.o gup.o obj-$(CONFIG_X86_32) += pgtable_32.o @@ -13,11 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_NUMA) += discontig_32.o -else -obj-$(CONFIG_NUMA) += numa_64.o +obj-$(CONFIG_NUMA) += numa_$(BITS).o obj-$(CONFIG_K8_NUMA) += k8topology_64.o -endif obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0bb0caed8971..e7277cbcfb40 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. 
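Editorial note: the mask change that follows in dump_pagetables (from ~(PTE_MASK) to the explicit PTE_FLAGS_MASK) isolates the attribute bits of a pagetable entry. Illustrative arithmetic for a simple non-PAE 32-bit layout; the kernel derives the real masks per configuration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative non-PAE 32-bit split: bits 31..12 hold the page
	 * frame number, bits 11..0 the attribute flags. */
	const uint32_t PFN_MASK = 0xfffff000u;
	const uint32_t FLAGS_MASK = ~PFN_MASK;

	uint32_t pte = 0x12345063u;	/* frame 0x12345, P|RW|ACCESSED|DIRTY */

	printf("frame=0x%x flags=0x%x\n",
	       (unsigned int)(pte >> 12), (unsigned int)(pte & FLAGS_MASK));
	return 0;
}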
*/ - prot = pgprot_val(new_prot) & ~(PTE_MASK); - cur = pgprot_val(st->current_prot) & ~(PTE_MASK); + prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; + cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; if (!st->level) { /* First entry */ @@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; + pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; if (pmd_large(*start) || !pmd_present(*start)) note_page(m, st, __pgprot(prot), 3); @@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & ~PTE_MASK; + pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; if (pud_large(*start) || !pud_present(*start)) note_page(m, st, __pgprot(prot), 2); @@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m) for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; + pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; if (pgd_large(*start) || !pgd_present(*start)) note_page(m, &st, __pgprot(prot), 1); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 455f3fe67b42..31e8730fa246 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -35,6 +35,7 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm-generic/sections.h> +#include <asm/traps.h> /* * Page fault error code bits @@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) return 0; } -void do_invalid_op(struct pt_regs *, unsigned long); - static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG @@ -593,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long flags; #endif - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - tsk = current; mm = tsk->mm; prefetchw(&mm->mmap_sem); @@ -646,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) } -#ifdef CONFIG_X86_32 - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) - local_irq_enable(); - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. 
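Editorial note: the fault.c rewrite that follows unifies the 32- and 64-bit interrupt-enable paths: a user-mode fault always re-enables interrupts (once CR2 is saved) and is tagged with the USER bit of the hardware error code, while a kernel-mode fault only re-enables them if EFLAGS.IF was set. Condensed into a stand-alone decision function (a sketch, not the kernel's code):

#include <stdio.h>

#define PF_USER (1 << 2)	/* user-mode bit of the page-fault error code */

static unsigned int classify_fault(int user_mode, int if_was_set,
				   unsigned int error_code, int *enable_irqs)
{
	*enable_irqs = 0;
	if (user_mode) {
		*enable_irqs = 1;	/* safe once CR2 has been read */
		error_code |= PF_USER;	/* user registers imply a user access */
	} else if (if_was_set) {
		*enable_irqs = 1;
	}
	return error_code;
}

int main(void)
{
	int irqs;
	unsigned int ec = classify_fault(1, 0, 0x2, &irqs);

	printf("error_code=0x%x irqs=%d\n", ec, irqs);
	return 0;
}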
*/ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; -#else /* CONFIG_X86_64 */ - if (likely(regs->flags & X86_EFLAGS_IF)) + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; + } else if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); +#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(address, regs, error_code); +#endif /* * If we're in an interrupt, have no user context or are running in an @@ -672,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; again: -#endif - /* When running in the kernel we expect faults to occur only to + /* + * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem @@ -743,9 +730,6 @@ good_area: goto bad_area; } -#ifdef CONFIG_X86_32 -survive: -#endif /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -880,12 +864,11 @@ out_of_memory: up_read(&mm->mmap_sem); if (is_global_init(tsk)) { yield(); -#ifdef CONFIG_X86_32 - down_read(&mm->mmap_sem); - goto survive; -#else + /* + * Re-lookup the vma - in theory the vma tree might + * have changed: + */ goto again; -#endif } printk("VM: killing process %s\n", tsk->comm); @@ -915,15 +898,15 @@ LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { -#ifdef CONFIG_X86_32 - unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; +#ifdef CONFIG_X86_32 if (SHARED_KERNEL_PMD) return; - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { unsigned long flags; struct page *page; @@ -936,10 +919,8 @@ void vmalloc_sync_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } #else /* CONFIG_X86_64 */ - unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c new file mode 100644 index 000000000000..4ba373c5b8c8 --- /dev/null +++ b/arch/x86/mm/gup.c @@ -0,0 +1,298 @@ +/* + * Lockless get_user_pages_fast for x86 + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmstat.h> +#include <linux/highmem.h> + +#include <asm/pgtable.h> + +static inline pte_t gup_get_pte(pte_t *ptep) +{ +#ifndef CONFIG_X86_PAE + return *ptep; +#else + /* + * With get_user_pages_fast, we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atomically, + * but that is not possible (without expensive cmpxchg8b) on PAE.
What + * we do have is the guarantee that a pte will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees l iff pte_high + * sees h. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have got rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. And get_user_pages_fast only operates on present ptes, so + * we're safe. + * + * gup_get_pte should not be used or copied outside gup.c without being + * very careful -- it does not atomically load the pte or anything that + * is likely to be useful for you. + */ + pte_t pte; + +retry: + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + if (unlikely(pte.pte_low != ptep->pte_low)) + goto retry; + + return pte; +#endif +} + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t *ptep; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + + ptep = pte_offset_map(&pmd, addr); + do { + pte_t pte = gup_get_pte(ptep); + struct page *page; + + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + pte_unmap(ptep); + return 0; + } + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + get_page(page); + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + pte_unmap(ptep - 1); + + return 1; +} + +static inline void get_head_page_multiple(struct page *page, int nr) +{ + VM_BUG_ON(page != compound_head(page)); + VM_BUG_ON(page_count(page) == 0); + atomic_add(nr, &page->_count); +} + +static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pmd; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_flags(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { + if (!gup_huge_pmd(pmd, addr, next, 
write, pages, nr)) + return 0; + } else { + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static noinline int gup_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pud; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_flags(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (unlikely(pud_large(pud))) { + if (!gup_huge_pud(pud, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } + } while (pudp++, addr = next, addr != end); + + return 1; +} + +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + goto slow_irqon; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the page and take a ref on it.
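Editorial note: when the IRQs-off walk bails out, get_user_pages_fast falls back to the regular slow path for the remainder, and the function ends with the careful return-value merge visible below ("Have to be a bit careful with return values"). That merge, in isolation:

#include <stdio.h>

/* nr_fast pages were pinned with interrupts off; slow_ret is what the
 * ordinary get_user_pages() fallback returned for the rest. */
static int merge_gup_results(int nr_fast, int slow_ret)
{
	if (nr_fast > 0)
		return slow_ret < 0 ? nr_fast : nr_fast + slow_ret;
	return slow_ret;
}

int main(void)
{
	printf("%d\n", merge_gup_results(3, -14)); /* slow path faulted: report the 3 pinned */
	printf("%d\n", merge_gup_results(3, 5));   /* 3 fast + 5 slow = 8 */
	printf("%d\n", merge_gup_results(0, -14)); /* nothing pinned: propagate the error */
	return 0;
}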
+ */ + local_irq_disable(); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 165c871ba9af..bcc079c282dd 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) return (void*) vaddr; } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ struct page *kmap_atomic_to_page(void *ptr) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 0b3d567e686d..8f307d914c2e 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 1; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; @@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (pud) { - if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else { + BUG_ON(sz != PMD_SIZE); + if (pud_none(*pud)) + huge_pmd_share(mm, addr, pud); + pte = (pte_t *) pmd_alloc(mm, pud, addr); + } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); @@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pgd = pgd_offset(mm, addr); if (pgd_present(*pgd)) { pud = pud_offset(pgd, addr); - if (pud_present(*pud)) + if (pud_present(*pud)) { + if (pud_large(*pud)) + return (pte_t *)pud; pmd = pmd_offset(pud, addr); + } } return (pte_t *) pmd; } @@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd) return 0; } +int pud_huge(pud_t pud) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd) return !!(pmd_val(pmd) & _PAGE_PSE); } +int pud_huge(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_PSE); +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, page = pte_page(*(pte_t *)pmd); if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); + page += ((address & ~PMD_MASK) >> PAGE_SHIFT); + return page; +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pud); + if (page) + page += ((address & ~PUD_MASK) >> PAGE_SHIFT); return page; } + #endif /* x86_64 also uses this file */ @@ -228,6 +260,7 @@ static unsigned long 
hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; @@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, } full_search: - addr = ALIGN(start_addr, HPAGE_SIZE); + addr = ALIGN(start_addr, huge_page_size(h)); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ @@ -262,7 +295,7 @@ full_search: } if (addr + mm->cached_hole_size < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } @@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr0, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev_vma; unsigned long base = mm->mmap_base, addr = addr0; @@ -290,7 +324,7 @@ try_again: goto fail; /* either no address requested or cant fit in requested address hole */ - addr = (mm->free_area_cache - len) & HPAGE_MASK; + addr = (mm->free_area_cache - len) & huge_page_mask(h); do { /* * Lookup failure means no vma is above this address, @@ -321,7 +355,7 @@ try_again: largest_hole = vma->vm_start - addr; /* try just below the current vma->vm_start */ - addr = (vma->vm_start - len) & HPAGE_MASK; + addr = (vma->vm_start - len) & huge_page_mask(h); } while (len <= vma->vm_start); fail: @@ -359,22 +393,23 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#ifdef CONFIG_X86_64 +static __init int setup_hugepagesz(char *opt) +{ + unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + } else if (ps == PUD_SIZE && cpu_has_gbpages) { + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else { + printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", + ps >> 20); + return 0; + } + return 1; +} +__setup("hugepagesz=", setup_hugepagesz); +#endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9689a5138e64..8396868e82c5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -31,6 +31,7 @@ #include <linux/cpumask.h> #include <asm/asm.h> +#include <asm/bios_ebda.h> #include <asm/processor.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -47,6 +48,7 @@ #include <asm/paravirt.h> #include <asm/setup.h> #include <asm/cacheflush.h> +#include <asm/smp.h> unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, pgd_t *pgd; pmd_t *pmd; pte_t 
*pte; - unsigned pages_2m = 0, pages_4k = 0; + unsigned pages_2m, pages_4k; + int mapping_iter; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; if (!cpu_has_pse) use_pse = 0; +repeat: + pages_2m = pages_4k = 0; pfn = start_pfn; pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, if (use_pse) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; + /* + * first pass will use the same initial + * identity mapping attribute + _PAGE_PSE. + */ + pgprot_t init_prot = + __pgprot(PTE_IDENT_ATTR | + _PAGE_PSE); addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; @@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, prot = PAGE_KERNEL_LARGE_EXEC; pages_2m++; - set_pmd(pmd, pfn_pmd(pfn, prot)); + if (mapping_iter == 1) + set_pmd(pmd, pfn_pmd(pfn, init_prot)); + else + set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; continue; @@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { pgprot_t prot = PAGE_KERNEL; + /* + * first pass will use the same initial + * identity mapping attribute. + */ + pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); if (is_kernel_text(addr)) prot = PAGE_KERNEL_EXEC; pages_4k++; - set_pte(pte, pfn_pte(pfn, prot)); + if (mapping_iter == 1) + set_pte(pte, pfn_pte(pfn, init_prot)); + else + set_pte(pte, pfn_pte(pfn, prot)); } } } - update_page_count(PG_LEVEL_2M, pages_2m); - update_page_count(PG_LEVEL_4K, pages_4k); + if (mapping_iter == 1) { + /* + * update direct mapping page count only in the first + * iteration. + */ + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); + + /* + * local global flush tlb, which will flush the previous + * mappings present in both small and large page TLB's. + */ + __flush_tlb_all(); + + /* + * Second iteration will set the actual desired PTE attributes. 
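Editorial note: the two-pass scheme this comment describes, reduced to its control flow: pass one writes conservative attributes, a full TLB flush separates the passes, and pass two rewrites the final attributes without ever changing page size and attributes in a single step. A sketch with toy helpers, not the kernel's loop:

#include <stdio.h>

static void set_entry(int i, const char *attrs)
{
	printf("entry %d -> %s\n", i, attrs);
}

int main(void)
{
	int pass, i;

	for (pass = 1; pass <= 2; pass++) {
		for (i = 0; i < 4; i++)
			set_entry(i, pass == 1 ? "boot-time attrs"
					       : "final attrs (NX, GLOBAL...)");
		if (pass == 1)	/* never change size and attrs in one step */
			puts("-- __flush_tlb_all(): old translations gone --");
	}
	return 0;
}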
+ */ + mapping_iter = 2; + goto repeat; + } } /* @@ -458,11 +515,7 @@ static void __init pagetable_init(void) { pgd_t *pgd_base = swapper_pg_dir; - paravirt_pagetable_setup_start(pgd_base); - permanent_kmaps_init(pgd_base); - - paravirt_pagetable_setup_done(pgd_base); } #ifdef CONFIG_ACPI_SLEEP @@ -505,7 +558,7 @@ void zap_low_mappings(void) int nx_enabled; -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE @@ -722,7 +775,7 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -static void __init find_early_table_space(unsigned long end) +static void __init find_early_table_space(unsigned long end, int use_pse) { unsigned long puds, pmds, ptes, tables, start; @@ -732,7 +785,7 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); @@ -772,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, pgd_t *pgd_base = swapper_pg_dir; unsigned long start_pfn, end_pfn; unsigned long big_page_start; +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + int use_pse = 0; +#else + int use_pse = cpu_has_pse; +#endif /* * Find space for the kernel direct mapping tables. */ if (!after_init_bootmem) - find_early_table_space(end); + find_early_table_space(end, use_pse); #ifdef CONFIG_X86_PAE set_nx(); @@ -823,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - cpu_has_pse); + use_pse); /* tail is not big page alignment ? */ start_pfn = end_pfn; @@ -844,6 +907,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + if (!after_init_bootmem) + early_memtest(start, end); + return end >> PAGE_SHIFT; } @@ -868,8 +934,6 @@ void __init paging_init(void) */ sparse_init(); zone_sizes_init(); - - paravirt_post_allocator_init(); } /* @@ -906,6 +970,8 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; + start_periodic_check_for_corruption(); + #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif @@ -985,7 +1051,6 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - cpa_init(); save_pg_dir(); zap_low_mappings(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 306049edd553..b8e461d49412 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -31,6 +31,7 @@ #include <linux/nmi.h> #include <asm/processor.h> +#include <asm/bios_ebda.h> #include <asm/system.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -60,7 +61,7 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -int direct_gbpages __meminitdata +int direct_gbpages #ifdef CONFIG_DIRECT_GBPAGES = 1 #endif @@ -86,46 +87,69 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. 
*/ -void show_mem(void) -{ - long i, total = 0, reserved = 0; - long shared = 0, cached = 0; - struct page *page; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* - * This loop can take a while with 256 GB and - * 4k pages so defer the NMI watchdog: - */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); +int after_bootmem; - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +EXPORT_SYMBOL_GPL(__supported_pte_mask); - page = pfn_to_page(pgdat->node_start_pfn + i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } +static int do_not_nx __cpuinitdata; + +/* + * noexec=on|off + * Control non-executable mappings for 64-bit processes. + * + * on Enable (default) + * off Disable + */ +static int __init nonx_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n", reserved); - printk(KERN_INFO "%lu pages shared\n", shared); - printk(KERN_INFO "%lu pages swap cached\n", cached); + return 0; } +early_param("noexec", nonx_setup); -int after_bootmem; +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) + __supported_pte_mask &= ~_PAGE_NX; +} + +int force_personality32; + +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); -static __init void *spp_getpage(void) +/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 
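The noexec= handling above works by editing a global mask that is later ANDed into every PTE the kernel writes. A minimal stand-alone sketch of that idea (the mask, bit value, and parser here are models, not the kernel symbols):

#include <stdio.h>
#include <string.h>

#define _PAGE_NX_MODEL (1ULL << 63)

static unsigned long long supported_pte_mask = ~0ULL;
static int do_not_nx;

/* "noexec=off" clears NX from the global mask, so the bit is filtered
 * out of every PTE written later; "noexec=on" restores it. */
static int nonx_setup(const char *str)
{
    if (!str)
        return -1;
    if (!strncmp(str, "on", 2)) {
        supported_pte_mask |= _PAGE_NX_MODEL;
        do_not_nx = 0;
    } else if (!strncmp(str, "off", 3)) {
        do_not_nx = 1;
        supported_pte_mask &= ~_PAGE_NX_MODEL;
    }
    return 0;
}

int main(void)
{
    nonx_setup("off");
    printf("NX %s\n",
           (supported_pte_mask & _PAGE_NX_MODEL) ? "kept" : "masked out");
    return 0;
}
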
+ */ +static __ref void *spp_getpage(void) { void *ptr; @@ -172,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) } pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && pte_val(new_pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); set_pte(pte, new_pte); /* @@ -258,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; + unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; pmd_t *last_pmd = pmd + PTRS_PER_PMD; @@ -274,7 +295,7 @@ static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; -static __meminit void *alloc_low_page(unsigned long *phys) +static __ref void *alloc_low_page(unsigned long *phys) { unsigned long pfn = table_end++; void *adr; @@ -289,13 +310,13 @@ static __meminit void *alloc_low_page(unsigned long *phys) if (pfn >= table_top) panic("alloc_low_page: ran out of memory"); - adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); + adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); *phys = pfn * PAGE_SIZE; return adr; } -static __meminit void unmap_low_page(void *adr) +static __ref void unmap_low_page(void *adr) { if (after_bootmem) return; @@ -304,7 +325,8 @@ static __meminit void unmap_low_page(void *adr) } static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) { unsigned pages = 0; unsigned long last_map_addr = end; @@ -322,32 +344,40 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) break; } + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume whoever pre-set up + * these mappings is more intelligent.
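The "respect pre-populated entries" rule added to phys_pte_init() can be modeled as a fill loop that skips any slot which is already non-zero, e.g. one a hypervisor set up read-only (stand-alone sketch; array size and values are illustrative):

#include <stdio.h>

/* One entry is pre-populated (think: set up read-only by Xen);
 * the fill loop leaves it exactly as found. */
static unsigned long ptes[4] = { 0, 0x2063, 0, 0 };

static void fill_ptes(unsigned long prot)
{
    for (int i = 0; i < 4; i++) {
        if (ptes[i])        /* pre-existing mapping: re-use, don't touch */
            continue;
        ptes[i] = ((unsigned long)(i + 1) << 12) | prot;
    }
}

int main(void)
{
    fill_ptes(0x63);
    for (int i = 0; i < 4; i++)
        printf("pte[%d] = %#lx\n", i, ptes[i]);
    return 0;
}
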
+ */ if (pte_val(*pte)) continue; if (0) printk(" pte=%p addr=%lx pte=%016lx\n", pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); - set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); - last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; pages++; + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; } + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, + pgprot_t prot) { pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - return phys_pte_init(pte, address, end); + return phys_pte_init(pte, address, end, prot); } static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { unsigned long pages = 0; unsigned long last_map_addr = end; @@ -358,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; + pgprot_t new_prot = prot; if (address >= end) { if (!after_bootmem) { @@ -368,25 +399,48 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, } if (pmd_val(*pmd)) { - if (!pmd_large(*pmd)) + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pte_update(pmd, address, - end); - continue; + end, prot); + spin_unlock(&init_mm.page_table_lock); + continue; + } + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. 
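In other words, an existing 2M entry is either kept as-is or replaced by 4k entries that inherit its protection bits minus PSE, so old and new translations never differ in frame or attributes. A tiny stand-alone model of that decision (bit values and names are illustrative):

#include <stdio.h>

#define _PAGE_PSE_MODEL (1UL << 7)
#define PG_LEVEL_2M_MODEL 1

/* Keep an existing 2M entry when 2M mappings are acceptable; otherwise
 * the replacement 4k entries inherit its attributes, dropping PSE. */
static unsigned long prot_for_split(unsigned long pmd_val,
                                    unsigned long page_size_mask)
{
    if (page_size_mask & (1UL << PG_LEVEL_2M_MODEL))
        return pmd_val;                     /* large mapping stays */
    return pmd_val & ~_PAGE_PSE_MODEL;      /* inherit, drop PSE */
}

int main(void)
{
    unsigned long pmd = 0x1e3 | _PAGE_PSE_MODEL;
    printf("kept: %#lx, split prot: %#lx\n",
           prot_for_split(pmd, 1UL << PG_LEVEL_2M_MODEL),
           prot_for_split(pmd, 0));
    return 0;
}
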
+ */ + if (page_size_mask & (1 << PG_LEVEL_2M)) + continue; + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; + spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte(address >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + spin_unlock(&init_mm.page_table_lock); last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } pte = alloc_low_page(&pte_phys); - last_map_addr = phys_pte_init(pte, address, end); + last_map_addr = phys_pte_init(pte, address, end, new_prot); unmap_low_page(pte); + spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); return last_map_addr; @@ -394,14 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; - spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); - spin_unlock(&init_mm.page_table_lock); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); __flush_tlb_all(); return last_map_addr; } @@ -418,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; if (addr >= end) break; @@ -429,30 +482,49 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, } if (pud_val(*pud)) { - if (!pud_large(*pud)) + if (!pud_large(*pud)) { last_map_addr = phys_pmd_update(pud, addr, end, - page_size_mask); - continue; + page_size_mask, prot); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. 
+ */ + if (page_size_mask & (1 << PG_LEVEL_1G)) + continue; + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); } if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; + spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); last_map_addr = (addr & PUD_MASK) + PUD_SIZE; continue; } pmd = alloc_low_page(&pmd_phys); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, + prot); + unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); - unmap_low_page(pmd); pud_populate(&init_mm, pud, __va(pmd_phys)); spin_unlock(&init_mm.page_table_lock); - } __flush_tlb_all(); + update_page_count(PG_LEVEL_1G, pages); return last_map_addr; @@ -469,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -static void __init find_early_table_space(unsigned long end) +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) { unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); - if (direct_gbpages) { + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + if (use_gbpages) { unsigned long extra; extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; } else pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could @@ -517,118 +590,6 @@ static void __init init_gbpages(void) direct_gbpages = 0; } -#ifdef CONFIG_MEMTEST - -static void __init memtest(unsigned long start_phys, unsigned long size, - unsigned pattern) -{ - unsigned long i; - unsigned long *start; - unsigned long start_bad; - unsigned long last_bad; - unsigned long val; - unsigned long start_phys_aligned; - unsigned long count; - unsigned long incr; - - switch (pattern) { - case 0: - val = 0UL; - break; - case 1: - val = -1UL; - break; - case 2: - val = 0x5555555555555555UL; - break; - case 3: - val = 0xaaaaaaaaaaaaaaaaUL; - break; - default: - return; - } - - incr = sizeof(unsigned long); - start_phys_aligned = ALIGN(start_phys, incr); - count = (size - (start_phys_aligned - start_phys))/incr; - start = __va(start_phys_aligned); - start_bad = 0; - last_bad = 0; - - for (i = 0; i < count; i++) - start[i] = val; - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != val) { - if (start_phys_aligned == last_bad + incr) { - last_bad += incr; - } else { - if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", - val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); - } - start_bad = last_bad = start_phys_aligned; - } - } - } - if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", - val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); - } - -} - -/* default is disabled */ -static int memtest_pattern __initdata; - -static int __init 
parse_memtest(char *arg) -{ - if (arg) - memtest_pattern = simple_strtoul(arg, NULL, 0); - return 0; -} - -early_param("memtest", parse_memtest); - -static void __init early_memtest(unsigned long start, unsigned long end) -{ - u64 t_start, t_size; - unsigned pattern; - - if (!memtest_pattern) - return; - - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); - for (pattern = 0; pattern < memtest_pattern; pattern++) { - t_start = start; - t_size = 0; - while (t_start < end) { - t_start = find_e820_area_size(t_start, &t_size, 1); - - /* done ? */ - if (t_start >= end) - break; - if (t_start + t_size > end) - t_size = end - t_start; - - printk(KERN_CONT "\n %016llx - %016llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, pattern); - - memtest(t_start, t_size, pattern); - - t_start += t_size; - } - } - printk(KERN_CONT "\n"); -} -#else -static void __init early_memtest(unsigned long start, unsigned long end) -{ -} -#endif - static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) @@ -654,17 +615,16 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, continue; } - if (after_bootmem) - pud = pud_offset(pgd, start & PGDIR_MASK); - else - pud = alloc_low_page(&pud_phys); - + pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); unmap_low_page(pud); - pgd_populate(&init_mm, pgd_offset_k(start), - __va(pud_phys)); + + spin_lock(&init_mm.page_table_lock); + pgd_populate(&init_mm, pgd, __va(pud_phys)); + spin_unlock(&init_mm.page_table_lock); } + __flush_tlb_all(); return last_map_addr; } @@ -708,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, struct map_range mr[NR_RANGE_MR]; int nr_range, i; + int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping\n"); @@ -721,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) init_gbpages(); - if (direct_gbpages) +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. 
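The page_size_mask selection that follows this comment can be modeled stand-alone: DEBUG_PAGEALLOC forces pure 4k mappings, otherwise the hardware capabilities pick the allowed levels (names and enum values here are illustrative, not the kernel's):

#include <stdio.h>

enum { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G };   /* illustrative */

static unsigned long pick_page_sizes(int debug_pagealloc,
                                     int cpu_has_pse, int direct_gbpages)
{
    /* DEBUG_PAGEALLOC keeps the identity map at 4k so cpa() never has
     * to split a large page from interrupt context. */
    int use_pse = debug_pagealloc ? 0 : cpu_has_pse;
    int use_gbpages = debug_pagealloc ? 0 : direct_gbpages;
    unsigned long mask = 0;

    if (use_gbpages)
        mask |= 1UL << PG_LEVEL_1G;
    if (use_pse)
        mask |= 1UL << PG_LEVEL_2M;
    return mask;
}

int main(void)
{
    printf("normal: %#lx, debug: %#lx\n",
           pick_page_sizes(0, 1, 1), pick_page_sizes(1, 1, 1));
    return 0;
}
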
+ */ + use_pse = use_gbpages = 0; +#else + use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; +#endif + + if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; - if (cpu_has_pse) + if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; memset(mr, 0, sizeof(mr)); @@ -773,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, old_start = mr[i].start; memmove(&mr[i], &mr[i+1], (nr_range - 1 - i) * sizeof (struct map_range)); - mr[i].start = old_start; + mr[i--].start = old_start; nr_range--; } @@ -784,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); if (!after_bootmem) - find_early_table_space(end); + find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) last_map_addr = kernel_physical_mapping_init( @@ -906,6 +879,8 @@ void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; + start_periodic_check_for_corruption(); + pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ @@ -943,8 +918,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - - cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 24c1d3c30186..ae71e11eb3e5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -24,18 +24,47 @@ #ifdef CONFIG_X86_64 +static inline int phys_addr_valid(unsigned long addr) +{ + return addr < (1UL << boot_cpu_data.x86_phys_bits); +} + unsigned long __phys_addr(unsigned long x) { - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); + x += phys_base; + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : + !phys_addr_valid(x)); + } + return x; } EXPORT_SYMBOL(__phys_addr); -static inline int phys_addr_valid(unsigned long addr) +bool __virt_addr_valid(unsigned long x) { - return addr < (1UL << boot_cpu_data.x86_phys_bits); + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) + return false; + x += phys_base; + } else { + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; + if (system_state == SYSTEM_BOOTING ? 
+ x > MAXMEM : !phys_addr_valid(x)) { + return false; + } + } + + return pfn_valid(x >> PAGE_SHIFT); } +EXPORT_SYMBOL(__virt_addr_valid); #else @@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr) return 1; } +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + /* VMALLOC_* aren't constants; not available at the boot time */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && + is_vmalloc_addr((void *) x)); + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); +#endif + +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + #endif int page_is_ram(unsigned long pagenr) @@ -83,6 +134,25 @@ int page_is_ram(unsigned long pagenr) return 0; } +int pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + if (page_is_ram(page_nr)) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -150,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return (__force void __iomem *)phys_to_virt(phys_addr); /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ON(iomem_map_sanity_check(phys_addr, size)); + + /* * Don't allow anybody to remap normal RAM that we're using.. */ for (pfn = phys_addr >> PAGE_SHIFT; @@ -170,7 +246,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; - retval = reserve_memtype(phys_addr, phys_addr + size, + retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { pr_debug("Warning: reserve_memtype returned %d\n", retval); @@ -204,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, switch (prot_val) { case _PAGE_CACHE_UC: default: - prot = PAGE_KERNEL_NOCACHE; + prot = PAGE_KERNEL_IO_NOCACHE; break; case _PAGE_CACHE_UC_MINUS: - prot = PAGE_KERNEL_UC_MINUS; + prot = PAGE_KERNEL_IO_UC_MINUS; break; case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_WC; + prot = PAGE_KERNEL_IO_WC; break; case _PAGE_CACHE_WB: - prot = PAGE_KERNEL; + prot = PAGE_KERNEL_IO; break; } @@ -330,6 +406,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr, return (void __iomem *)ret; } +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + /** * iounmap - Free a IO remapping * @addr: virtual address from ioremap_* @@ -413,7 +497,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -int __initdata early_ioremap_debug; +static int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) { @@ -522,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, } static inline void __init early_set_fixmap(enum fixed_addresses idx, - unsigned long phys) + unsigned long phys, pgprot_t prot) { if (after_paging_init) - set_fixmap(idx, phys); + 
__set_fixmap(idx, phys, prot); else - __early_set_fixmap(idx, phys, PAGE_KERNEL); + __early_set_fixmap(idx, phys, prot); } static inline void __init early_clear_fixmap(enum fixed_addresses idx) @@ -538,37 +622,56 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) __early_set_fixmap(idx, 0, __pgprot(0)); } - -int __initdata early_ioremap_nested; - +static void *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; static int __init check_early_ioremap_leak(void) { - if (!early_ioremap_nested) - return 0; + int count = 0; + int i; - printk(KERN_WARNING + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (!count) + return 0; + WARN(1, KERN_WARNING "Debug warning: early ioremap leak of %d areas detected.\n", - early_ioremap_nested); + count); printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - WARN_ON(1); + "please boot with early_ioremap_debug and report the dmesg.\n"); return 1; } late_initcall(check_early_ioremap_leak); -void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; - unsigned int nrpages, nesting; + unsigned int nrpages; enum fixed_addresses idx0, idx; + int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); - nesting = early_ioremap_nested; + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", + phys_addr, size); + WARN_ON(1); + return NULL; + } + if (early_ioremap_debug) { printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", - phys_addr, size, nesting); + phys_addr, size, slot); dump_stack(); } @@ -579,17 +682,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) return NULL; } - if (nesting >= FIX_BTMAPS_NESTING) { - WARN_ON(1); - return NULL; - } - early_ioremap_nested++; + prev_size[slot] = size; /* * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; /* * Mappings have to fit in the FIX_BTMAP area. @@ -603,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) /* * Ok, go for it.. 
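The slot bookkeeping above replaces the old nesting counter; out of order unmaps now work, and mismatches are caught. A stand-alone model of the allocate/release pattern, including the size check that early_iounmap() now enforces (array sizes and names are illustrative):

#include <stdio.h>

#define SLOTS_MODEL 4

static void *prev_map[SLOTS_MODEL];
static unsigned long prev_size[SLOTS_MODEL];

/* Each mapping claims the first free slot and records its size. */
static int slot_alloc(void *addr, unsigned long size)
{
    for (int i = 0; i < SLOTS_MODEL; i++) {
        if (!prev_map[i]) {
            prev_map[i] = addr;
            prev_size[i] = size;
            return i;
        }
    }
    return -1;      /* all slots busy: caller warns and fails */
}

/* Unmap must present the same pointer and size, which catches
 * mismatched or double unmaps. */
static int slot_free(void *addr, unsigned long size)
{
    for (int i = 0; i < SLOTS_MODEL; i++) {
        if (prev_map[i] == addr) {
            if (prev_size[i] != size)
                return -1;      /* size mismatch: refuse */
            prev_map[i] = NULL;
            return i;
        }
    }
    return -1;                  /* unknown address */
}

int main(void)
{
    char buf[64];
    int s = slot_alloc(buf, sizeof(buf));
    printf("mapped in slot %d, unmap -> %d\n", s, slot_free(buf, sizeof(buf)));
    return 0;
}
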
*/ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; idx = idx0; while (nrpages > 0) { - early_set_fixmap(idx, phys_addr); + early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; --idx; --nrpages; @@ -614,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) if (early_ioremap_debug) printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); - return (void *) (offset + fix_to_virt(idx0)); + prev_map[slot] = (void *) (offset + fix_to_virt(idx0)); + return prev_map[slot]; +} + +/* Remap an IO device */ +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); +} + +/* Remap memory */ +void __init *early_memremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL); } void __init early_iounmap(void *addr, unsigned long size) @@ -623,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size) unsigned long offset; unsigned int nrpages; enum fixed_addresses idx; - int nesting; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } - nesting = --early_ioremap_nested; - if (WARN_ON(nesting < 0)) + if (slot < 0) { + printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", + addr, size); + WARN_ON(1); return; + } + + if (prev_size[slot] != size) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot]); + WARN_ON(1); + return; + } if (early_ioremap_debug) { printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, nesting); + size, slot); dump_stack(); } @@ -643,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size) offset = virt_addr & ~PAGE_MASK; nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_clear_fixmap(idx); --idx; --nrpages; } + prev_map[slot] = 0; } void __this_fixmap_does_not_exist(void) diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c new file mode 100644 index 000000000000..9cab18b0b857 --- /dev/null +++ b/arch/x86/mm/memtest.c @@ -0,0 +1,122 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/pfn.h> + +#include <asm/e820.h> + +static void __init memtest(unsigned long start_phys, unsigned long size, + unsigned pattern) +{ + unsigned long i; + unsigned long *start; + unsigned long start_bad; + unsigned long last_bad; + unsigned long val; + unsigned long start_phys_aligned; + unsigned long count; + unsigned long incr; + + switch (pattern) { + case 0: + val = 0UL; + break; + case 1: + val = -1UL; + break; + case 2: +#ifdef CONFIG_X86_64 + val = 0x5555555555555555UL; +#else + val = 0x55555555UL; +#endif + break; + case 3: +#ifdef CONFIG_X86_64 + val = 0xaaaaaaaaaaaaaaaaUL; +#else + val = 0xaaaaaaaaUL; +#endif + break; + default: + return; + } + + incr = sizeof(unsigned long); + start_phys_aligned = ALIGN(start_phys, incr); + count = (size - (start_phys_aligned - start_phys))/incr; + start = __va(start_phys_aligned); + start_bad = 0; + last_bad = 0; + + for (i = 0; i < count; i++) + start[i] = val; + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { + if (*start != val) { + if (start_phys_aligned == 
last_bad + incr) { + last_bad += incr; + } else { + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad + incr, "BAD RAM"); + } + start_bad = last_bad = start_phys_aligned; + } + } + } + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad + incr, "BAD RAM"); + } +} + +/* default is disabled */ +static int memtest_pattern __initdata; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("memtest", parse_memtest); + +void __init early_memtest(unsigned long start, unsigned long end) +{ + u64 t_start, t_size; + unsigned pattern; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + for (pattern = 0; pattern < memtest_pattern; pattern++) { + t_start = start; + t_size = 0; + while (t_start < end) { + t_start = find_e820_area_size(t_start, &t_size, 1); + + /* done ? */ + if (t_start >= end) + break; + if (t_start + t_size > end) + t_size = end - t_start; + + printk(KERN_CONT "\n %010llx - %010llx pattern %d", + (unsigned long long)t_start, + (unsigned long long)t_start + t_size, pattern); + + memtest(t_start, t_size, pattern); + + t_start += t_size; + } + } + printk(KERN_CONT "\n"); +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index e7397e108beb..2c4baa88f2cb 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -56,13 +56,6 @@ struct remap_trace { static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); -#if 0 /* XXX: no way gather this info anymore */ -/* Access to this is not per-cpu. */ -static DEFINE_PER_CPU(atomic_t, dropped); -#endif - -static struct dentry *marker_file; - static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; @@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ * and trace_lock. * - Routines depending on is_enabled() must take trace_lock. * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that mmio_trace_record is allowed. + * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. * - pre/post callbacks assume the effect of is_enabled() being true. */ @@ -97,44 +90,6 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } -#if 0 /* XXX: needs rewrite */ -/* - * Write callback for the debugfs entry: - * Read a marker and write it to the mmio trace log - */ -static ssize_t write_marker(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *event = NULL; - struct mm_io_header *headp; - ssize_t len = (count > 65535) ? 
65535 : count; - - event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); - if (!event) - return -ENOMEM; - - headp = (struct mm_io_header *)event; - headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); - headp->data_len = len; - - if (copy_from_user(event + sizeof(*headp), buffer, len)) { - kfree(event); - return -EFAULT; - } - - spin_lock_irq(&trace_lock); -#if 0 /* XXX: convert this to use tracing */ - if (is_enabled()) - relay_write(chan, event, sizeof(*headp) + len); - else -#endif - len = -EINVAL; - spin_unlock_irq(&trace_lock); - kfree(event); - return len; -} -#endif - static void print_pte(unsigned long address) { unsigned int level; @@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, map.map_id = trace->id; spin_lock_irq(&trace_lock); - if (!is_enabled()) + if (!is_enabled()) { + kfree(trace); goto not_enabled; + } mmio_trace_mapping(&map); list_add_tail(&trace->list, &trace_list); @@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr) iounmap_trace_core(addr); } +int mmiotrace_printk(const char *fmt, ...) +{ + int ret = 0; + va_list args; + unsigned long flags; + va_start(args, fmt); + + spin_lock_irqsave(&trace_lock, flags); + if (is_enabled()) + ret = mmio_trace_printk(fmt, args); + spin_unlock_irqrestore(&trace_lock, flags); + + va_end(args); + return ret; +} +EXPORT_SYMBOL(mmiotrace_printk); + static void clear_trace_list(void) { struct remap_trace *trace; @@ -430,7 +404,9 @@ static void enter_uniprocessor(void) "may miss events.\n"); } -static void leave_uniprocessor(void) +/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, + but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ +static void __ref leave_uniprocessor(void) { int cpu; int err; @@ -460,26 +436,12 @@ static void leave_uniprocessor(void) } #endif -#if 0 /* XXX: out of order */ -static struct file_operations fops_marker = { - .owner = THIS_MODULE, - .write = write_marker -}; -#endif - void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; -#if 0 /* XXX: tracing does not support text entries */ - marker_file = debugfs_create_file("marker", 0660, dir, NULL, - &fops_marker); - if (!marker_file) - pr_err(NAME "marker file creation failed.\n"); -#endif - if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); enter_uniprocessor(); @@ -504,11 +466,6 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); - if (marker_file) { - debugfs_remove(marker_file); - marker_file = NULL; - } - pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c index 5dfef9fa061a..847c164725f4 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/numa_32.c @@ -42,7 +42,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static bootmem_data_t node0_bdata; /* * numa interface - we expect the numa architecture specific code to have @@ -329,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn, get_memcfg_numa(); - kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); + kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { @@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn, for_each_online_node(nid) memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(0)->bdata = &node0_bdata; + NODE_DATA(0)->bdata = 
&bootmem_node_data[0]; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index b432d5781773..cebcbf152d46 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -20,15 +20,9 @@ #include <asm/acpi.h> #include <asm/k8.h> -#ifndef Dprintk -#define Dprintk(x...) -#endif - struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static bootmem_data_t plat_node_bdata[MAX_NUMNODES]; - struct memnode memnode; s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { @@ -85,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void) return 0; addr = 0x8000; - nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); + nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, nodemap_size, L1_CACHE_BYTES); if (nodemap_addr == -1UL) { @@ -182,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; unsigned long bootmap_start, nodedata_phys; void *bootmap; - const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); + const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; - start = round_up(start, ZONE_ALIGN); + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); @@ -202,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, nodedata_phys + pgdat_size - 1); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; + NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; @@ -216,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); nid = phys_to_nid(nodedata_phys); if (nid == nodeid) - bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); else - bootmap_start = round_up(start, PAGE_SIZE); + bootmap_start = roundup(start, PAGE_SIZE); /* * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 0dcd42eb94e6..e1d106909218 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -32,7 +32,7 @@ enum { GPS = (1<<30) }; -#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) +#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) static int pte_testbit(pte_t pte) { @@ -118,6 +118,7 @@ static int pageattr_test(void) unsigned int level; int i, k; int err; + unsigned long test_addr; if (print) printk(KERN_INFO "CPA self-test:\n"); @@ -172,7 +173,8 @@ static int pageattr_test(void) continue; } - err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); + test_addr = addr[i]; + err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; @@ -204,7 +206,8 @@ static int pageattr_test(void) failed++; continue; } - err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); + test_addr = addr[i]; + err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; @@ -221,8 +224,7 @@ static int pageattr_test(void) failed += print_split(&sc); if (failed) { - printk(KERN_ERR "NOT PASSED. 
Please report.\n"); - WARN_ON(1); + WARN(1, KERN_ERR "NOT PASSED. Please report.\n"); return -EINVAL; } else { if (print) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 65c6e46bf059..f1dc1b75d166 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -25,15 +25,27 @@ * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { - unsigned long vaddr; + unsigned long *vaddr; pgprot_t mask_set; pgprot_t mask_clr; int numpages; - int flushtlb; + int flags; unsigned long pfn; unsigned force_split : 1; + int curpage; }; +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock, so that no other cpu with stale large tlb entries can + * change the page attribute in parallel while some other cpu is splitting + * a large page entry and changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -53,17 +65,22 @@ static void split_page_count(int level) direct_pages_count[level - 1] += PTRS_PER_PTE; } -int arch_report_meminfo(char *page) +void arch_report_meminfo(struct seq_file *m) { - int n = sprintf(page, "DirectMap4k: %8lu\n" - "DirectMap2M: %8lu\n", - direct_pages_count[PG_LEVEL_4K], - direct_pages_count[PG_LEVEL_2M]); + seq_printf(m, "DirectMap4k: %8lu kB\n", + direct_pages_count[PG_LEVEL_4K] << 2); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + seq_printf(m, "DirectMap2M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 11); +#else + seq_printf(m, "DirectMap4M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 12); +#endif #ifdef CONFIG_X86_64 - n += sprintf(page + n, "DirectMap1G: %8lu\n", - direct_pages_count[PG_LEVEL_1G]); + if (direct_gbpages) + seq_printf(m, "DirectMap1G: %8lu kB\n", + direct_pages_count[PG_LEVEL_1G] << 20); #endif - return n; } #else static inline void split_page_count(int level) { } @@ -78,7 +95,7 @@ static inline unsigned long highmap_start_pfn(void) static inline unsigned long highmap_end_pfn(void) { - return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -184,6 +201,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } +static void cpa_flush_array(unsigned long *start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long *addr; + + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* 4M threshold */ + if (numpages >= 1024) { + if (boot_cpu_data.x86_model >= 4) + wbinvd(); + return; + } + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr++) { + pte_t *pte = lookup_address(*addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) *addr, PAGE_SIZE); + } +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text.
Callers don't always get this @@ -392,7 +444,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); __set_pmd_pte(kpte, address, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; do_split = 0; } @@ -402,84 +454,6 @@ out_unlock: return do_split; } -static LIST_HEAD(page_pool); -static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed; - -static void cpa_fill_pool(struct page **ret) -{ - gfp_t gfp = GFP_KERNEL; - unsigned long flags; - struct page *p; - - /* - * Avoid recursion (on debug-pagealloc) and also signal - * our priority to get to these pagetables: - */ - if (current->flags & PF_MEMALLOC) - return; - current->flags |= PF_MEMALLOC; - - /* - * Allocate atomically from atomic contexts: - */ - if (in_atomic() || irqs_disabled() || debug_pagealloc) - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - - while (pool_pages < pool_size || (ret && !*ret)) { - p = alloc_pages(gfp, 0); - if (!p) { - pool_failed++; - break; - } - /* - * If the call site needs a page right now, provide it: - */ - if (ret && !*ret) { - *ret = p; - continue; - } - spin_lock_irqsave(&pgd_lock, flags); - list_add(&p->lru, &page_pool); - pool_pages++; - spin_unlock_irqrestore(&pgd_lock, flags); - } - - current->flags &= ~PF_MEMALLOC; -} - -#define SHIFT_MB (20 - PAGE_SHIFT) -#define ROUND_MB_GB ((1 << 10) - 1) -#define SHIFT_MB_GB 10 -#define POOL_PAGES_PER_GB 16 - -void __init cpa_init(void) -{ - struct sysinfo si; - unsigned long gb; - - si_meminfo(&si); - /* - * Calculate the number of pool pages: - * - * Convert totalram (nr of pages) to MiB and round to the next - * GiB. Shift MiB to Gib and multiply the result by - * POOL_PAGES_PER_GB: - */ - if (debug_pagealloc) { - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; - } else { - pool_size = 1; - } - pool_low = pool_size; - - cpa_fill_pool(NULL); - printk(KERN_DEBUG - "CPA: page pool initialized %lu of %lu pages preallocated\n", - pool_pages, pool_size); -} - static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; @@ -488,28 +462,15 @@ static int split_large_page(pte_t *kpte, unsigned long address) pgprot_t ref_prot; struct page *base; - /* - * Get a page from the pool. 
The pool list is protected by the - * pgd_lock, which we have to take anyway for the split - * operation: - */ - spin_lock_irqsave(&pgd_lock, flags); - if (list_empty(&page_pool)) { - spin_unlock_irqrestore(&pgd_lock, flags); - base = NULL; - cpa_fill_pool(&base); - if (!base) - return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); - } else { - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - } + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page * up for us already: @@ -566,11 +527,8 @@ out_unlock: * If we dropped out via the lookup_address check under * pgd_lock then free the unused page: */ - if (base) { - list_add(&base->lru, &page_pool); - pool_pages++; - } else - pool_used++; + if (base) + __free_page(base); spin_unlock_irqrestore(&pgd_lock, flags); return 0; @@ -578,11 +536,16 @@ out_unlock: static int __change_page_attr(struct cpa_data *cpa, int primary) { - unsigned long address = cpa->vaddr; + unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; + if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; + repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -592,10 +555,9 @@ repeat: if (!pte_val(old_pte)) { if (!primary) return 0; - printk(KERN_WARNING "CPA: called for zero pte. " + WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", address, - cpa->vaddr); - WARN_ON(1); + *cpa->vaddr); return -EINVAL; } @@ -621,7 +583,7 @@ repeat: */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; @@ -645,7 +607,25 @@ repeat: */ err = split_large_page(kpte, address); if (!err) { - cpa->flushtlb = 1; + /* + * Do a global tlb flush after splitting the large page + * and before we do the actual page attribute change in the PTE. + * + * Without this, we violate the TLB application note, which says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that no + * other cpu with stale tlb entries can change a page attribute in + * parallel for an address that falls into the just split large + * page entry.
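A stand-alone model of the locking shape used here: the allocation may sleep, so cpa_lock is dropped around it, and the global flush happens with the lock held, before any attribute change touches the freshly split entries (a pthread mutex stands in for the kernel spinlock; every name is a model, not kernel API):

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t cpa_lock_model = PTHREAD_MUTEX_INITIALIZER;

static void flush_tlb_all_model(void) { puts("global TLB flush"); }

static int split_large_page_model(void)
{
    pthread_mutex_unlock(&cpa_lock_model);  /* may sleep while allocating */
    void *base = malloc(4096);
    pthread_mutex_lock(&cpa_lock_model);
    if (!base)
        return -1;
    puts("large entry replaced by a page table");
    free(base);     /* the kernel would install the page instead */
    return 0;
}

int main(void)
{
    pthread_mutex_lock(&cpa_lock_model);
    if (!split_large_page_model())
        flush_tlb_all_model();  /* before changing the new 4k PTEs */
    pthread_mutex_unlock(&cpa_lock_model);
    return 0;
}
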
+ */ + flush_tlb_all(); goto repeat; } @@ -658,6 +638,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; int ret = 0; + unsigned long temp_cpa_vaddr, vaddr; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -670,16 +651,24 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (!(within(cpa->vaddr, PAGE_OFFSET, + if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) #ifdef CONFIG_X86_64 - || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), + || within(vaddr, PAGE_OFFSET + (1UL<<32), PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) #endif )) { alias_cpa = *cpa; - alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; + ret = __change_page_attr_set_clr(&alias_cpa, 0); } @@ -691,7 +680,7 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the high * mapping already: */ - if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) + if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) return 0; /* @@ -702,8 +691,9 @@ static int cpa_process_alias(struct cpa_data *cpa) return 0; alias_cpa = *cpa; - alias_cpa.vaddr = - (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; /* * The high mapping range is imprecise, so ignore the return value. @@ -723,8 +713,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * preservation check. 
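The per-call address selection that the reworked cpa_data enables looks like this in isolation (model struct and flag value, not the kernel's):

#include <stdio.h>

#define CPA_ARRAY_MODEL 2

/* Array mode indexes vaddr[curpage]; single mode dereferences vaddr,
 * which is advanced in place between chunks. */
struct cpa_data_model {
    unsigned long *vaddr;
    int flags;
    int curpage;
};

static unsigned long current_address(const struct cpa_data_model *cpa)
{
    if (cpa->flags & CPA_ARRAY_MODEL)
        return cpa->vaddr[cpa->curpage];
    return *cpa->vaddr;
}

int main(void)
{
    unsigned long pages[] = { 0x1000, 0x5000, 0x9000 };
    struct cpa_data_model cpa = { pages, CPA_ARRAY_MODEL, 1 };
    printf("%#lx\n", current_address(&cpa));    /* prints 0x5000 */
    return 0;
}
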
*/ cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & CPA_ARRAY) + cpa->numpages = 1; + if (!debug_pagealloc) + spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc) + spin_unlock(&cpa_lock); if (ret) return ret; @@ -741,7 +738,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) */ BUG_ON(cpa->numpages > numpages); numpages -= cpa->numpages; - cpa->vaddr += cpa->numpages * PAGE_SIZE; + if (cpa->flags & CPA_ARRAY) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } @@ -752,9 +753,9 @@ static inline int cache_attr(pgprot_t attr) (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int change_page_attr_set_clr(unsigned long addr, int numpages, +static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split) + int force_split, int array) { struct cpa_data cpa; int ret, cache, checkalias; @@ -769,21 +770,40 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, return 0; /* Ensure we are PAGE_SIZE aligned */ - if (addr & ~PAGE_MASK) { - addr &= PAGE_MASK; - /* - * People should not be passing in unaligned addresses: - */ - WARN_ON_ONCE(1); + if (!array) { + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + } else { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } } + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + + vm_unmap_aliases(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; - cpa.flushtlb = 0; + cpa.flags = 0; + cpa.curpage = 0; cpa.force_split = force_split; + if (array) + cpa.flags |= CPA_ARRAY; + /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -792,7 +812,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, /* * Check whether we really changed something: */ - if (!cpa.flushtlb) + if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* @@ -807,27 +827,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * error case we fall back to cpa_flush_all (which uses * wbindv): */ - if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages, cache); - else + if (!ret && cpu_has_clflush) { + if (cpa.flags & CPA_ARRAY) + cpa_flush_array(addr, numpages, cache); + else + cpa_flush_range(*addr, numpages, cache); + } else cpa_flush_all(cache); out: - cpa_fill_pool(NULL); - return ret; } -static inline int change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + array); } -static inline int change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + array); } int _set_memory_uc(unsigned long addr, int numpages) @@ -835,8 +858,8 @@ int _set_memory_uc(unsigned long addr, int numpages) /* * 
for now UC MINUS. see comments in ioremap_nocache() */ - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) @@ -844,7 +867,7 @@ int set_memory_uc(unsigned long addr, int numpages) /* * for now UC MINUS. see comments in ioremap_nocache() */ - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_UC_MINUS, NULL)) return -EINVAL; @@ -852,10 +875,48 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + unsigned long start; + unsigned long end; + int i; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + for (i = 0; i < addrinarray; i++) { + start = __pa(addr[i]); + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) + goto out; + } + + return change_page_attr_set(addr, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS), 1); +out: + for (i = 0; i < addrinarray; i++) { + unsigned long tmp = __pa(addr[i]); + + if (tmp == start) + break; + for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(tmp, end); + } + return -EINVAL; +} +EXPORT_SYMBOL(set_memory_array_uc); + int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_WC)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_WC), 0); } int set_memory_wc(unsigned long addr, int numpages) @@ -863,7 +924,7 @@ int set_memory_wc(unsigned long addr, int numpages) if (!pat_enabled) return set_memory_uc(addr, numpages); - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_WC, NULL)) return -EINVAL; @@ -873,49 +934,71 @@ EXPORT_SYMBOL(set_memory_wc); int _set_memory_wb(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_CACHE_MASK)); + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) { - free_memtype(addr, addr + numpages * PAGE_SIZE); + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return _set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_memory_wb); +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + + for (i = 0; i < addrinarray; i++) { + unsigned long start = __pa(addr[i]); + unsigned long end; + + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(start, end); + } + return change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); +} +EXPORT_SYMBOL(set_memory_array_wb); + int set_memory_x(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); int set_memory_ro(unsigned 
long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } +EXPORT_SYMBOL_GPL(set_memory_ro); int set_memory_rw(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } +EXPORT_SYMBOL_GPL(set_memory_rw); int set_memory_np(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), - __pgprot(0), 1); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0); } int set_pages_uc(struct page *page, int numpages) @@ -968,22 +1051,38 @@ int set_pages_rw(struct page *page, int numpages) static int __set_pages_p(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .mask_clr = __pgprot(0)}; + .mask_clr = __pgprot(0), + .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } static int __set_pages_np(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } void kernel_map_pages(struct page *page, int numpages, int enable) @@ -1003,11 +1102,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) /* * The return value is ignored as the calls cannot fail. - * Large pages are kept enabled at boot time, and are - * split up quickly with DEBUG_PAGEALLOC. If a splitup - * fails here (due to temporary memory shortage) no damage - * is done because we just keep the largepage intact up - * to the next attempt when it will likely be split up: + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); @@ -1019,53 +1115,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); - - /* - * Try to refill the page pool here. We can do this only after - * the tlb flush. 
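A stand-alone model of the kernel_map_pages() behaviour above: map pages present on allocation, not-present on free, flushing only the local TLB (printf stands in for the actual PTE update; all names are models):

#include <stdio.h>

static void set_page_present(unsigned long page, int present)
{
    printf("page %#lx -> %s\n", page, present ? "P|RW" : "not present");
}

static void kernel_map_pages_model(unsigned long page, int numpages,
                                   int enable)
{
    for (int i = 0; i < numpages; i++)
        set_page_present(page + i, enable);
    /* flush only this CPU's TLB; a cross-CPU flush could deadlock here */
}

int main(void)
{
    kernel_map_pages_model(0x100, 2, 0);    /* free: unmap, catch UAF */
    kernel_map_pages_model(0x100, 2, 1);    /* alloc: map back in */
    return 0;
}
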
- */ - cpa_fill_pool(NULL); -} - -#ifdef CONFIG_DEBUG_FS -static int dpa_show(struct seq_file *m, void *v) -{ - seq_puts(m, "DEBUG_PAGEALLOC\n"); - seq_printf(m, "pool_size : %lu\n", pool_size); - seq_printf(m, "pool_pages : %lu\n", pool_pages); - seq_printf(m, "pool_low : %lu\n", pool_low); - seq_printf(m, "pool_used : %lu\n", pool_used); - seq_printf(m, "pool_failed : %lu\n", pool_failed); - - return 0; } -static int dpa_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, dpa_show, NULL); -} - -static const struct file_operations dpa_fops = { - .open = dpa_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init debug_pagealloc_proc_init(void) -{ - struct dentry *de; - - de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, - &dpa_fops); - if (!de) - return -ENOMEM; - - return 0; -} -__initcall(debug_pagealloc_proc_init); -#endif - #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d4585077977a..738fd0f24958 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -7,22 +7,24 @@ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. */ -#include <linux/mm.h> +#include <linux/seq_file.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/gfp.h> +#include <linux/mm.h> #include <linux/fs.h> -#include <linux/bootmem.h> -#include <asm/msr.h> -#include <asm/tlbflush.h> +#include <asm/cacheflush.h> #include <asm/processor.h> -#include <asm/page.h> +#include <asm/tlbflush.h> #include <asm/pgtable.h> -#include <asm/pat.h> -#include <asm/e820.h> -#include <asm/cacheflush.h> #include <asm/fcntl.h> +#include <asm/e820.h> #include <asm/mtrr.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/pat.h> #include <asm/io.h> #ifdef CONFIG_X86_PAT @@ -44,6 +46,7 @@ early_param("nopat", nopat); static int debug_enable; + static int __init pat_debug_setup(char *str) { debug_enable = 1; @@ -143,14 +146,14 @@ static char *cattr_name(unsigned long flags) */ struct memtype { - u64 start; - u64 end; - unsigned long type; - struct list_head nd; + u64 start; + u64 end; + unsigned long type; + struct list_head nd; }; static LIST_HEAD(memtype_list); -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ /* * Does intersection of PAT memory type and MTRR memory type and returns @@ -178,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } -static int chk_conflict(struct memtype *new, struct memtype *entry, - unsigned long *type) +static int +chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) { if (new->type != entry->type) { if (type) { @@ -205,6 +208,69 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, return -EBUSY; } +static struct memtype *cached_entry; +static u64 cached_start; + +/* + * For RAM pages, mark the pages as non WB memory type using + * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or + * set_memory_wc() on a RAM page at a time before marking it as WB again. + * This is ok, because only one driver will be owning the page and + * doing set_memory_*() calls. + * + * For now, we use PageNonWB to track that the RAM page is being mapped + * as non WB. In future, we will have to use one more flag + * (or some other mechanism in page_struct) to distinguish between + * UC and WC mapping. 
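+ *
+ * Illustration of the intended usage (vaddr/nrpages are made up):
+ *
+ *   set_memory_uc(vaddr, nrpages);   marks the RAM pages PageNonWB
+ *   ... device uses the buffer uncached ...
+ *   set_memory_wb(vaddr, nrpages);   clears PageNonWB again
+ *
+ * A second set_memory_uc()/set_memory_wc() on the same pages before
+ * the set_memory_wb() fails in reserve_ram_pages_type(), enforcing
+ * the one-owner rule described above.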
+ */ +static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct page *page; + u64 pfn, end_pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + if (page_mapped(page) || PageNonWB(page)) + goto out; + + SetPageNonWB(page); + } + return 0; + +out: + end_pfn = pfn; + for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + page = pfn_to_page(pfn); + ClearPageNonWB(page); + } + + return -EINVAL; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn, end_pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + if (page_mapped(page) || !PageNonWB(page)) + goto out; + + ClearPageNonWB(page); + } + return 0; + +out: + end_pfn = pfn; + for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + page = pfn_to_page(pfn); + SetPageNonWB(page); + } + return -EINVAL; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -221,14 +287,15 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, * it will return a negative return value. */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) + unsigned long *new_type) { struct memtype *new, *entry; unsigned long actual_type; struct list_head *where; + int is_range_ram; int err = 0; - BUG_ON(start >= end); /* end is exclusive */ + BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled) { /* This is identical to page table setting without PAT */ @@ -261,28 +328,41 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, actual_type = _PAGE_CACHE_WB; else actual_type = _PAGE_CACHE_UC_MINUS; - } else + } else { actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + } + + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return reserve_ram_pages_type(start, end, req_type, new_type); + else if (is_range_ram < 0) + return -EINVAL; new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) return -ENOMEM; - new->start = start; - new->end = end; - new->type = actual_type; + new->start = start; + new->end = end; + new->type = actual_type; if (new_type) *new_type = actual_type; spin_lock(&memtype_lock); + if (cached_entry && start >= cached_start) + entry = cached_entry; + else + entry = list_entry(&memtype_list, struct memtype, nd); + /* Search for existing mapping that overlaps the current range */ where = NULL; - list_for_each_entry(entry, &memtype_list, nd) { + list_for_each_entry_continue(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; + cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -290,6 +370,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; + cached_entry = list_entry(where, + struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -297,7 +379,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - where = &entry->nd; + cached_entry = list_entry(entry->nd.prev, + struct memtype, nd); + + /* + * Move to right position in the linked + * list to add this new entry + */ + list_for_each_entry_continue(entry, + &memtype_list, nd) { + if (start <= entry->start) { + 
where = entry->nd.prev; + break; + } + } } break; } @@ -309,9 +404,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, start, end, cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); + return err; } + cached_start = start; + if (where) list_add(&new->nd, where); else @@ -330,6 +428,7 @@ int free_memtype(u64 start, u64 end) { struct memtype *entry; int err = -EINVAL; + int is_range_ram; if (!pat_enabled) return 0; @@ -338,9 +437,18 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + else if (is_range_ram < 0) + return -EINVAL; + spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { + if (cached_entry == entry || cached_start == start) + cached_entry = NULL; + list_del(&entry->nd); kfree(entry); err = 0; @@ -355,26 +463,19 @@ int free_memtype(u64 start, u64 end) } dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + return err; } -/* - * /dev/mem mmap interface. The memtype used for mapping varies: - * - Use UC for mappings with O_SYNC flag - * - Without O_SYNC flag, if there is any conflict in reserve_memtype, - * inherit the memtype from existing mapping. - * - Else use UC_MINUS memtype (for backward compatibility with existing - * X drivers. - */ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { return vma_prot; } -#ifdef CONFIG_NONPROMISC_DEVMEM -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ +#ifdef CONFIG_STRICT_DEVMEM +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { return 1; @@ -398,20 +499,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) } return 1; } -#endif /* CONFIG_NONPROMISC_DEVMEM */ +#endif /* CONFIG_STRICT_DEVMEM */ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { u64 offset = ((u64) pfn) << PAGE_SHIFT; - unsigned long flags = _PAGE_CACHE_UC_MINUS; + unsigned long flags = -1; int retval; if (!range_is_allowed(pfn, size)) return 0; if (file->f_flags & O_SYNC) { - flags = _PAGE_CACHE_UC; + flags = _PAGE_CACHE_UC_MINUS; } #ifdef CONFIG_X86_32 @@ -434,13 +535,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, #endif /* - * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. 
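+ * ('flags' above is initialized to -1 as a "no explicit request"
+ * sentinel; only O_SYNC turns it into _PAGE_CACHE_UC_MINUS. With the
+ * sentinel still set, reserve_memtype() is called with req_type -1
+ * below, picks the type itself and hands the inherited one back
+ * through &flags.)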
+ * * Without O_SYNC, we want to get * - WB for WB-able memory and no other conflicting mappings * - UC_MINUS for non-WB-able memory with no other conflicting mappings * - Inherit from confliting mappings otherwise */ - if (flags != _PAGE_CACHE_UC_MINUS) { + if (flags != -1) { retval = reserve_memtype(offset, offset + size, flags, NULL); } else { retval = reserve_memtype(offset, offset + size, -1, &flags); @@ -468,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); u64 addr = (u64)pfn << PAGE_SHIFT; unsigned long flags; - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); reserve_memtype(addr, addr + size, want_flags, &flags); if (flags != want_flags) { @@ -489,3 +591,91 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) + +/* get Nth element of the linked list */ +static struct memtype *memtype_get_idx(loff_t pos) +{ + struct memtype *list_node, *print_entry; + int i = 1; + + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!print_entry) + return NULL; + + spin_lock(&memtype_lock); + list_for_each_entry(list_node, &memtype_list, nd) { + if (pos == i) { + *print_entry = *list_node; + spin_unlock(&memtype_lock); + return print_entry; + } + ++i; + } + spin_unlock(&memtype_lock); + kfree(print_entry); + + return NULL; +} + +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) { + ++*pos; + seq_printf(seq, "PAT memtype list:\n"); + } + + return memtype_get_idx(*pos); +} + +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return memtype_get_idx(*pos); +} + +static void memtype_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int memtype_seq_show(struct seq_file *seq, void *v) +{ + struct memtype *print_entry = (struct memtype *)v; + + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), + print_entry->start, print_entry->end); + kfree(print_entry); + + return 0; +} + +static struct seq_operations memtype_seq_ops = { + .start = memtype_seq_start, + .next = memtype_seq_next, + .stop = memtype_seq_stop, + .show = memtype_seq_show, +}; + +static int memtype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &memtype_seq_ops); +} + +static const struct file_operations memtype_fops = { + .open = memtype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init pat_memtype_list_init(void) +{ + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, + NULL, &memtype_fops); + return 0; +} + +late_initcall(pat_memtype_list_init); + +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index efa1911e20ca..df3d5c861cda 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 }; static unsigned int mw64[] = { 0x89, 0x8B }; #endif /* not __i386__ */ -static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, - int *rexr) +struct prefix_bits { + unsigned shorted:1; + unsigned enlarged:1; + unsigned rexr:1; + unsigned rex:1; +}; + +static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) { int i; unsigned char *p = addr; - *shorted = 0; - *enlarged = 0; - *rexr = 0; + 
prf->shorted = 0; + prf->enlarged = 0; + prf->rexr = 0; + prf->rex = 0; restart: for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { if (*p == prefix_codes[i]) { if (*p == 0x66) - *shorted = 1; + prf->shorted = 1; #ifdef __amd64__ if ((*p & 0xf8) == 0x48) - *enlarged = 1; + prf->enlarged = 1; if ((*p & 0xf4) == 0x44) - *rexr = 1; + prf->rexr = 1; + if ((*p & 0xf0) == 0x40) + prf->rex = 1; #endif p++; goto restart; @@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int shorted, enlarged, rexr; + struct prefix_bits prf; int i; enum reason_type rv = OTHERS; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); CHECK_OP_TYPE(opcode, reg_rop, REG_READ); @@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(rw8); i++) @@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(rw32); i++) if (rw32[i] == opcode) - return (shorted ? 2 : (enlarged ? 8 : 4)); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(mw8); i++) @@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(mw32); i++) if (mw32[i] == opcode) - return shorted ? 2 : 4; + return prf.shorted ? 2 : 4; for (i = 0; i < ARRAY_SIZE(mw64); i++) if (mw64[i] == opcode) - return shorted ? 2 : (enlarged ? 8 : 4); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -238,7 +249,7 @@ enum { #endif }; -static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) { unsigned char *rv = NULL; @@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) case arg_DL: rv = (unsigned char *)®s->dx; break; - case arg_AH: - rv = 1 + (unsigned char *)®s->ax; - break; - case arg_BH: - rv = 1 + (unsigned char *)®s->bx; - break; - case arg_CH: - rv = 1 + (unsigned char *)®s->cx; - break; - case arg_DH: - rv = 1 + (unsigned char *)®s->dx; - break; #ifdef __amd64__ case arg_R8: rv = (unsigned char *)®s->r8; @@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) break; #endif default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); break; } + + if (rv) + return rv; + + if (rex) { + /* + * If REX prefix exists, access low bytes of SI etc. + * instead of AH etc. 
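+ * (x86-64 encoding background: once any REX prefix is present, byte
+ * register numbers 4-7 encode SPL/BPL/SIL/DIL rather than
+ * AH/CH/DH/BH, so the low byte of regs->si and friends is the right
+ * answer here.)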
+ */ + switch (no) { + case arg_SI: + rv = (unsigned char *)®s->si; + break; + case arg_DI: + rv = (unsigned char *)®s->di; + break; + case arg_BP: + rv = (unsigned char *)®s->bp; + break; + case arg_SP: + rv = (unsigned char *)®s->sp; + break; + default: + break; + } + } else { + switch (no) { + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; + default: + break; + } + } + + if (!rv) + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + return rv; } @@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) unsigned char mod_rm; int reg; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(reg_rop); i++) if (reg_rop[i] == opcode) { @@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) do_work: mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); switch (get_ins_reg_width(ins_addr)) { case 1: - return *get_reg_w8(reg, regs); + return *get_reg_w8(reg, prf.rex, regs); case 2: return *(unsigned short *)get_reg_w32(reg, regs); @@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) unsigned char mod_rm; unsigned char mod; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(imm_wop); i++) if (imm_wop[i] == opcode) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 557b2abceef8..86f2ffc43c3d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(void *p) +static void pgd_ctor(pgd_t *pgd) { - pgd_t *pgd = p; - /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. 
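	   (Net effect, sketched: the pgd entries from KERNEL_PGD_BOUNDARY
	   upwards in every new pgd alias swapper_pg_dir, so the kernel
	   half of the address space stays shared across all mms.)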
*/ @@ -87,7 +85,7 @@ static void pgd_ctor(void *p) pgd_list_add(pgd); } -static void pgd_dtor(void *pgd) +static void pgd_dtor(pgd_t *pgd) { unsigned long flags; /* can be called from interrupt context */ @@ -207,6 +205,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) unsigned long addr; int i; + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ + return; + pud = pud_offset(pgd, 0); for (addr = i = 0; i < PREALLOCATED_PMDS; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b4becbf8c570..0951db9ee519 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -20,53 +20,6 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> -void show_mem(void) -{ - int total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - unsigned long flags; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - for_each_online_pgdat(pgdat) { - pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - page = pgdat_page_nr(pgdat, i); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - pgdat_resize_unlock(pgdat, &flags); - } - printk(KERN_INFO "%d pages of RAM\n", total); - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); - printk(KERN_INFO "%d reserved pages\n", reserved); - printk(KERN_INFO "%d pages shared\n", shared); - printk(KERN_INFO "%d pages swap cached\n", cached); - - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); - printk(KERN_INFO "%lu pages writeback\n", - global_page_state(NR_WRITEBACK)); - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); - printk(KERN_INFO "%lu pages slab\n", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); - printk(KERN_INFO "%lu pages pagetables\n", - global_page_state(NR_PAGETABLE)); -} - /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. @@ -170,7 +123,8 @@ static int __init parse_vmalloc(char *arg) if (!arg) return -EINVAL; - __VMALLOC_RESERVE = memparse(arg, &arg); + /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ + __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; return 0; } early_param("vmalloc", parse_vmalloc); diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 1eb2973a301c..16ae70fc57e7 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void) * start of the node, and that the current "end" address is after * the previous one. */ -static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) +static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) { /* * Only add present memory as told by the e820. 
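 * (The void -> int conversion of node_read_chunk() pairs with the
 * caller change below: a non-zero return makes get_memcfg_from_srat()
 * skip e820_register_active_regions() for rejected chunks.)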
@@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c if (memory_chunk->start_pfn >= max_pfn) { printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", memory_chunk->start_pfn, memory_chunk->end_pfn); - return; + return -1; } if (memory_chunk->nid != nid) - return; + return -1; if (!node_has_online_mem(nid)) node_start_pfn[nid] = memory_chunk->start_pfn; @@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c if (node_end_pfn[nid] < memory_chunk->end_pfn) node_end_pfn[nid] = memory_chunk->end_pfn; + + return 0; } int __init get_memcfg_from_srat(void) @@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void) printk(KERN_DEBUG "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - node_read_chunk(chunk->nid, chunk); + if (node_read_chunk(chunk->nid, chunk)) + continue; + e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 1b4763e26ea9..51c0a2fc14fe 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) return; } - if (is_uv_system()) + if (get_uv_system_type() >= UV_X2APIC) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index d877c5b423ef..ab50a8d7402c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -3,6 +3,7 @@ */ #include <linux/module.h> #include <linux/io.h> +#include <linux/mmiotrace.h> #define MODULE_NAME "testmmiotrace" @@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); static void do_write_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) iowrite8(i, p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -39,6 +42,7 @@ static void do_test(void) pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } + mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); iounmap(p); diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 30f3eb366667..446902b2a6b6 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile @@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ +oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ op_model_ppro.o op_model_p4.o oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index e2095cba409f..04df67f8a7ba 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -52,8 +52,7 @@ struct frame_head { unsigned long ret; } __attribute__((packed)); -static struct frame_head * -dump_user_backtrace(struct frame_head * head) +static struct frame_head *dump_user_backtrace(struct frame_head *head) { struct frame_head bufhead[2]; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7f3329b55d2e..022cd41ea9b4 100644 --- 
a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,10 +1,11 @@ /** * @file nmi_int.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2008 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> + * @author Robert Richter <robert.richter@amd.com> */ #include <linux/init.h> @@ -15,6 +16,7 @@ #include <linux/slab.h> #include <linux/moduleparam.h> #include <linux/kdebug.h> +#include <linux/cpu.h> #include <asm/nmi.h> #include <asm/msr.h> #include <asm/apic.h> @@ -26,60 +28,9 @@ static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); -static int nmi_start(void); -static void nmi_stop(void); - /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; -#ifdef CONFIG_PM - -static int nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - if (nmi_enabled == 1) - nmi_stop(); - return 0; -} - -static int nmi_resume(struct sys_device *dev) -{ - if (nmi_enabled == 1) - nmi_start(); - return 0; -} - -static struct sysdev_class oprofile_sysclass = { - .name = "oprofile", - .resume = nmi_resume, - .suspend = nmi_suspend, -}; - -static struct sys_device device_oprofile = { - .id = 0, - .cls = &oprofile_sysclass, -}; - -static int __init init_sysfs(void) -{ - int error; - - error = sysdev_class_register(&oprofile_sysclass); - if (!error) - error = sysdev_register(&device_oprofile); - return error; -} - -static void exit_sysfs(void) -{ - sysdev_unregister(&device_oprofile); - sysdev_class_unregister(&oprofile_sysclass); -} - -#else -#define init_sysfs() do { } while (0) -#define exit_sysfs() do { } while (0) -#endif /* CONFIG_PM */ - static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -269,10 +220,12 @@ static void nmi_cpu_shutdown(void *dummy) static void nmi_shutdown(void) { - struct op_msrs *msrs = &get_cpu_var(cpu_msrs); + struct op_msrs *msrs; + nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); + msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); put_cpu_var(cpu_msrs); @@ -332,6 +285,77 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } +#ifdef CONFIG_SMP +static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, + void *data) +{ + int cpu = (unsigned long)data; + switch (action) { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block oprofile_cpu_nb = { + .notifier_call = oprofile_cpu_notifier +}; +#endif + +#ifdef CONFIG_PM + +static int nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* Only one CPU left, just stop that one */ + if (nmi_enabled == 1) + nmi_cpu_stop(NULL); + return 0; +} + +static int nmi_resume(struct sys_device *dev) +{ + if (nmi_enabled == 1) + nmi_cpu_start(NULL); + return 0; +} + +static struct sysdev_class oprofile_sysclass = { + .name = "oprofile", + .resume = nmi_resume, + .suspend = nmi_suspend, +}; + +static struct sys_device device_oprofile = { + .id = 0, + .cls = &oprofile_sysclass, +}; + +static int __init init_sysfs(void) +{ + int error; + + error = sysdev_class_register(&oprofile_sysclass); + if (!error) + error = sysdev_register(&device_oprofile); + return error; +} + 
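+/*
+ * (Teardown mirrors init_sysfs(): the sysdev is unregistered before
+ * its class, i.e. in reverse registration order.)
+ */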
+static void exit_sysfs(void) +{ + sysdev_unregister(&device_oprofile); + sysdev_class_unregister(&oprofile_sysclass); +} + +#else +#define init_sysfs() do { } while (0) +#define exit_sysfs() do { } while (0) +#endif /* CONFIG_PM */ + static int p4force; module_param(p4force, int, 0); @@ -369,26 +393,47 @@ static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; - if (cpu_model == 14) + switch (cpu_model) { + case 0 ... 2: + *cpu_type = "i386/ppro"; + break; + case 3 ... 5: + *cpu_type = "i386/pii"; + break; + case 6 ... 8: + *cpu_type = "i386/piii"; + break; + case 9: + *cpu_type = "i386/p6_mobile"; + break; + case 10 ... 13: + *cpu_type = "i386/p6"; + break; + case 14: *cpu_type = "i386/core"; - else if (cpu_model == 15 || cpu_model == 23) + break; + case 15: case 23: *cpu_type = "i386/core_2"; - else if (cpu_model > 0xd) + break; + default: + /* Unknown */ return 0; - else if (cpu_model == 9) { - *cpu_type = "i386/p6_mobile"; - } else if (cpu_model > 5) { - *cpu_type = "i386/piii"; - } else if (cpu_model > 2) { - *cpu_type = "i386/pii"; - } else { - *cpu_type = "i386/ppro"; } model = &op_ppro_spec; return 1; } +static int __init arch_perfmon_init(char **cpu_type) +{ + if (!cpu_has_arch_perfmon) + return 0; + *cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; + arch_perfmon_setup_counters(); + return 1; +} + /* in order to get sysfs right */ static int using_nmi; @@ -396,7 +441,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; __u8 family = boot_cpu_data.x86; - char *cpu_type; + char *cpu_type = NULL; + int ret = 0; if (!cpu_has_apic) return -ENODEV; @@ -409,19 +455,23 @@ int __init op_nmi_init(struct oprofile_operations *ops) default: return -ENODEV; case 6: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_athlon_spec; + model = &op_amd_spec; /* Actually it could be i386/hammer too, but give user space an consistent name. 
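	   (user space oprofile selects its event table by this cpu_type
	   string, hence one consistent name for every kernel flavour on K8)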
*/ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "x86-64/family10"; break; + case 0x11: + model = &op_amd_spec; + cpu_type = "x86-64/family11h"; + break; } break; @@ -429,39 +479,56 @@ int __init op_nmi_init(struct oprofile_operations *ops) switch (family) { /* Pentium IV */ case 0xf: - if (!p4_init(&cpu_type)) - return -ENODEV; + p4_init(&cpu_type); break; /* A P6-class processor */ case 6: - if (!ppro_init(&cpu_type)) - return -ENODEV; + ppro_init(&cpu_type); break; default: - return -ENODEV; + break; } + + if (!cpu_type && !arch_perfmon_init(&cpu_type)) + return -ENODEV; break; default: return -ENODEV; } - init_sysfs(); - using_nmi = 1; +#ifdef CONFIG_SMP + register_cpu_notifier(&oprofile_cpu_nb); +#endif + /* default values, can be overwritten by model */ ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; + + if (model->init) + ret = model->init(ops); + if (ret) + return ret; + + init_sysfs(); + using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } void op_nmi_exit(void) { - if (using_nmi) + if (using_nmi) { exit_sysfs(); +#ifdef CONFIG_SMP + unregister_cpu_notifier(&oprofile_cpu_nb); +#endif + } + if (model->exit) + model->exit(); } diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 2880b15c4675..91b6a116165e 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -6,22 +6,22 @@ * * @author John Levon */ - + #ifndef OP_COUNTER_H #define OP_COUNTER_H - + #define OP_MAX_COUNTER 8 - + /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { - unsigned long count; - unsigned long enabled; - unsigned long event; - unsigned long kernel; - unsigned long user; - unsigned long unit_mask; + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long kernel; + unsigned long user; + unsigned long unit_mask; }; extern struct op_counter_config counter_config[]; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c new file mode 100644 index 000000000000..509513760a6e --- /dev/null +++ b/arch/x86/oprofile/op_model_amd.c @@ -0,0 +1,546 @@ +/* + * @file op_model_amd.c + * athlon / K7 / K8 / Family 10h model-specific MSR operations + * + * @remark Copyright 2002-2008 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Robert Richter <robert.richter@amd.com> + * @author Barry Kasindorf +*/ + +#include <linux/oprofile.h> +#include <linux/device.h> +#include <linux/pci.h> + +#include <asm/ptrace.h> +#include <asm/msr.h> +#include <asm/nmi.h> + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_COUNTERS 4 +#define NUM_CONTROLS 4 + +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) +#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) +#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) + +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 
1 : 0) +#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_CLEAR_LO(x) (x &= (1<<21)) +#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) +#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) +#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) +#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) +#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) + +static unsigned long reset_value[NUM_COUNTERS]; + +#ifdef CONFIG_OPROFILE_IBS + +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ +#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ +#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ + +/* IbsOpCtl bits */ +#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ +#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ + +/* Codes used in cpu_buffer.c */ +/* This produces duplicate code, need to be fixed */ +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 + +/* The function interface needs to be fixed, something like add + data. Should then be added to linux/oprofile.h. */ +extern void +oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int *const ibs_sample, int ibs_code); + +struct ibs_fetch_sample { + /* MSRC001_1031 IBS Fetch Linear Address Register */ + unsigned int ibs_fetch_lin_addr_low; + unsigned int ibs_fetch_lin_addr_high; + /* MSRC001_1030 IBS Fetch Control Register */ + unsigned int ibs_fetch_ctl_low; + unsigned int ibs_fetch_ctl_high; + /* MSRC001_1032 IBS Fetch Physical Address Register */ + unsigned int ibs_fetch_phys_addr_low; + unsigned int ibs_fetch_phys_addr_high; +}; + +struct ibs_op_sample { + /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ + unsigned int ibs_op_rip_low; + unsigned int ibs_op_rip_high; + /* MSRC001_1035 IBS Op Data Register */ + unsigned int ibs_op_data1_low; + unsigned int ibs_op_data1_high; + /* MSRC001_1036 IBS Op Data 2 Register */ + unsigned int ibs_op_data2_low; + unsigned int ibs_op_data2_high; + /* MSRC001_1037 IBS Op Data 3 Register */ + unsigned int ibs_op_data3_low; + unsigned int ibs_op_data3_high; + /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ + unsigned int ibs_dc_linear_low; + unsigned int ibs_dc_linear_high; + /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ + unsigned int ibs_dc_phys_low; + unsigned int ibs_dc_phys_high; +}; + +/* + * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h+ +*/ +static void clear_ibs_nmi(void); + +static int ibs_allowed; /* AMD Family10h and later */ + +struct op_ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; +}; + +static struct op_ibs_config ibs_config; + +#endif + +/* functions for op_amd_spec */ + +static void op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_COUNTERS; i++) { + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + else + msrs->counters[i].addr = 0; + } + + for (i = 0; i < NUM_CONTROLS; i++) { +
if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + else + msrs->controls[i].addr = 0; + } +} + + +static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* clear all counters */ + for (i = 0 ; i < NUM_CONTROLS; ++i) { + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_WRITE(low, high, msrs, i); + } + + /* avoid a false detection of ctr overflows in NMI handler */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if (unlikely(!CTR_IS_RESERVED(msrs, i))) + continue; + CTR_WRITE(1, msrs, i); + } + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); + + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_SET_ENABLE(low); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[i].event); + CTRL_SET_EVENT_HIGH(high, counter_config[i].event); + CTRL_SET_HOST_ONLY(high, 0); + CTRL_SET_GUEST_ONLY(high, 0); + + CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; + } + } +} + +#ifdef CONFIG_OPROFILE_IBS + +static inline int +op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + struct ibs_fetch_sample ibs_fetch; + struct ibs_op_sample ibs_op; + + if (!ibs_allowed) + return 1; + + if (ibs_config.fetch_enabled) { + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + if (high & IBS_FETCH_HIGH_VALID_BIT) { + ibs_fetch.ibs_fetch_ctl_high = high; + ibs_fetch.ibs_fetch_ctl_low = low; + rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); + ibs_fetch.ibs_fetch_lin_addr_high = high; + ibs_fetch.ibs_fetch_lin_addr_low = low; + rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); + ibs_fetch.ibs_fetch_phys_addr_high = high; + ibs_fetch.ibs_fetch_phys_addr_low = low; + + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_fetch, + IBS_FETCH_BEGIN); + + /*reenable the IRQ */ + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + high &= ~IBS_FETCH_HIGH_VALID_BIT; + high |= IBS_FETCH_HIGH_ENABLE; + low &= IBS_FETCH_LOW_MAX_CNT_MASK; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + } + + if (ibs_config.op_enabled) { + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + if (low & IBS_OP_LOW_VALID_BIT) { + rdmsr(MSR_AMD64_IBSOPRIP, low, high); + ibs_op.ibs_op_rip_low = low; + ibs_op.ibs_op_rip_high = high; + rdmsr(MSR_AMD64_IBSOPDATA, low, high); + ibs_op.ibs_op_data1_low = low; + ibs_op.ibs_op_data1_high = high; + rdmsr(MSR_AMD64_IBSOPDATA2, low, high); + ibs_op.ibs_op_data2_low = low; + ibs_op.ibs_op_data2_high = high; + rdmsr(MSR_AMD64_IBSOPDATA3, low, high); + ibs_op.ibs_op_data3_low = low; + ibs_op.ibs_op_data3_high = high; + rdmsr(MSR_AMD64_IBSDCLINAD, low, high); + ibs_op.ibs_dc_linear_low = low; + ibs_op.ibs_dc_linear_high = high; + rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); + ibs_op.ibs_dc_phys_low = low; + ibs_op.ibs_dc_phys_high = high; + + /* reenable the IRQ */ + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_op, + IBS_OP_BEGIN); + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + high = 0; + low &= ~IBS_OP_LOW_VALID_BIT; + low |= IBS_OP_LOW_ENABLE; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } + } + + return 1; +} + +#endif + +static int op_amd_check_ctrs(struct pt_regs * const 
regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) + continue; + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + op_amd_handle_ibs(regs, msrs); +#endif + + /* See op_model_ppro.c */ + return 1; +} + +static void op_amd_start(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (reset_value[i]) { + CTRL_READ(low, high, msrs, i); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + + IBS_FETCH_HIGH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + + IBS_OP_LOW_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif +} + + +static void op_amd_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* Subtle: stop on all counters to avoid race with + * setting our pm callback */ + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (!reset_value[i]) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && ibs_config.fetch_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif +} + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (CTR_IS_RESERVED(msrs, i)) + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + } + for (i = 0 ; i < NUM_CONTROLS ; ++i) { + if (CTRL_IS_RESERVED(msrs, i)) + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +#ifndef CONFIG_OPROFILE_IBS + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#else + +static u8 ibs_eilvt_off; + +static inline void apic_init_ibs_nmi_per_cpu(void *arg) +{ + ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); +} + +static inline void apic_clear_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); +} + +static int pfm_amd64_setup_eilvt(void) +{ +#define IBSCTL_LVTOFFSETVAL (1 << 8) +#define IBSCTL 0x1cc + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + /* per CPU setup */ + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVTOFFSETVAL); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x", value); + return 1; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS"); + return 1; + } + +#ifdef CONFIG_NUMA + /* Sanity check */ + /* Works only for 64bit with proper numa 
implementation. */ + if (nodes != num_possible_nodes()) { + printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " + "found: %d, expected %d", + nodes, num_possible_nodes()); + return 1; + } +#endif + return 0; +} + +/* + * initialize the APIC for the IBS interrupts + * if available (AMD Family10h rev B0 and later) + */ +static void setup_ibs(void) +{ + ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + + if (!ibs_allowed) + return; + + if (pfm_amd64_setup_eilvt()) { + ibs_allowed = 0; + return; + } + + printk(KERN_INFO "oprofile: AMD IBS detected\n"); +} + + +/* + * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h + * rev B0 and later */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + +static int (*create_arch_files)(struct super_block *sb, struct dentry *root); + +static int setup_ibs_files(struct super_block *sb, struct dentry *root) +{ + struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(sb, root); + + if (ret) + return ret; + + if (!ibs_allowed) + return ret; + + /* model specific files */ + + /* setup some reasonable defaults */ + ibs_config.max_cnt_fetch = 250000; + ibs_config.fetch_enabled = 0; + ibs_config.max_cnt_op = 250000; + ibs_config.op_enabled = 0; + ibs_config.dispatched_ops = 1; + + dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + + dir = oprofilefs_mkdir(sb, root, "ibs_op"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); + + return 0; +} + +static int op_amd_init(struct oprofile_operations *ops) +{ + setup_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; + return 0; +} + +static void op_amd_exit(void) +{ + clear_ibs_nmi(); +} + +#endif + +struct op_x86_model_spec const op_amd_spec = { + .init = op_amd_init, + .exit = op_amd_exit, + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown +}; diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c deleted file mode 100644 index 3d534879a9dc..000000000000 --- a/arch/x86/oprofile/op_model_athlon.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * @file op_model_athlon.h - * athlon / K7 / K8 / Family 10h model-specific MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - */ - -#include <linux/oprofile.h> -#include <asm/ptrace.h> -#include <asm/msr.h> -#include <asm/nmi.h> - -#include "op_x86_model.h" -#include "op_counter.h" - -#define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 - -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ?
1 : 0) -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) - -static unsigned long reset_value[NUM_COUNTERS]; - -static void athlon_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; - } - - for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; - } -} - - -static void athlon_setup_ctrs(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) - continue; - CTR_WRITE(1, msrs, i); - } - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = counter_config[i].count; - - CTR_WRITE(counter_config[i].count, msrs, i); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - - CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; - } - } -} - - -static int athlon_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) - continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } - } - - /* See op_model_ppro.c */ - return 1; -} - - -static void athlon_start(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - } -} - - 
-static void athlon_stop(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } -} - -static void athlon_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -struct op_x86_model_spec const op_athlon_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &athlon_fill_in_addresses, - .setup_ctrs = &athlon_setup_ctrs, - .check_ctrs = &athlon_check_ctrs, - .start = &athlon_start, - .stop = &athlon_stop, - .shutdown = &athlon_shutdown -}; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 56b4757a1f47..4c4a51c90bc2 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -10,11 +10,12 @@ #include <linux/oprofile.h> #include <linux/smp.h> +#include <linux/ptrace.h> +#include <linux/nmi.h> #include <asm/msr.h> -#include <asm/ptrace.h> #include <asm/fixmap.h> #include <asm/apic.h> -#include <asm/nmi.h> + #include "op_x86_model.h" #include "op_counter.h" @@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT; static inline void setup_num_counters(void) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2){ + if (smp_num_siblings == 2) { num_counters = NUM_COUNTERS_HT2; num_controls = NUM_CONTROLS_HT2; } @@ -86,7 +87,7 @@ struct p4_event_binding { #define CTR_FLAME_2 (1 << 6) #define CTR_IQ_5 (1 << 7) -static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { +static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = { { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, @@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } }; -#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT +#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT) /* p4 event codes in libop/op_event.h are indices into this table. 
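 * Each table entry carries the event-select and ESCR-select codes plus
 * up to two (counter, ESCR) bindings; VIRT_CTR() and get_stagger()
 * below choose between the two bindings on HyperThreaded CPUs.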
*/ static struct p4_event_binding p4_events[NUM_EVENTS] = { - + { /* BRANCH_RETIRED */ - 0x05, 0x06, + 0x05, 0x06, { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, {CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, - + { /* MISPRED_BRANCH_RETIRED */ - 0x04, 0x03, + 0x04, 0x03, { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, - + { /* TC_DELIVER_MODE */ 0x01, 0x01, - { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, { CTR_MS_2, MSR_P4_TC_ESCR1} } }, - + { /* BPU_FETCH_REQUEST */ - 0x00, 0x03, + 0x00, 0x03, { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, { CTR_BPU_2, MSR_P4_BPU_ESCR1} } }, @@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* LOAD_PORT_REPLAY */ - 0x02, 0x04, + 0x02, 0x04, { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } }, @@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* BSQ_CACHE_REFERENCE */ - 0x07, 0x0c, + 0x07, 0x0c, { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, { CTR_BPU_2, MSR_P4_BSU_ESCR1} } }, { /* IOQ_ALLOCATION */ - 0x06, 0x03, + 0x06, 0x03, { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { 0, 0 } } }, { /* IOQ_ACTIVE_ENTRIES */ - 0x06, 0x1a, + 0x06, 0x1a, { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, { 0, 0 } } }, { /* FSB_DATA_ACTIVITY */ - 0x06, 0x17, + 0x06, 0x17, { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { CTR_BPU_2, MSR_P4_FSB_ESCR1} } }, { /* BSQ_ALLOCATION */ - 0x07, 0x05, + 0x07, 0x05, { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, { 0, 0 } } }, { /* BSQ_ACTIVE_ENTRIES */ 0x07, 0x06, - { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, { 0, 0 } } }, { /* X87_ASSIST */ - 0x05, 0x03, + 0x05, 0x03, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, @@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* PACKED_SP_UOP */ - 0x01, 0x08, + 0x01, 0x08, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* PACKED_DP_UOP */ - 0x01, 0x0c, + 0x01, 0x0c, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* SCALAR_SP_UOP */ - 0x01, 0x0a, + 0x01, 0x0a, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, @@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* 64BIT_MMX_UOP */ - 0x01, 0x02, + 0x01, 0x02, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* 128BIT_MMX_UOP */ - 0x01, 0x1a, + 0x01, 0x1a, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* X87_FP_UOP */ - 0x01, 0x04, + 0x01, 0x04, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* X87_SIMD_MOVES_UOP */ - 0x01, 0x2e, + 0x01, 0x2e, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* MACHINE_CLEAR */ - 0x05, 0x02, + 0x05, 0x02, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, @@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { CTR_BPU_2, MSR_P4_FSB_ESCR1} } }, - + { /* TC_MS_XFER */ - 0x00, 0x05, + 0x00, 0x05, { { CTR_MS_0, MSR_P4_MS_ESCR0}, { CTR_MS_2, MSR_P4_MS_ESCR1} } }, @@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* INSTR_RETIRED */ - 0x04, 0x02, + 0x04, 0x02, { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, @@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, - { /* UOP_TYPE */ - 0x02, 
0x02, + { /* UOP_TYPE */ + 0x02, 0x02, { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, { CTR_IQ_5, MSR_P4_RAT_ESCR1} } }, { /* RETIRED_MISPRED_BRANCH_TYPE */ - 0x02, 0x05, + 0x02, 0x05, { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, { CTR_MS_2, MSR_P4_TBPU_ESCR1} } }, @@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) -#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) -#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) +#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) +#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) #define CCCR_RESERVED_BITS 0x38030FFF #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) @@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) -#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) +#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) +#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) -#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 
1 : 0) +#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) +#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) @@ -380,7 +381,7 @@ static unsigned int get_stagger(void) #ifdef CONFIG_SMP int cpu = smp_processor_id(); return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); -#endif +#endif return 0; } @@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT]; static void p4_fill_in_addresses(struct op_msrs * const msrs) { - unsigned int i; + unsigned int i; unsigned int addr, cccraddr, stag; setup_num_counters(); stag = get_stagger(); /* initialize some registers */ - for (i = 0; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) msrs->counters[i].addr = 0; - } - for (i = 0; i < num_controls; ++i) { + for (i = 0; i < num_controls; ++i) msrs->controls[i].addr = 0; - } - + /* the counter & cccr registers we pay attention to */ for (i = 0; i < num_counters; ++i) { addr = p4_counters[VIRT_CTR(stag, i)].counter_address; cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; - if (reserve_perfctr_nmi(addr)){ + if (reserve_perfctr_nmi(addr)) { msrs->counters[i].addr = addr; msrs->controls[i].addr = cccraddr; } @@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) if (reserve_evntsel_nmi(addr)) msrs->controls[i].addr = addr; } - + for (addr = MSR_P4_MS_ESCR0 + stag; - addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { + addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { if (reserve_evntsel_nmi(addr)) msrs->controls[i].addr = addr; } - + for (addr = MSR_P4_IX_ESCR0 + stag; - addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { + addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { if (reserve_evntsel_nmi(addr)) msrs->controls[i].addr = addr; } /* there are 2 remaining non-contiguously located ESCRs */ - if (num_counters == NUM_COUNTERS_NON_HT) { + if (num_counters == NUM_COUNTERS_NON_HT) { /* standard non-HT CPUs handle both remaining ESCRs*/ if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; @@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) unsigned int stag; stag = get_stagger(); - + /* convert from counter *number* to counter *bit* */ counter_bit = 1 << VIRT_CTR(stag, ctr); - + /* find our event binding structure. 
*/ if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { - printk(KERN_ERR - "oprofile: P4 event code 0x%lx out of range\n", + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", counter_config[ctr].event); return; } - + ev = &(p4_events[counter_config[ctr].event - 1]); - + for (i = 0; i < maxbind; i++) { if (ev->bindings[i].virt_counter & counter_bit) { @@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) ESCR_SET_OS_1(escr, counter_config[ctr].kernel); } ESCR_SET_EVENT_SELECT(escr, ev->event_select); - ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); ESCR_WRITE(escr, high, ev, i); - + /* modify CCCR */ CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); - if (stag == 0) { + if (stag == 0) CCCR_SET_PMI_OVF_0(cccr); - } else { + else CCCR_SET_PMI_OVF_1(cccr); - } CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); return; } } - printk(KERN_ERR + printk(KERN_ERR "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", counter_config[ctr].event, stag, ctr); } @@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) stag = get_stagger(); rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (! MISC_PMC_ENABLED_P(low)) { + if (!MISC_PMC_ENABLED_P(low)) { printk(KERN_ERR "oprofile: P4 PMC not available\n"); return; } /* clear the cccrs we will use */ for (i = 0 ; i < num_counters ; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); @@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) /* clear all escrs (including those outside our concern) */ for (i = num_counters; i < num_controls; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; wrmsr(msrs->controls[i].addr, 0, 0); } /* setup all counters */ for (i = 0 ; i < num_counters ; ++i) { - if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { + if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); @@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs, stag = get_stagger(); for (i = 0; i < num_counters; ++i) { - - if (!reset_value[i]) + + if (!reset_value[i]) continue; - /* + /* * there is some eccentricity in the hardware which * requires that we perform 2 extra corrections: * @@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs, * * - write the counter back twice to ensure it gets * updated properly. - * + * * the former seems to be related to extra NMIs happening * during the current NMI; the latter is reported as errata * N15 in intel doc 249199-029, pentium 4 specification * update, though their suggested work-around does not * appear to solve the problem. 
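The negation in CTR_WRITE() is what makes this overflow scheme work: the PMC counts up, so arming it with the two's complement of the reset count makes it overflow (raising the PMI/NMI) after exactly that many events, which is the condition CCCR_OVF_P()/CTR_OVERFLOW_P() test for just below. A minimal standalone illustration of the arithmetic, in plain C rather than kernel code, with the 40-bit P4 counter simplified to 32 bits:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t count = 100000;	/* events until the next sample */
		uint32_t ctr = -count;		/* what CTR_WRITE() programs */

		ctr += count;			/* `count` events later... */
		printf("wrapped to zero: %s\n", ctr == 0 ? "yes" : "no");
		return 0;
	}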
*/ - + real = VIRT_CTR(stag, i); CCCR_READ(low, high, real); - CTR_READ(ctr, high, real); + CTR_READ(ctr, high, real); if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], real); + CTR_WRITE(reset_value[i], real); CCCR_CLEAR_OVF(low); CCCR_WRITE(low, high, real); - CTR_WRITE(reset_value[i], real); + CTR_WRITE(reset_value[i], real); } } @@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs,i)) + if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(msrs->counters[i].addr); } - /* some of the control registers are specially reserved in + /* + * some of the control registers are specially reserved in * conjunction with the counter registers (hence the starting offset). * This saves a few bits. */ for (i = num_counters ; i < num_controls ; ++i) { - if (CTRL_IS_RESERVED(msrs,i)) + if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(msrs->controls[i].addr); } } @@ -699,24 +698,24 @@ static void p4_shutdown(struct op_msrs const * const msrs) #ifdef CONFIG_SMP struct op_x86_model_spec const op_p4_ht2_spec = { - .num_counters = NUM_COUNTERS_HT2, - .num_controls = NUM_CONTROLS_HT2, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown }; #endif struct op_x86_model_spec const op_p4_spec = { - .num_counters = NUM_COUNTERS_NON_HT, - .num_controls = NUM_CONTROLS_NON_HT, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown }; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57b..0620d6d45f7d 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -1,32 +1,34 @@ /* * @file op_model_ppro.h - * pentium pro / P6 model-specific MSR operations + * Family 6 perfmon and architectural perfmon MSR operations * * @remark Copyright 2002 OProfile authors + * @remark Copyright 2008 Intel Corporation * @remark Read the file COPYING * * @author John Levon * @author Philippe Elie * @author Graydon Hoare + * @author Andi Kleen */ #include <linux/oprofile.h> +#include <linux/slab.h> #include <asm/ptrace.h> #include <asm/msr.h> #include <asm/apic.h> #include <asm/nmi.h> +#include <asm/intel_arch_perfmon.h> #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 2 -#define NUM_CONTROLS 2 +static int num_counters = 2; +static int counter_width = 32; #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 
1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_32BIT_WRITE(l, msrs, c) \ - do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<(counter_width-1)))) #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) #define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) @@ -40,20 +42,20 @@ #define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) -static unsigned long reset_value[NUM_COUNTERS]; +static u64 *reset_value; static void ppro_fill_in_addresses(struct op_msrs * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; i++) { + for (i = 0; i < num_counters; i++) { if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; else msrs->counters[i].addr = 0; } - for (i = 0; i < NUM_CONTROLS; i++) { + for (i = 0; i < num_counters; i++) { if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; else @@ -67,8 +69,22 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) { unsigned int low, high; int i; + if (!reset_value) { + reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, + GFP_ATOMIC); + if (!reset_value) + return; + } + + if (cpu_has_arch_perfmon) { + union cpuid10_eax eax; + eax.full = cpuid_eax(0xa); + if (counter_width < eax.split.bit_width) + counter_width = eax.split.bit_width; + } + /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0 ; i < num_counters; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; CTRL_READ(low, high, msrs, i); @@ -77,18 +93,18 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; - CTR_32BIT_WRITE(1, msrs, i); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - CTR_32BIT_WRITE(counter_config[i].count, msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); @@ -111,13 +127,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, unsigned int low, high; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { + for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); - CTR_32BIT_WRITE(reset_value[i], msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } } @@ -141,7 +157,7 @@ static void ppro_start(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { CTRL_READ(low, high, msrs, i); CTRL_SET_ACTIVE(low); @@ -156,7 +172,7 @@ static void ppro_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; CTRL_READ(low, high, msrs, i); @@ -169,24 +185,70 @@ static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0 ; i < num_counters ; ++i) { if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { + for (i = 0 ; i < num_counters ; ++i) { if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } + if (reset_value) { + kfree(reset_value); + reset_value = NULL; + } } -struct op_x86_model_spec const op_ppro_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &ppro_fill_in_addresses, - .setup_ctrs = &ppro_setup_ctrs, - .check_ctrs = &ppro_check_ctrs, - .start = &ppro_start, - .stop = &ppro_stop, - .shutdown = &ppro_shutdown +struct op_x86_model_spec op_ppro_spec = { + .num_counters = 2, /* can be overridden */ + .num_controls = 2, /* ditto */ + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown +}; + +/* + * Architectural performance monitoring. + * + * Newer Intel CPUs (Core1+) have support for architectural + * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. + * The advantage of this is that it can be done without knowing about + * the specific CPU. + */ + +void arch_perfmon_setup_counters(void) +{ + union cpuid10_eax eax; + + eax.full = cpuid_eax(0xa); + + /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ + if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && + current_cpu_data.x86_model == 15) { + eax.split.version_id = 2; + eax.split.num_counters = 2; + eax.split.bit_width = 40; + } + + num_counters = eax.split.num_counters; + + op_arch_perfmon_spec.num_counters = num_counters; + op_arch_perfmon_spec.num_controls = num_counters; + op_ppro_spec.num_counters = num_counters; + op_ppro_spec.num_controls = num_counters; +} + +struct op_x86_model_spec op_arch_perfmon_spec = { + /* num_counters/num_controls filled in at runtime */ + .fill_in_addresses = &ppro_fill_in_addresses, + /* user space does the cpuid check for available events */ + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown }; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 45b605fa71d0..825e79064d64 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -22,8 +22,8 @@ struct op_msr { }; struct op_msrs { - struct op_msr * counters; - struct op_msr * controls; + struct op_msr *counters; + struct op_msr *controls; }; struct pt_regs; @@ -32,8 +32,10 @@ struct pt_regs; * various x86 CPU models' perfctr support.
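As background for the CPUID-based sizing that arch_perfmon_setup_counters() performs above, the relevant leaf 0xa fields can be decoded with ordinary bit arithmetic; a minimal userspace sketch (field positions per the SDM section the comment cites — illustrative only, it does not reuse the kernel's union cpuid10_eax):

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
			return 1;			/* leaf 0xa unsupported */

		printf("version id:    %u\n", eax & 0xff);		/* bits 7:0 */
		printf("num counters:  %u\n", (eax >> 8) & 0xff);	/* bits 15:8 */
		printf("counter width: %u\n", (eax >> 16) & 0xff);	/* bits 23:16 */
		return 0;
	}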
*/ struct op_x86_model_spec { - unsigned int const num_counters; - unsigned int const num_controls; + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); + unsigned int num_counters; + unsigned int num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_msrs const * const msrs); int (*check_ctrs)(struct pt_regs * const regs, @@ -43,9 +45,12 @@ struct op_x86_model_spec { void (*shutdown)(struct op_msrs const * const msrs); }; -extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_athlon_spec; +extern struct op_x86_model_spec const op_amd_spec; +extern struct op_x86_model_spec op_arch_perfmon_spec; + +extern void arch_perfmon_setup_counters(void); #endif /* OP_X86_MODEL_H */ diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index e515e8db842a..d49202e740ea 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o obj-$(CONFIG_PCI_OLPC) += olpc.o -pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o -pci-y += legacy.o irq.o +obj-y += fixup.o +obj-$(CONFIG_ACPI) += acpi.o +obj-y += legacy.o irq.o -pci-$(CONFIG_X86_VISWS) += visws.o +obj-$(CONFIG_X86_VISWS) += visws.o -pci-$(CONFIG_X86_NUMAQ) += numa.o +obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-y += $(pci-y) common.o early.o +obj-y += common.o early.o obj-y += amd_bus.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 19af06927fbc..1d88d2b39771 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -250,10 +250,5 @@ int __init pci_acpi_init(void) acpi_pci_irq_enable(dev); } -#ifdef CONFIG_X86_IO_APIC - if (acpi_ioapic) - print_IO_APIC(); -#endif - return 0; } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index dbf532369711..22e057665e55 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -1,6 +1,7 @@ #include <linux/init.h> #include <linux/pci.h> #include <linux/topology.h> +#include <linux/cpu.h> #include "pci.h" #ifdef CONFIG_X86_64 @@ -555,15 +556,17 @@ static int __init early_fill_mp_bus_info(void) return 0; } -postcore_initcall(early_fill_mp_bus_info); +#else /* !CONFIG_X86_64 */ -#endif +static int __init early_fill_mp_bus_info(void) { return 0; } + +#endif /* !CONFIG_X86_64 */ /* common 32/64 bit code */ #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs_per_cpu(void *unused) +static void enable_pci_io_ecs(void *unused) { u64 reg; rdmsrl(MSR_AMD64_NB_CFG, reg); @@ -573,14 +576,51 @@ static void enable_pci_io_ecs_per_cpu(void *unused) } } -static int __init enable_pci_io_ecs(void) +static int __cpuinit amd_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata amd_cpu_notifier = { + .notifier_call = amd_cpu_notify, +}; + +static int __init pci_io_ecs_init(void) +{ + int cpu; + /* assume all cpus from fam10h have IO ECS */ if (boot_cpu_data.x86 < 0x10) return 0; - on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1); + + register_cpu_notifier(&amd_cpu_notifier); + for_each_online_cpu(cpu) + amd_cpu_notify(&amd_cpu_notifier, 
(unsigned long)CPU_ONLINE, + (void *)(long)cpu); pci_probe |= PCI_HAS_IO_ECS; + + return 0; +} + +static int __init amd_postcore_init(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + + early_fill_mp_bus_info(); + pci_io_ecs_init(); + return 0; } -postcore_initcall(enable_pci_io_ecs); +postcore_initcall(amd_postcore_init); diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index 858dbe3399f9..86631ccbc25a 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -7,15 +7,13 @@ /* Direct PCI access. This is used for PCI accesses in early boot before the PCI subsystem works. */ -#define PDprintk(x...) - u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) { u32 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inl(0xcfc); if (v != 0xffffffff) - PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); + pr_debug("%x reading 4 from %x: %x\n", slot, offset, v); return v; } @@ -24,7 +22,7 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) u8 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inb(0xcfc + (offset&3)); - PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); + pr_debug("%x reading 1 from %x: %x\n", slot, offset, v); return v; } @@ -33,28 +31,28 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) u16 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inw(0xcfc + (offset&2)); - PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); + pr_debug("%x reading 2 from %x: %x\n", slot, offset, v); return v; } void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val) { - PDprintk("%x writing to %x: %x\n", slot, offset, val); + pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outl(val, 0xcfc); } void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) { - PDprintk("%x writing to %x: %x\n", slot, offset, val); + pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outb(val, 0xcfc + (offset&3)); } void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) { - PDprintk("%x writing to %x: %x\n", slot, offset, val); + pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outw(val, 0xcfc + (offset&2)); } @@ -71,7 +69,7 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func) int j; u32 val; - printk("PCI: %02x:%02x:%02x", bus, slot, func); + printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func); for (i = 0; i < 256; i += 4) { if (!(i & 0x0f)) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index ff3a6a336342..3c27a809393b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) pci_read_config_byte(d, reg++, &busno); pci_read_config_byte(d, reg++, &suba); pci_read_config_byte(d, reg++, &subb); - DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); + dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, + suba, subb); if (busno) pci_scan_bus_with_sysdata(busno); /* Bus A */ if (suba < subb) @@ -510,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 
0x1204, fam10h_pci_cfg_space_size); + +/* + * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from + * confusing the PCI engine: + */ +static void sb600_disable_hpet_bar(struct pci_dev *dev) +{ + u8 val; + + /* + * The SB600 and SB700 both share the same device + * ID, but the PM register 0x55 does something different + * for the SB700, so make sure we are dealing with the + * SB600 before touching the bit: + */ + + pci_read_config_byte(dev, 0x08, &val); + + if (val < 0x2F) { + outb(0x55, 0xCD6); + val = inb(0xCD7); + + /* Set bit 7 in PM register 0x55 */ + outb(0x55, 0xCD6); + outb(val | 0x80, 0xCD7); + } +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 2aafb67dc5f1..844df0cbbd3e 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -33,6 +33,7 @@ #include <linux/bootmem.h> #include <asm/pat.h> +#include <asm/e820.h> #include "pci.h" @@ -128,10 +129,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) pr = pci_find_parent_resource(dev, r); if (!r->start || !pr || request_resource(pr, r) < 0) { - printk(KERN_ERR "PCI: Cannot allocate " - "resource region %d " - "of bridge %s\n", - idx, pci_name(dev)); + dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -166,15 +164,13 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - DBG("PCI: Resource %08lx-%08lx " - "(f=%lx, d=%d, p=%d)\n", - r->start, r->end, r->flags, disabled, pass); + dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", + (unsigned long long) r->start, + (unsigned long long) r->end, + r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { - printk(KERN_ERR "PCI: Cannot allocate " - "resource region %d " - "of device %s\n", - idx, pci_name(dev)); + dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -187,8 +183,7 @@ static void __init pcibios_allocate_resources(int pass) /* Turn the ROM off, leave the resource region, * but keep it unregistered. 
*/ u32 reg; - DBG("PCI: Switching off ROM of %s\n", - pci_name(dev)); + dev_dbg(&dev->dev, "disabling ROM\n"); r->flags &= ~IORESOURCE_ROM_ENABLE; pci_read_config_dword(dev, dev->rom_base_reg, ®); @@ -233,6 +228,8 @@ void __init pcibios_resource_survey(void) pcibios_allocate_bus_resources(&pci_root_buses); pcibios_allocate_resources(0); pcibios_allocate_resources(1); + + e820_reserve_resources_late(); } /** @@ -257,8 +254,7 @@ void pcibios_set_master(struct pci_dev *dev) lat = pcibios_max_latency; else return; - printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", - pci_name(dev), lat); + dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat); pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); } @@ -280,6 +276,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma) static struct vm_operations_struct pci_mmap_ops = { .open = pci_track_mmap_page_range, .close = pci_unmap_page_range, + .access = generic_access_phys, }; int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 6a06a2eb0597..bf69dbe08bff 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -436,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq); return 0; } return read_config_nybble(router, 0x74, pirq-1); @@ -446,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, { WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq); return 0; } write_config_nybble(router, 0x74, pirq-1, irq); @@ -492,15 +492,17 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq irq = 0; if (pirq <= 4) irq = read_config_nybble(router, 0x56, pirq - 1); - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", - dev->vendor, dev->device, pirq, irq); + dev_info(&dev->dev, + "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n", + dev->vendor, dev->device, pirq, irq); return irq; } static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", - dev->vendor, dev->device, pirq, irq); + dev_info(&dev->dev, + "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n", + dev->vendor, dev->device, pirq, irq); if (pirq <= 4) write_config_nybble(router, 0x56, pirq - 1, irq); return 1; @@ -593,6 +595,15 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route r->set = pirq_piix_set; return 1; } + + if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } + return 0; } @@ -730,7 +741,6 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, switch (device) { case PCI_DEVICE_ID_AL_M1533: case PCI_DEVICE_ID_AL_M1563: - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n"); r->name = "ALI"; r->get = pirq_ali_get; r->set = pirq_ali_set; @@ -820,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r) r->get = NULL; r->set = NULL; - DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router 
for [%04x:%04x]\n", rt->rtr_vendor, rt->rtr_device); pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); @@ -840,11 +850,9 @@ static void __init pirq_find_router(struct irq_router *r) h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", - pirq_router.name, - pirq_router_dev->vendor, - pirq_router_dev->device, - pci_name(pirq_router_dev)); + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", + pirq_router.name, + pirq_router_dev->vendor, pirq_router_dev->device); /* The device remains referenced for the kernel lifetime */ } @@ -877,7 +885,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) /* Find IRQ pin */ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) { - DBG(KERN_DEBUG " -> no interrupt pin\n"); + dev_dbg(&dev->dev, "no interrupt pin\n"); return 0; } pin = pin - 1; @@ -887,20 +895,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!pirq_table) return 0; - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin); info = pirq_get_info(dev); if (!info) { - DBG(" -> not found in routing table\n" KERN_DEBUG); + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n", + 'A' + pin); return 0; } pirq = info->irq[pin].link; mask = info->irq[pin].bitmap; if (!pirq) { - DBG(" -> not routed\n" KERN_DEBUG); + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin); return 0; } - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, - pirq_table->exclusive_irqs); + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x", + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -930,10 +938,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; else - printk("\n" KERN_WARNING - "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" - KERN_DEBUG, newirq, - pci_name(dev)); + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask " + "%#x; try pci=usepirqmask\n", newirq, mask); } if (!newirq && assign) { for (i = 0; i < 16; i++) { @@ -944,39 +950,35 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) newirq = i; } } - DBG(" -> newirq=%d", newirq); + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq); /* Check if it is hardcoded */ if ((pirq & 0xf0) == 0xf0) { irq = pirq & 0xf; - DBG(" -> hardcoded IRQ %d\n", irq); - msg = "Hardcoded"; + msg = "hardcoded"; } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { - DBG(" -> got IRQ %d\n", irq); - msg = "Found"; + msg = "found"; eisa_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { - DBG(" -> assigning IRQ %d", newirq); if (r->set(pirq_router_dev, dev, pirq, newirq)) { eisa_set_level_irq(newirq); - DBG(" ... OK\n"); - msg = "Assigned"; + msg = "assigned"; irq = newirq; } } if (!irq) { - DBG(" ... 
failed\n"); if (newirq && mask == (1 << newirq)) { - msg = "Guessed"; + msg = "guessed"; irq = newirq; - } else + } else { + dev_dbg(&dev->dev, "can't route interrupt\n"); return 0; + } } - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, - pci_name(dev)); + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq); /* Update IRQ for all devices with the same pirq value */ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { @@ -996,17 +998,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) (!(pci_probe & PCI_USE_PIRQ_MASK) || \ ((1 << dev2->irq) & mask))) { #ifndef CONFIG_PCI_MSI - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", - pci_name(dev2), dev2->irq, irq); + dev_info(&dev2->dev, "IRQ routing conflict: " + "have IRQ %d, want IRQ %d\n", + dev2->irq, irq); #endif continue; } dev2->irq = irq; pirq_penalty[irq]++; if (dev != dev2) - printk(KERN_INFO - "PCI: Sharing IRQ %d with %s\n", - irq, pci_name(dev2)); + dev_info(&dev->dev, "sharing IRQ %d with %s\n", + irq, pci_name(dev2)); } } return 1; @@ -1025,8 +1027,7 @@ static void __init pcibios_fixup_irqs(void) * already in use. */ if (dev->irq >= 16) { - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", - pci_name(dev), dev->irq); + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq); dev->irq = 0; } /* @@ -1049,35 +1050,44 @@ static void __init pcibios_fixup_irqs(void) if (io_apic_assign_pci_irqs) { int irq; - if (pin) { - /* - * interrupt pins are numbered starting - * from 1 - */ - pin--; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin); - /* - * Busses behind bridges are typically not listed in the MP-table. - * In this case we have to look up the IRQ based on the parent bus, - * parent slot, and pin number. The SMP code detects such bridged - * busses itself so we should get into this branch reliably. - */ - if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev *bridge = dev->bus->self; - - pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); - if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); - } - if (irq >= 0) { - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", - pci_name(dev), 'A' + pin, irq); - dev->irq = irq; - } + if (!pin) + continue; + + /* + * interrupt pins are numbered starting from 1 + */ + pin--; + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the + * MP-table. In this case we have to look up the IRQ + * based on the parent bus, parent slot, and pin number. + * The SMP code detects such bridged busses itself so we + * should get into this branch reliably. 
+ */ + if (irq < 0 && dev->bus->parent) { + /* go back to the bridge */ + struct pci_dev *bridge = dev->bus->self; + int bus; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + bus = bridge->bus->number; + irq = IO_APIC_get_PCI_irq_vector(bus, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + dev_warn(&dev->dev, + "using bridge %s INT %c to " + "get IRQ %d\n", + pci_name(bridge), + 'A' + pin, irq); + } + if (irq >= 0) { + dev_info(&dev->dev, + "PCI->APIC IRQ transform: INT %c " + "-> IRQ %d\n", + 'A' + pin, irq); + dev->irq = irq; } } #endif @@ -1231,25 +1241,24 @@ static int pirq_enable_irq(struct pci_dev *dev) irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), pin); if (irq >= 0) - printk(KERN_WARNING - "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), - 'A' + pin, irq); + dev_warn(&dev->dev, "using bridge %s " + "INT %c to get IRQ %d\n", + pci_name(bridge), 'A' + pin, + irq); dev = bridge; } dev = temp_dev; if (irq >= 0) { - printk(KERN_INFO - "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", - pci_name(dev), 'A' + pin, irq); + dev_info(&dev->dev, "PCI->APIC IRQ transform: " + "INT %c -> IRQ %d\n", 'A' + pin, irq); dev->irq = irq; return 0; } else - msg = " Probably buggy MP table."; + msg = "; probably buggy MP table"; } else if (pci_probe & PCI_BIOS_IRQ_SCAN) msg = ""; else - msg = " Please try using pci=biosirq."; + msg = "; please try using pci=biosirq"; /* * With IDE legacy devices the IRQ lookup failure is not @@ -1259,9 +1268,8 @@ static int pirq_enable_irq(struct pci_dev *dev) !(dev->class & 0x5)) return 0; - printk(KERN_WARNING - "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", - 'A' + pin, pci_name(dev), msg); + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n", + 'A' + pin, msg); } return 0; } diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 132876cc6fca..b722dd481b39 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -14,7 +14,7 @@ static void __devinit pcibios_fixup_peer_bridges(void) int n, devfn; long node; - if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff) + if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff) return; DBG("PCI: Peer bridge fixup\n"); @@ -57,14 +57,17 @@ static int __init pci_legacy_init(void) int __init pci_subsys_init(void) { +#ifdef CONFIG_X86_NUMAQ + pci_numaq_init(); +#endif #ifdef CONFIG_ACPI pci_acpi_init(); #endif +#ifdef CONFIG_X86_VISWS + pci_visws_init(); +#endif pci_legacy_init(); pcibios_irq_init(); -#ifdef CONFIG_X86_NUMAQ - pci_numa_init(); -#endif pcibios_init(); return 0; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 23faaa890ffc..654a2234f8f3 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void) return name != NULL; } -static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) +static void __init pci_mmcfg_insert_resources(void) { #define PCI_MMCFG_RESOURCE_NAME_LEN 19 int i; @@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) cfg->pci_segment); res->start = cfg->address; res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | resource_flags; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; insert_resource(&iomem_resource, res); names += PCI_MMCFG_RESOURCE_NAME_LEN; } @@ -293,7 +293,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, return AE_OK; } -static int __init is_acpi_reserved(unsigned long start, 
unsigned long end) +static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) { struct resource mcfg_res; @@ -310,6 +310,41 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end) return mcfg_res.flags; } +typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); + +static int __init is_mmconf_reserved(check_reserved_t is_reserved, + u64 addr, u64 size, int i, + typeof(pci_mmcfg_config[0]) *cfg, int with_e820) +{ + u64 old_size = size; + int valid = 0; + + while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) { + size >>= 1; + if (size < (16UL<<20)) + break; + } + + if (size >= (16UL<<20) || size == old_size) { + printk(KERN_NOTICE + "PCI: MCFG area at %Lx reserved in %s\n", + addr, with_e820?"E820":"ACPI motherboard resources"); + valid = 1; + + if (old_size != size) { + /* update end_bus_number */ + cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); + printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " + "segment %hu buses %u - %u\n", + i, (unsigned long)cfg->address, cfg->pci_segment, + (unsigned int)cfg->start_bus_number, + (unsigned int)cfg->end_bus_number); + } + } + + return valid; +} + static void __init pci_mmcfg_reject_broken(int early) { typeof(pci_mmcfg_config[0]) *cfg; @@ -324,21 +359,22 @@ static void __init pci_mmcfg_reject_broken(int early) for (i = 0; i < pci_mmcfg_config_num; i++) { int valid = 0; - u32 size = (cfg->end_bus_number + 1) << 20; + u64 addr, size; + cfg = &pci_mmcfg_config[i]; + addr = cfg->start_bus_number; + addr <<= 20; + addr += cfg->address; + size = cfg->end_bus_number + 1 - cfg->start_bus_number; + size <<= 20; printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->pci_segment, (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (!early && - is_acpi_reserved(cfg->address, cfg->address + size - 1)) { - printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved " - "in ACPI motherboard resources\n", - cfg->address); - valid = 1; - } + if (!early) + valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); if (valid) continue; @@ -347,16 +383,11 @@ static void __init pci_mmcfg_reject_broken(int early) printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" " reserved in ACPI motherboard resources\n", cfg->address); + /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ - if (raw_pci_ops && e820_all_mapped(cfg->address, - cfg->address + size - 1, - E820_RESERVED)) { - printk(KERN_NOTICE - "PCI: MCFG area at %Lx reserved in E820\n", - cfg->address); - valid = 1; - } + if (raw_pci_ops) + valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); if (!valid) goto reject; @@ -365,7 +396,7 @@ static void __init pci_mmcfg_reject_broken(int early) return; reject: - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); pci_mmcfg_arch_free(); kfree(pci_mmcfg_config); pci_mmcfg_config = NULL; @@ -403,11 +434,9 @@ static void __init __pci_mmcfg_init(int early) (pci_mmcfg_config[0].address == 0)) return; - if (pci_mmcfg_arch_init()) { - if (known_bridge) - pci_mmcfg_insert_resources(IORESOURCE_BUSY); + if (pci_mmcfg_arch_init()) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; - } else { + else { /* * Signal not to attempt to insert mmcfg resources because * the architecture mmcfg setup could not initialize. 
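The policy that is_mmconf_reserved() above implements reduces to: accept the largest prefix of the MCFG window that the firmware actually reserved, halving the size until the reservation check passes, and reject anything smaller than 16MB. Distilled into a standalone form (the helper name and the `reserved` callback are illustrative stand-ins for is_acpi_reserved()/e820_all_mapped()):

	/* returns the usable window size, or 0 if even 16MB isn't reserved */
	static unsigned long long accepted_mcfg_size(unsigned long long addr,
			unsigned long long size,
			int (*reserved)(unsigned long long start,
					unsigned long long end))
	{
		while (!reserved(addr, addr + size - 1)) {
			size >>= 1;
			if (size < (16ULL << 20))	/* 16MB floor */
				return 0;
		}
		return size;
	}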
@@ -444,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void) * marked so it won't cause request errors when __request_region is * called. */ - pci_mmcfg_insert_resources(0); + pci_mmcfg_insert_resources(); return 0; } diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c index 8b5ca1966731..1177845d3186 100644 --- a/arch/x86/pci/numa.c +++ b/arch/x86/pci/numaq_32.c @@ -1,5 +1,5 @@ /* - * numa.c - Low-level PCI access for NUMA-Q machines + * numaq_32.c - Low-level PCI access for NUMA-Q machines */ #include <linux/pci.h> @@ -131,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) u8 busno, suba, subb; int quad = BUS2QUAD(d->bus->number); - printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); + dev_info(&d->dev, "searching for i450NX host bridges\n"); reg = 0xd0; for(pxb=0; pxb<2; pxb++) { pci_read_config_byte(d, reg++, &busno); pci_read_config_byte(d, reg++, &suba); pci_read_config_byte(d, reg++, &subb); - DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); + dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", + pxb, busno, suba, subb); if (busno) { /* Bus A */ pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); @@ -151,7 +152,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); -int __init pci_numa_init(void) +int __init pci_numaq_init(void) { int quad; diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 3e25deb821ac..15b9cf6be729 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); extern int __init pcibios_irq_init(void); -extern int __init pci_numa_init(void); +extern int __init pci_visws_init(void); +extern int __init pci_numaq_init(void); extern int __init pcibios_init(void); /* pci-mmconfig.c */ diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 1a7bed492bb1..42f4cb19faca 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } -static int __init pci_visws_init(void) +int __init pci_visws_init(void) { + if (!is_visws_box()) + return -1; + + pcibios_enable_irq = &pci_visws_enable_irq; + pcibios_disable_irq = &pci_visws_disable_irq; + /* The VISWS supports configuration access type 1 only */ pci_probe = (pci_probe | PCI_PROBE_CONF1) & ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); @@ -105,18 +111,3 @@ static int __init pci_visws_init(void) pcibios_resource_survey(); return 0; } - -static __init int pci_subsys_init(void) -{ - if (!is_visws_box()) - return -1; - - pcibios_enable_irq = &pci_visws_enable_irq; - pcibios_disable_irq = &pci_visws_disable_irq; - - pci_visws_init(); - pcibios_init(); - - return 0; -} -subsys_initcall(pci_subsys_init); diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 7dc5d5cf50a2..274d06082f48 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -11,6 +11,7 @@ #include <linux/suspend.h> #include <asm/mtrr.h> #include <asm/mce.h> +#include <asm/xcr.h> static struct saved_context saved_context; @@ -45,7 +46,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); - ctxt->cr4 = read_cr4(); + ctxt->cr4 = read_cr4_safe(); } /* Needed by apm.c */ @@ -98,7 +99,9 @@ static void 
__restore_processor_state(struct saved_context *ctxt) /* * control registers */ - write_cr4(ctxt->cr4); + /* cr4 was introduced in the Pentium CPU */ + if (ctxt->cr4) + write_cr4(ctxt->cr4); write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); @@ -124,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt) if (boot_cpu_has(X86_FEATURE_SEP)) enable_sep_cpu(); + /* + * restore XCR0 for xsave capable cpu's. + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + fix_processor_context(); do_fpu_end(); mtrr_ap_init(); diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 66bdfb591fd8..e3b6cf70d62c 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -14,6 +14,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mtrr.h> +#include <asm/xcr.h> static void fix_processor_context(void); @@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + /* + * restore XCR0 for xsave capable cpu's. + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + fix_processor_context(); do_fpu_end(); diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index b95aa6cfe3cb..d1e9b53f9d33 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -1,5 +1,3 @@ -.text - /* * This may not use any stack, nor any variable that is not "NoSave": * @@ -12,25 +10,26 @@ #include <asm/segment.h> #include <asm/page.h> #include <asm/asm-offsets.h> +#include <asm/processor-flags.h> - .text +.text ENTRY(swsusp_arch_suspend) - movl %esp, saved_context_esp movl %ebx, saved_context_ebx movl %ebp, saved_context_ebp movl %esi, saved_context_esi movl %edi, saved_context_edi - pushfl ; popl saved_context_eflags + pushfl + popl saved_context_eflags call swsusp_save ret ENTRY(restore_image) - movl resume_pg_dir, %ecx - subl $__PAGE_OFFSET, %ecx - movl %ecx, %cr3 + movl resume_pg_dir, %eax + subl $__PAGE_OFFSET, %eax + movl %eax, %cr3 movl restore_pblist, %edx .p2align 4,,7 @@ -52,17 +51,21 @@ copy_loop: done: /* go back to the original page tables */ - movl $swapper_pg_dir, %ecx - subl $__PAGE_OFFSET, %ecx - movl %ecx, %cr3 + movl $swapper_pg_dir, %eax + subl $__PAGE_OFFSET, %eax + movl %eax, %cr3 /* Flush TLB, including "global" things (vmalloc) */ - movl mmu_cr4_features, %eax - movl %eax, %edx - andl $~(1<<7), %edx; # PGE + movl mmu_cr4_features, %ecx + jecxz 1f # cr4 Pentium and higher, skip if zero + movl %ecx, %edx + andl $~(X86_CR4_PGE), %edx movl %edx, %cr4; # turn off PGE - movl %cr3, %ecx; # flush TLB - movl %ecx, %cr3 - movl %eax, %cr4; # turn PGE back on +1: + movl %cr3, %eax; # flush TLB + movl %eax, %cr3 + jecxz 1f # cr4 Pentium and higher, skip if zero + movl %ecx, %cr4; # turn PGE back on +1: movl saved_context_esp, %esp movl saved_context_ebp, %ebp @@ -70,7 +73,8 @@ done: movl saved_context_esi, %esi movl saved_context_edi, %edi - pushl saved_context_eflags ; popfl + pushl saved_context_eflags + popfl xorl %eax, %eax diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index b7ad9f89d21f..4d6ef0a336d6 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE # Build multiple 32-bit vDSO images to choose from at boot time. 
# obj-$(VDSO32-y) += vdso32-syms.lds -vdso32.so-$(CONFIG_X86_32) += int80 +vdso32.so-$(VDSO32-y) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 0bce5429a515..513f330c5832 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) } } -/* - * These symbols are defined by vdso32.S to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vdso32_default_start, vdso32_default_end; -extern const char vdso32_sysenter_start, vdso32_sysenter_end; static struct page *vdso32_pages[1]; #ifdef CONFIG_X86_64 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) /* May not be __init: called during resume */ void syscall32_cpu_init(void) @@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map) #else /* CONFIG_X86_32 */ #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) +#define vdso32_syscall() (0) void enable_sep_cpu(void) { @@ -296,12 +292,15 @@ int __init sysenter_setup(void) gate_vma_init(); #endif - if (!vdso32_sysenter()) { - vsyscall = &vdso32_default_start; - vsyscall_len = &vdso32_default_end - &vdso32_default_start; - } else { + if (vdso32_syscall()) { + vsyscall = &vdso32_syscall_start; + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; + } else if (vdso32_sysenter()){ vsyscall = &vdso32_sysenter_start; vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + } else { + vsyscall = &vdso32_int80_start; + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; } memcpy(syscall_page, vsyscall, vsyscall_len); diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S index 1e36f72cab86..2ce5f82c333b 100644 --- a/arch/x86/vdso/vdso32.S +++ b/arch/x86/vdso/vdso32.S @@ -2,14 +2,17 @@ __INITDATA - .globl vdso32_default_start, vdso32_default_end -vdso32_default_start: -#ifdef CONFIG_X86_32 + .globl vdso32_int80_start, vdso32_int80_end +vdso32_int80_start: .incbin "arch/x86/vdso/vdso32-int80.so" -#else +vdso32_int80_end: + + .globl vdso32_syscall_start, vdso32_syscall_end +vdso32_syscall_start: +#ifdef CONFIG_COMPAT .incbin "arch/x86/vdso/vdso32-syscall.so" #endif -vdso32_default_end: +vdso32_syscall_end: .globl vdso32_sysenter_start, vdso32_sysenter_end vdso32_sysenter_start: diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 19a6cfaf5db9..257ba4a10abf 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; extern unsigned short vdso_sync_cpuid; -struct page **vdso_pages; +static struct page **vdso_pages; +static unsigned vdso_size; static inline void *var_ref(void *p, char *name) { @@ -38,6 +39,7 @@ static int __init init_vdso_vars(void) int i; char *vbase; + vdso_size = npages << PAGE_SHIFT; vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); if (!vdso_pages) goto oom; @@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) struct mm_struct *mm = current->mm; unsigned long addr; int ret; - unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); if (!vdso_enabled) return 0; down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, len); - addr = get_unmapped_area(NULL, addr, len, 0, 0); + addr = vdso_addr(mm->start_stack, vdso_size); + addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0); if 
(IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - ret = install_special_mapping(mm, addr, len, + ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c2cc99580871..87b9ab166423 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,8 @@ config XEN bool "Xen guest support" select PARAVIRT select PARAVIRT_CLOCK - depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) + depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER)) + depends on X86_CMPXCHG && X86_TSC help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the @@ -15,10 +15,24 @@ config XEN config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" - default 8 + default 8 if X86_32 + default 32 if X86_64 depends on XEN help The pseudo-physical to machine address array is sized according to the maximum possible memory size of a Xen domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here.
\ No newline at end of file + need to be too stingy here. + +config XEN_SAVE_RESTORE + bool + depends on XEN && PM + default y + +config XEN_DEBUG_FS + bool "Enable Xen debug and tuning parameters in debugfs" + depends on XEN && DEBUG_FS + default n + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 2ba2d1649131..313947940a1a 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,12 @@ -obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o suspend.o +ifdef CONFIG_FTRACE +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_spinlock.o = -pg +CFLAGS_REMOVE_time.o = -pg +CFLAGS_REMOVE_irq.o = -pg +endif -obj-$(CONFIG_SMP) += smp.o +obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ + time.o xen-asm_$(BITS).o grant-table.o suspend.o + +obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
\ No newline at end of file diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 000000000000..b53225d2cac3 --- /dev/null +++ b/arch/x86/xen/debugfs.c @@ -0,0 +1,123 @@ +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/module.h> + +#include "debugfs.h" + +static struct dentry *d_xen_debug; + +struct dentry * __init xen_init_debugfs(void) +{ + if (!d_xen_debug) { + d_xen_debug = debugfs_create_dir("xen", NULL); + + if (!d_xen_debug) + pr_warning("Could not create 'xen' debugfs directory\n"); + } + + return d_xen_debug; +} + +struct array_data +{ + void *array; + unsigned elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, unsigned array_size) +{ + size_t ret = 0; + unsigned i; + + for(i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, file->private_data, size); +} + +static int xen_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release= xen_array_release, + .read = u32_array_read, +}; + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 000000000000..e28132084832 --- /dev/null +++ b/arch/x86/xen/debugfs.h @@ -0,0 +1,10 @@ +#ifndef _XEN_DEBUGFS_H +#define _XEN_DEBUGFS_H + +struct dentry * __init xen_init_debugfs(void); + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements); + +#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb508456ef52..b61534c7a4c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,22 +30,23 @@ #include <xen/interface/xen.h> #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> -#include <xen/interface/sched.h> #include 
<xen/features.h> #include <xen/page.h> +#include <xen/hvc-console.h> #include <asm/paravirt.h> +#include <asm/apic.h> #include <asm/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> #include <asm/fixmap.h> #include <asm/processor.h> +#include <asm/msr-index.h> #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> -#include <asm/pgalloc.h> #include "xen-ops.h" #include "mmu.h" @@ -56,6 +57,21 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + /* * Note about cr3 (pagetable base) values: * @@ -97,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +static int have_vcpu_info_placement = +#ifdef CONFIG_X86_32 + 1 +#else + 0 +#endif + ; + static void xen_vcpu_setup(int cpu) { @@ -167,10 +190,14 @@ void xen_vcpu_restore(void) static void __init xen_banner(void) { + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); - printk(KERN_INFO "Hypervisor signature: %s%s\n", - xen_start_info->magic, + printk(KERN_INFO "Xen version: %d.%d%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } @@ -209,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static unsigned long xen_save_fl(void) +static void xen_leave_lazy(void) { - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; + paravirt_leave_lazy(paravirt_get_lazy_mode()); + xen_mc_flush(); } -static void xen_restore_fl(unsigned long flags) +static unsigned long xen_store_tr(void) { - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } + return 0; } -static void xen_irq_disable(void) +/* + * Set the page permissions for a particular virtual address. 
If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. + */ +static void set_aliased_prot(void *v, pgprot_t prot) { - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; + pte = pfn_pte(pfn, prot); - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) - BUG(); + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); } -static void xen_halt(void) +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; -static void xen_leave_lazy(void) -{ - paravirt_leave_lazy(paravirt_get_lazy_mode()); - xen_mc_flush(); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } -static unsigned long xen_store_tr(void) +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) { - return 0; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); } static void xen_set_ldt(const void *addr, unsigned entries) @@ -363,14 +355,6 @@ static void load_TLS_descriptor(struct thread_struct *t, static void xen_load_tls(struct thread_struct *t, unsigned int cpu) { - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); - /* * XXX sleazy hack: If we're being called in a lazy-cpu zone, * it means we're in a context switch, and %gs has just been @@ -379,16 +363,44 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * Either way, it has been saved, and the new value will get * loaded properly. This will go away as soon as Xen has been * modified to not save/restore %gs for normal hypercalls. 
+ * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we + * must not zero %gs on x86_64 + * + * For x86_64, we need to zero %fs, otherwise we may get an + * exception between the new %fs descriptor being loaded and + * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { +#ifdef CONFIG_X86_32 loadsegment(gs, 0); +#else + loadsegment(fs, 0); +#endif + } + + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); } +#ifdef CONFIG_X86_64 +static void xen_load_gs_index(unsigned int idx) +{ + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) + BUG(); +} +#endif + static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; preempt_disable(); @@ -400,23 +412,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } -static int cvt_gate_to_trap(int vector, u32 low, u32 high, +static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { - u8 type, dpl; - - type = (high >> 8) & 0x1f; - dpl = (high >> 13) & 3; - - if (type != 0xf && type != 0xe) + if (val->type != 0xf && val->type != 0xe) return 0; info->vector = vector; - info->address = (high & 0xffff0000) | (low & 0x0000ffff); - info->cs = low >> 16; - info->flags = dpl; + info->address = gate_offset(*val); + info->cs = gate_segment(*val); + info->flags = val->dpl; /* interrupt gates clear IF */ - if (type == 0xe) + if (val->type == 0xe) info->flags |= 4; return 1; @@ -443,11 +450,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) if (p >= start && (p + 8) <= end) { struct trap_info info[2]; - u32 *desc = (u32 *)g; info[1].address = 0; - if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) + if (cvt_gate_to_trap(entrynum, g, &info[0])) if (HYPERVISOR_set_trap_table(info)) BUG(); } @@ -460,13 +466,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, { unsigned in, out, count; - count = (desc->size+1) / 8; + count = (desc->size+1) / sizeof(gate_desc); BUG_ON(count > 256); for (in = out = 0; in < count; in++) { - const u32 *entry = (u32 *)(desc->address + in * 8); + gate_desc *entry = (gate_desc*)(desc->address) + in; - if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out])) out++; } traps[out].address = 0; @@ -527,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); @@ -548,16 +554,47 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static u32 xen_apic_read(unsigned long reg) +static u32 xen_apic_read(u32 reg) { return 0; } -static void xen_apic_write(unsigned long reg, u32 val) +static void xen_apic_write(u32 reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); } + +static u64 xen_apic_icr_read(void) +{ + return 0; +} + +static void xen_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any 
stray references */ + WARN_ON(1); +} + +static void xen_apic_wait_icr_idle(void) +{ + return; +} + +static u32 xen_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static struct apic_ops xen_basic_apic_ops = { + .read = xen_apic_read, + .write = xen_apic_write, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, + .wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, +}; + #endif static void xen_flush_tlb(void) @@ -695,36 +732,105 @@ static void set_current_cr3(void *v) x86_write_percpu(xen_current_cr3, (unsigned long)v); } -static void xen_write_cr3(unsigned long cr3) +static void __xen_write_cr3(bool kernel, unsigned long cr3) { struct mmuext_op *op; struct multicall_space mcs; - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + unsigned long mfn; - BUG_ON(preemptible()); + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; - mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ + WARN_ON(mfn == 0 && kernel); - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - x86_write_percpu(xen_cr3, cr3); + mcs = __xen_mc_entry(sizeof(*op)); op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; + op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; op->arg1.mfn = mfn; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - /* Update xen_update_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); + if (kernel) { + x86_write_percpu(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + x86_write_percpu(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } +static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +{ + int ret; + + ret = 0; + + switch(msr) { +#ifdef CONFIG_X86_64 + unsigned which; + u64 base; + + case MSR_FS_BASE: which = SEGBASE_FS; goto set; + case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; + case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; + + set: + base = ((u64)high << 32) | low; + if (HYPERVISOR_set_segment_base(which, base) != 0) + ret = -EFAULT; + break; +#endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + + default: + ret = native_write_msr_safe(msr, low, high); + } + + return ret; +} + /* Early in boot, while setting up the initial pagetable, assume everything is pinned. 
*/ -static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -734,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(u32 pfn) +static void xen_release_pte_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -750,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -758,34 +864,77 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - if (level == PT_PTE) + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else /* make sure there are no stray mappings of this page */ kmap_flush_unused(); + vm_unmap_aliases(); } } -static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_START)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + /* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(u32 pfn, unsigned level) +static void xen_release_ptpage(unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); if (PagePinned(page)) { if (!PageHighMem(page)) { - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -793,16 +942,28 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pte(u32 pfn) +static void xen_release_pte(unsigned long pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pmd(u32 pfn) +static void xen_release_pmd(unsigned long pfn) { xen_release_ptpage(pfn, PT_PMD); } +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(unsigned long pfn) 
+{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + #ifdef CONFIG_HIGHPTE static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -820,6 +981,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ @@ -838,71 +1000,20 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { - pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; - int i; - - /* special set_pte for pagetable initialization */ - pv_mmu_ops.set_pte = xen_set_pte_init; - - init_mm.pgd = base; - /* - * copy top-level of Xen-supplied pagetable into place. This - * is a stand-in while we copy the pmd pages. - */ - memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); - - make_lowmem_page_readonly(pmd); - - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } - - /* make sure zero_page is mapped RO so we can use it in pagetables */ - make_lowmem_page_readonly(empty_zero_page); - make_lowmem_page_readonly(base); - /* - * Switch to new pagetable. This is done before - * pagetable_init has done anything so that the new pages - * added to the table can be prepared properly for Xen. - */ - xen_write_cr3(__pa(base)); - - /* Unpin initial Xen pagetable */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(xen_start_info->pt_base))); } void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - /* - * Create a mapping for the shared info page. - * Should be set_fixmap(), but shared_info is a machine - * address with no corresponding pseudo-phys address. - */ - set_pte_mfn(addr, - PFN_DOWN(xen_start_info->shared_info), - PAGE_KERNEL); - - HYPERVISOR_shared_info = (struct shared_info *)addr; + set_fixmap(FIX_PARAVIRT_BOOTMAP, + xen_start_info->shared_info); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); } else HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); @@ -917,26 +1028,32 @@ void xen_setup_shared_info(void) static __init void xen_pagetable_setup_done(pgd_t *base) { - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; - pv_mmu_ops.set_pte = xen_set_pte; - xen_setup_shared_info(); - - /* Actually pin the pagetable down, but we can't set PG_pinned - yet because the page structures don't exist yet. 
*/ - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } static __init void xen_post_allocator_init(void) { + pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_mark_init_mm_pinned(); } @@ -1025,8 +1142,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_F00F_BUG case FIX_F00F_IDT: #endif +#ifdef CONFIG_X86_32 case FIX_WP_TEST: case FIX_VDSO: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#else + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +#endif #ifdef CONFIG_X86_LOCAL_APIC case FIX_APIC_BASE: /* maps dummy local APIC */ #endif @@ -1039,6 +1163,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) } __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif } static const struct pv_info xen_info __initdata = { @@ -1084,18 +1217,28 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .iret = xen_iret, .irq_enable_sysexit = xen_sysexit, +#ifdef CONFIG_X86_64 + .usergs_sysret32 = xen_sysret32, + .usergs_sysret64 = xen_sysret64, +#endif .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = xen_load_gs_index, +#endif + + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, .store_gdt = native_store_gdt, .store_idt = native_store_idt, @@ -1109,30 +1252,17 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, + /* Xen takes care of %gs when switching to usermode for us */ + .swapgs = paravirt_nop, + .lazy_mode = { .enter = paravirt_enter_lazy_cpu, .leave = xen_leave_lazy, }, }; -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, -#endif -}; - static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC - .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, - .apic_read = xen_apic_read, .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1157,8 +1287,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .pgd_alloc = 
__paravirt_pgd_alloc, - .pgd_free = paravirt_nop, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, @@ -1170,7 +1300,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .kmap_atomic_pte = xen_kmap_atomic_pte, #endif - .set_pte = NULL, /* see xen_pagetable_setup_* */ +#ifdef CONFIG_X86_64 + .set_pte = xen_set_pte, +#else + .set_pte = xen_set_pte_init, +#endif .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, @@ -1178,21 +1312,32 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, +#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud_hyper, .pte_clear = xen_pte_clear, .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, +#if PAGETABLE_LEVELS == 4 + .pud_val = xen_pud_val, + .make_pud = xen_make_pud, + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pte_init, + .release_pud = xen_release_pte_init, +#endif /* PAGETABLE_LEVELS == 4 */ + .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, .exit_mmap = xen_exit_mmap, @@ -1205,21 +1350,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_fixmap = xen_set_fixmap, }; -#ifdef CONFIG_SMP -static const struct smp_ops xen_smp_ops __initdata = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, - .smp_cpus_done = xen_smp_cpus_done, - - .smp_send_stop = xen_smp_send_stop, - .smp_send_reschedule = xen_smp_send_reschedule, - - .send_call_func_ipi = xen_smp_send_call_function_ipi, - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, -}; -#endif /* CONFIG_SMP */ - static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -1264,15 +1394,219 @@ static const struct machine_ops __initdata xen_machine_ops = { static void __init xen_reserve_top(void) { +#ifdef CONFIG_X86_32 unsigned long top = HYPERVISOR_VIRT_START; struct xen_platform_parameters pp; if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) top = pp.virt_start; - reserve_top_address(-top + 2 * PAGE_SIZE); + reserve_top_address(-top); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. 
+ */ +static void *__ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_PFN_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; } +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +static void set_page_prot(void *addr, pgprot_t prot) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + BUG(); +} + +static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + ident_pte = 0; + pfn = 0; + for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for(i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} + +/* + * Set up the inital kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. 
+ */ +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + /* Pre-constructed entries are in pfn, so convert to mfn */ + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + /* Set up identity map */ + xen_map_identity_early(level2_ident_pgt, max_pfn); + + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* Switch over */ + pgd = init_level4_pgt; + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + + reserve_early(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); + + return pgd; +} +#else /* !CONFIG_X86_64 */ +static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; + +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + init_pg_tables_start = __pa(pgd); + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + xen_write_cr3(__pa(swapper_pg_dir)); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + + return swapper_pg_dir; +} +#endif /* CONFIG_X86_64 */ + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1281,6 +1615,8 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_start_info) return; + xen_domain_type = XEN_PV_DOMAIN; + BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); xen_setup_features(); @@ -1290,10 +1626,18 @@ asmlinkage void __init xen_start_kernel(void) pv_init_ops = xen_init_ops; pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + xen_init_irq_ops(); + +#ifdef 
CONFIG_X86_LOCAL_APIC + /* + * set up the basic apic ops. + */ + apic_ops = &xen_basic_apic_ops; +#endif + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; @@ -1301,60 +1645,69 @@ asmlinkage void __init xen_start_kernel(void) machine_ops = xen_machine_ops; -#ifdef CONFIG_SMP - smp_ops = xen_smp_ops; +#ifdef CONFIG_X86_64 + /* Disable until direct per-cpu data access. */ + have_vcpu_info_placement = 0; + x86_64_init_pda(); #endif + xen_smp_init(); + /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) xen_build_dynamic_phys_to_machine(); pgd = (pgd_t *)xen_start_info->pt_base; - init_pg_tables_start = __pa(pgd); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; - - init_mm.pgd = pgd; /* use the Xen pagetables to start */ - - /* keep using Xen gdt for now; no urgent need to change it */ - - x86_write_percpu(xen_cr3, __pa(pgd)); - x86_write_percpu(xen_current_cr3, __pa(pgd)); + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!xen_initial_domain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + xen_raw_console_write("mapping kernel into physical memory\n"); + pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + + init_mm.pgd = pgd; + + /* keep using Xen gdt for now; no urgent need to change it */ + pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - /* set the limit of our address space */ xen_reserve_top(); +#ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); new_cpu_data.hard_math = 1; new_cpu_data.x86_capability[0] = cpuid_edx(1); +#endif /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; boot_params.hdr.ramdisk_image = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; + boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - if (!is_initial_xendomain()) { + if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); } + xen_raw_console_write("about to get started...\n"); + /* Start the world */ +#ifdef CONFIG_X86_32 i386_start_kernel(); +#else + x86_64_start_reservations((char *)__pa_symbol(&boot_params)); +#endif } diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 000000000000..bb042608c602 --- /dev/null +++ b/arch/x86/xen/irq.c @@ -0,0 +1,141 @@ +#include <linux/hardirq.h> + +#include <xen/interface/xen.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include "xen-ops.h" + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. 
+ */ +void xen_force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} + +static void __init __xen_init_IRQ(void) +{ + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } + + xen_init_IRQ(); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* We don't need to worry about being preempted here, since + either a) interrupts are disabled, so no preemption, or b) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. */ + + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). 
*/ + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = __xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = xen_adjust_exception_frame, +#endif +}; + +void __init xen_init_irq_ops() +{ + pv_irq_ops = xen_irq_ops; +} diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ff0aa74afaa1..d4d52f5a1cf7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -40,12 +40,15 @@ */ #include <linux/sched.h> #include <linux/highmem.h> +#include <linux/debugfs.h> #include <linux/bug.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> +#include <asm/fixmap.h> #include <asm/mmu_context.h> #include <asm/paravirt.h> +#include <asm/linkage.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -55,27 +58,85 @@ #include "multicalls.h" #include "mmu.h" +#include "debugfs.h" + +#define MMU_UPDATE_HISTO 30 + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct { + u32 pgd_update; + u32 pgd_update_pinned; + u32 pgd_update_batched; + + u32 pud_update; + u32 pud_update_pinned; + u32 pud_update_batched; + + u32 pmd_update; + u32 pmd_update_pinned; + u32 pmd_update_batched; + + u32 pte_update; + u32 pte_update_pinned; + u32 pte_update_batched; + + u32 mmu_update; + u32 mmu_update_extended; + u32 mmu_update_histo[MMU_UPDATE_HISTO]; + + u32 prot_commit; + u32 prot_commit_batched; + + u32 set_pte_at; + u32 set_pte_at_batched; + u32 set_pte_at_pinned; + u32 set_pte_at_current; + u32 set_pte_at_kernel; +} mmu_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mmu_stats, 0, sizeof(mmu_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); mmu_stats.elem += (val); } while(0) + +#else /* !CONFIG_XEN_DEBUG_FS */ + +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +#endif /* CONFIG_XEN_DEBUG_FS */ + +/* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. + */ +#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] - __attribute__((section(".data.page_aligned"))) = +static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] - __attribute__((section(".data.page_aligned"))) = +static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = { [ 0 ... 
TOP_ENTRIES - 1] = &p2m_missing[0] }; /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; -static unsigned long p2m_top_mfn_list[ - PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] + __page_aligned_bss; static inline unsigned p2m_top_index(unsigned long pfn) { @@ -181,15 +242,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } -xmaddr_t arbitrary_virt_to_machine(unsigned long address) +xmaddr_t arbitrary_virt_to_machine(void *vaddr) { + unsigned long address = (unsigned long)vaddr; unsigned int level; pte_t *pte = lookup_address(address, &level); unsigned offset = address & ~PAGE_MASK; BUG_ON(pte == NULL); - return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); + return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); } void make_lowmem_page_readonly(void *vaddr) @@ -223,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr) } -static bool page_pinned(void *ptr) +static bool xen_page_pinned(void *ptr) { struct page *page = virt_to_page(ptr); return PagePinned(page); } -static void extend_mmu_update(const struct mmu_update *update) +static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - if (mcs.mc != NULL) + if (mcs.mc != NULL) { + ADD_STATS(mmu_update_extended, 1); + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); + mcs.mc->args[1]++; - else { + + if (mcs.mc->args[1] < MMU_UPDATE_HISTO) + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); + else + ADD_STATS(mmu_update_histo[0], 1); + } else { + ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; @@ -256,9 +328,12 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); + + ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -267,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) void xen_set_pmd(pmd_t *ptr, pmd_t val) { + ADD_STATS(pmd_update, 1); + /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } + ADD_STATS(pmd_update_pinned, 1); + xen_set_pmd_hyper(ptr, val); } @@ -283,35 +362,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) */ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - /* <mfn,flags> stored as-is, to permit clearing entries */ - xen_set_pte(pte, mfn_pte(mfn, flags)); - - /* - * It's enough to flush this one mapping. 
- * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); } void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, @@ -321,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, if (mm == &init_mm) preempt_disable(); + ADD_STATS(set_pte_at, 1); +// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); + ADD_STATS(set_pte_at_current, mm == current->mm); + ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + ADD_STATS(set_pte_at_batched, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); goto out; } else @@ -355,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); + + ADD_STATS(prot_commit, 1); + ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -364,8 +424,8 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_MASK; + unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; } @@ -375,8 +435,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) static pteval_t pte_pfn_to_mfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_MASK; + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; } @@ -418,9 +478,12 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); + + ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -429,26 +492,39 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) void xen_set_pud(pud_t *ptr, pud_t val) { + ADD_STATS(pud_update, 1); + /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } + ADD_STATS(pud_update_pinned, 1); + xen_set_pud_hyper(ptr, val); } void xen_set_pte(pte_t *ptep, pte_t pte) { + ADD_STATS(pte_update, 1); +// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); + ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + +#ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; +#else + *ptep = pte; +#endif } +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - set_64bit((u64 *)ptep, pte_val_ma(pte)); + set_64bit((u64 *)ptep, native_pte_val(pte)); } void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -462,6 +538,7 @@ void xen_pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); } +#endif /* CONFIG_X86_PAE */ pmd_t xen_make_pmd(pmdval_t pmd) { @@ -469,95 +546,218 @@ pmd_t xen_make_pmd(pmdval_t pmd) return native_make_pmd(pmd); } +#if 
PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud) +{ + return pte_mfn_to_pfn(pud.pud); +} + +pud_t xen_make_pud(pudval_t pud) +{ + pud = pte_pfn_to_mfn(pud); + + return native_make_pud(pud); +} + +pgd_t *xen_get_user_pgd(pgd_t *pgd) +{ + pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); + unsigned offset = pgd - pgd_page; + pgd_t *user_ptr = NULL; + + if (offset < pgd_index(USER_LIMIT)) { + struct page *page = virt_to_page(pgd_page); + user_ptr = (pgd_t *)page->private; + if (user_ptr) + user_ptr += offset; + } + + return user_ptr; +} + +static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pgd_val_ma(val); + xen_extend_mmu_update(&u); +} + /* - (Yet another) pagetable walker. This one is intended for pinning a - pagetable. This means that it walks a pagetable and calls the - callback function on each page it finds making up the page table, - at every level. It walks the entire pagetable, but it only bothers - pinning pte pages which are below pte_limit. In the normal case - this will be TASK_SIZE, but at boot we need to pin up to - FIXADDR_TOP. But the important bit is that we don't pin beyond - there, because then we start getting into Xen's ptes. -*/ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), - unsigned long limit) -{ - pgd_t *pgd = pgd_base; + * Raw hypercall-based set_pgd, intended for in early boot before + * there's a page structure. This implies: + * 1. The only existing pagetable is the kernel's + * 2. It is always pinned + * 3. It has no user pagetable attached to it + */ +void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + preempt_disable(); + + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +void xen_set_pgd(pgd_t *ptr, pgd_t val) +{ + pgd_t *user_ptr = xen_get_user_pgd(ptr); + + ADD_STATS(pgd_update, 1); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + if (user_ptr) { + WARN_ON(xen_page_pinned(user_ptr)); + *user_ptr = val; + } + return; + } + + ADD_STATS(pgd_update_pinned, 1); + ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + + /* If it's pinned, then we can at least batch the kernel and + user updates together. */ + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); + if (user_ptr) + __xen_set_pgd_hyper(user_ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} +#endif /* PAGETABLE_LEVELS == 4 */ + +/* + * (Yet another) pagetable walker. This one is intended for pinning a + * pagetable. This means that it walks a pagetable and calls the + * callback function on each page it finds making up the page table, + * at every level. It walks the entire pagetable, but it only bothers + * pinning pte pages which are below limit. In the normal case this + * will be STACK_TOP_MAX, but at boot we need to pin up to + * FIXADDR_TOP. + * + * For 32-bit the important bit is that we don't pin beyond there, + * because then we start getting into Xen's ptes. + * + * For 64-bit, we must skip the Xen hole in the middle of the address + * space, just after the big x86-64 virtual hole. 
+ */ +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) +{ + pgd_t *pgd = mm->pgd; int flush = 0; - unsigned long addr = 0; - unsigned long pgd_next; + unsigned hole_low, hole_high; + unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; + unsigned pgdidx, pudidx, pmdidx; - BUG_ON(limit > FIXADDR_TOP); + /* The limit is the last byte to be touched */ + limit--; + BUG_ON(limit >= FIXADDR_TOP); if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + /* + * 64-bit has a great big hole in the middle of the address + * space, which contains the Xen mappings. On 32-bit these + * will end up making a zero-sized hole and so is a no-op. + */ + hole_low = pgd_index(USER_LIMIT); + hole_high = pgd_index(PAGE_OFFSET); + + pgdidx_limit = pgd_index(limit); +#if PTRS_PER_PUD > 1 + pudidx_limit = pud_index(limit); +#else + pudidx_limit = 0; +#endif +#if PTRS_PER_PMD > 1 + pmdidx_limit = pmd_index(limit); +#else + pmdidx_limit = 0; +#endif + + for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; - unsigned long pud_limit, pud_next; - pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + if (pgdidx >= hole_low && pgdidx < hole_high) + continue; - if (!pgd_val(*pgd)) + if (!pgd_val(pgd[pgdidx])) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), PT_PUD); + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - for (; addr != pud_limit; pud++, addr = pud_next) { + for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; - unsigned long pmd_limit; - - pud_next = pud_addr_end(addr, pud_limit); - if (pud_next < limit) - pmd_limit = pud_next; - else - pmd_limit = limit; + if (pgdidx == pgdidx_limit && + pudidx > pudidx_limit) + goto out; - if (pud_none(*pud)) + if (pud_none(pud[pudidx])) continue; - pmd = pmd_offset(pud, 0); + pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), PT_PMD); + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { + struct page *pte; - for (; addr != pmd_limit; pmd++) { - addr += (PAGE_SIZE * PTRS_PER_PTE); - if ((pmd_limit-1) < (addr-1)) { - addr = pmd_limit; - break; - } + if (pgdidx == pgdidx_limit && + pudidx == pudidx_limit && + pmdidx > pmdidx_limit) + goto out; - if (pmd_none(*pmd)) + if (pmd_none(pmd[pmdidx])) continue; - flush |= (*func)(pmd_page(*pmd), PT_PTE); + pte = pmd_page(pmd[pmdidx]); + flush |= (*func)(mm, pte, PT_PTE); } } } - flush |= (*func)(virt_to_page(pgd_base), PT_PGD); +out: + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); return flush; } -static spinlock_t *lock_pte(struct page *page) +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. 
*/ +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); - spin_lock(ptl); + spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif return ptl; } -static void do_unlock(void *v) +static void xen_pte_unlock(void *v) { spinlock_t *ptl = v; spin_unlock(ptl); @@ -575,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -594,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level) flush = 0; + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ ptl = NULL; if (level == PT_PTE) - ptl = lock_pte(page); + ptl = xen_pte_lock(page, mm); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), level == PT_PGD ? UVMF_TLB_FLUSH : 0); - if (level == PT_PTE) + if (ptl) { xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - if (ptl) { /* Queue a deferred unlock for when this batch is completed. */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -618,26 +838,54 @@ static int pin_page(struct page *page, enum pt_level level) /* This is called just after a mm has been created, but it has not been used yet. We need to make sure that its pagetable is all read-only, and can be pinned. */ -void xen_pgd_pin(pgd_t *pgd) +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); + vm_unmap_aliases(); xen_mc_batch(); } +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); + + if (user_pgd) { + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + } + } +#else /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is pinnable */ + xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); +#endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); +#endif /* CONFIG_X86_64 */ xen_mc_issue(0); } +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + /* * On save, we need to pin all pagetables to make sure they get their * mfns turned into pfns. 
Search the list for any unpinned pgds and pin * them (unpinned pgds are not currently in use, probably because the * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. */ void xen_mm_pin_all(void) { @@ -648,7 +896,7 @@ void xen_mm_pin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { - xen_pgd_pin((pgd_t *)page_address(page)); + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); SetPageSavePinned(page); } } @@ -656,10 +904,13 @@ void xen_mm_pin_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } -/* The init_mm pagetable is really pinned as soon as its created, but - that's before we have page structures to store the bits. So do all - the book-keeping now. */ -static __init int mark_pinned(struct page *page, enum pt_level level) +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now. + */ +static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) { SetPagePinned(page); return 0; @@ -667,10 +918,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -680,10 +932,18 @@ static int unpin_page(struct page *page, enum pt_level level) spinlock_t *ptl = NULL; struct multicall_space mcs; + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ if (level == PT_PTE) { - ptl = lock_pte(page); + ptl = xen_pte_lock(page, mm); - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); } mcs = __xen_mc_entry(0); @@ -694,7 +954,7 @@ static int unpin_page(struct page *page, enum pt_level level) if (ptl) { /* unlock when batch completed */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -702,17 +962,39 @@ static int unpin_page(struct page *page, enum pt_level level) } /* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - pgd_walk(pgd, unpin_page, TASK_SIZE); +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) { + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); + } + } +#endif + +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is unpinned */ + xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); +#endif + + xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + /* * On resume, undo any pinning done at save, so that the rest of the * kernel doesn't see any unexpected pinned pagetables. 
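Unpinning reverses the order, as the comment in xen_unpin_page() explains: with split pte locks the ptlock must be held while the page is unpinned but still read-only, so no concurrent updater can see it in that partially-pinned state. A fragment showing the mirrored sequence (same simplifications as the pinning sketch above):

    ptl = xen_pte_lock(page, mm);
    if (ptl)
            xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);    /* unpin while still RO */

    mcs = __xen_mc_entry(0);
    MULTI_update_va_mapping(mcs.mc, (unsigned long)page_address(page),
                            pfn_pte(pfn, PAGE_KERNEL), 0);  /* back to RW */

    if (ptl)
            xen_mc_callback(xen_pte_unlock, ptl);   /* unlock after the batch */

Note also that __xen_pgd_unpin() unpins the top level first and walks afterwards, the exact converse of the pin path, which walks first and pins the PGD last.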
@@ -727,8 +1009,7 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - printk("unpinning pinned %p\n", page_address(page)); - xen_pgd_unpin((pgd_t *)page_address(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); ClearPageSavePinned(page); } } @@ -739,14 +1020,14 @@ void xen_mm_unpin_all(void) void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); + xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); + xen_pgd_pin(mm); spin_unlock(&mm->page_table_lock); } @@ -757,8 +1038,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static void drop_other_mm_ref(void *info) { struct mm_struct *mm = info; + struct mm_struct *active_mm; - if (__get_cpu_var(cpu_tlbstate).active_mm == mm) +#ifdef CONFIG_X86_64 + active_mm = read_pda(active_mm); +#else + active_mm = __get_cpu_var(cpu_tlbstate).active_mm; +#endif + + if (active_mm == mm) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure @@ -769,7 +1057,7 @@ static void drop_other_mm_ref(void *info) } } -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_t mask; unsigned cpu; @@ -799,7 +1087,7 @@ static void drop_mm_ref(struct mm_struct *mm) smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { if (current->active_mm == mm) load_cr3(swapper_pg_dir); @@ -823,14 +1111,77 @@ static void drop_mm_ref(struct mm_struct *mm) void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); + xen_drop_mm_ref(mm); put_cpu(); spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (page_pinned(mm->pgd)) - xen_pgd_unpin(mm->pgd); + if (xen_page_pinned(mm->pgd)) + xen_pgd_unpin(mm); spin_unlock(&mm->page_table_lock); } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mmu_debug; + +static int __init xen_mmu_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); + + debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); + debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + + debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); + debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + + debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); + debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + + debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); +// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, +// &mmu_stats.pte_update_pinned); + debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, + 
&mmu_stats.pte_update_pinned); + + debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); + debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, + &mmu_stats.mmu_update_extended); + xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, + mmu_stats.mmu_update_histo, 20); + + debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); + debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_batched); + debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_current); + debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_kernel); + + debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); + debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, + &mmu_stats.prot_commit_batched); + + return 0; +} +fs_initcall(xen_mmu_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 297bf9f5b8bc..98d71659da5a 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -10,18 +10,6 @@ enum pt_level { PT_PTE }; -/* - * Page-directory addresses above 4GB do not fit into architectural %cr3. - * When accessing %cr3, or equivalent field in vcpu_guest_context, guests - * must use the following accessor macros to pack/unpack valid MFNs. - * - * Note that Xen is using the fact that the pagetable base is always - * page-aligned, and putting the 12 MSB of the address into the 12 LSB - * of cr3. - */ -#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) -#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) - void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -30,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); void xen_exit_mmap(struct mm_struct *mm); -void xen_pgd_pin(pgd_t *pgd); -//void xen_pgd_unpin(pgd_t *pgd); - pteval_t xen_pte_val(pte_t); pmdval_t xen_pmd_val(pmd_t); pgdval_t xen_pgd_val(pgd_t); @@ -44,13 +29,26 @@ pgd_t xen_make_pgd(pgdval_t); void xen_set_pte(pte_t *ptep, pte_t pteval); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); + +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); +#endif /* CONFIG_X86_PAE */ + void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud(pud_t *ptr, pud_t val); void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud_hyper(pud_t *ptr, pud_t val); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); + +#if PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud); +pud_t xen_make_pud(pudval_t pudval); +void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); +void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); +#endif + +pgd_t *xen_get_user_pgd(pgd_t *pgd); pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 3c63c4da7ed1..8ea8a0d0b0de 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -21,16 +21,20 @@ */ #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/debugfs.h> #include <asm/xen/hypercall.h> #include "multicalls.h" +#include 
"debugfs.h" + +#define MC_BATCH 32 #define MC_DEBUG 1 -#define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16) + struct mc_buffer { struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG @@ -47,6 +51,76 @@ struct mc_buffer { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +/* flush reasons 0- slots, 1- args, 2- callbacks */ +enum flush_reasons +{ + FL_SLOTS, + FL_ARGS, + FL_CALLBACKS, + + FL_N_REASONS +}; + +#ifdef CONFIG_XEN_DEBUG_FS +#define NHYPERCALLS 40 /* not really */ + +static struct { + unsigned histo[MC_BATCH+1]; + + unsigned issued; + unsigned arg_total; + unsigned hypercalls; + unsigned histo_hypercalls[NHYPERCALLS]; + + unsigned flush[FL_N_REASONS]; +} mc_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mc_stats, 0, sizeof(mc_stats)); + zero_stats = 0; + } +} + +static void mc_add_stats(const struct mc_buffer *mc) +{ + int i; + + check_zero(); + + mc_stats.issued++; + mc_stats.hypercalls += mc->mcidx; + mc_stats.arg_total += mc->argidx; + + mc_stats.histo[mc->mcidx]++; + for(i = 0; i < mc->mcidx; i++) { + unsigned op = mc->entries[i].op; + if (op < NHYPERCALLS) + mc_stats.histo_hypercalls[op]++; + } +} + +static void mc_stats_flush(enum flush_reasons idx) +{ + check_zero(); + + mc_stats.flush[idx]++; +} + +#else /* !CONFIG_XEN_DEBUG_FS */ + +static inline void mc_add_stats(const struct mc_buffer *mc) +{ +} + +static inline void mc_stats_flush(enum flush_reasons idx) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); @@ -60,6 +134,8 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); + mc_add_stats(b); + if (b->mcidx) { #if MC_DEBUG memcpy(b->debug, b->entries, @@ -76,6 +152,7 @@ void xen_mc_flush(void) if (ret) { printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", ret, smp_processor_id()); + dump_stack(); for (i = 0; i < b->mcidx; i++) { printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", i+1, b->mcidx, @@ -114,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args) if (b->mcidx == MC_BATCH || (argidx + args) > MC_ARGS) { + mc_stats_flush(b->mcidx == MC_BATCH ? 
FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); } @@ -157,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) + if (b->cbidx == MC_BATCH) { + mc_stats_flush(FL_CALLBACKS); xen_mc_flush(); + } cb = &b->callbacks[b->cbidx++]; cb->fn = fn; cb->data = data; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mc_debug; + +static int __init xen_mc_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mc_debug = debugfs_create_dir("multicalls", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); + + debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); + debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); + debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); + + xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, + mc_stats.histo, MC_BATCH); + xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, + mc_stats.histo_hypercalls, NHYPERCALLS); + xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, + mc_stats.flush, FL_N_REASONS); + + return 0; +} +fs_initcall(xen_mc_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e0a39595bde3..d67901083888 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -42,7 +42,7 @@ char * __init xen_memory_setup(void) e820.nr_map = 0; - e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); + e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); /* * Even though this is normal, usable memory under Xen, reserve @@ -83,30 +83,72 @@ static void xen_idle(void) /* * Set the bit indicating "nosegneg" library variants should be used. + * We only need to bother in pure 32-bit mode; compat 32-bit processes + * can have un-truncated segments, so wrapping around is allowed. 
*/ static void __init fiddle_vdso(void) { - extern const char vdso32_default_start; - u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); +#ifdef CONFIG_X86_32 + u32 *mask; + mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; + mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +#endif } -void xen_enable_sysenter(void) +static __cpuinit int register_callback(unsigned type, const void *func) { - int cpu = smp_processor_id(); - extern void xen_sysenter_target(void); - /* Mask events on entry, even though they get enabled immediately */ - static struct callback_register sysenter = { - .type = CALLBACKTYPE_sysenter, - .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, + struct callback_register callback = { + .type = type, + .address = XEN_CALLBACK(__KERNEL_CS, func), .flags = CALLBACKF_mask_events, }; - if (!boot_cpu_has(X86_FEATURE_SEP) || - HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { - clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); + return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); +} + +void __cpuinit xen_enable_sysenter(void) +{ + extern void xen_sysenter_target(void); + int ret; + unsigned sysenter_feature; + +#ifdef CONFIG_X86_32 + sysenter_feature = X86_FEATURE_SEP; +#else + sysenter_feature = X86_FEATURE_SYSENTER32; +#endif + + if (!boot_cpu_has(sysenter_feature)) + return; + + ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); + if(ret != 0) + setup_clear_cpu_cap(sysenter_feature); +} + +void __cpuinit xen_enable_syscall(void) +{ +#ifdef CONFIG_X86_64 + int ret; + extern void xen_syscall_target(void); + extern void xen_syscall32_target(void); + + ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); + if (ret != 0) { + printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); + /* Pretty fatal; 64-bit userspace has no other + mechanism for syscalls. */ } + + if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { + ret = register_callback(CALLBACKTYPE_syscall32, + xen_syscall32_target); + if (ret != 0) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); + } +#endif /* CONFIG_X86_64 */ } void __init xen_arch_setup(void) @@ -120,10 +162,12 @@ void __init xen_arch_setup(void) if (!xen_feature(XENFEAT_auto_translated_physmap)) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); - HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, - __KERNEL_CS, (unsigned long)xen_failsafe_callback); + if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || + register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) + BUG(); xen_enable_sysenter(); + xen_enable_syscall(); set_iopl.iopl = 1; rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); @@ -143,11 +187,6 @@ void __init xen_arch_setup(void) pm_idle = xen_idle; -#ifdef CONFIG_SMP - /* fill cpus_possible with all available cpus */ - xen_fill_possible_map(); -#endif - paravirt_disable_iospace(); fiddle_vdso(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 233156f39b7f..d77da613b1d2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -11,8 +11,6 @@ * useful topology information for the kernel to make use of. As a * result, all CPUs are treated as if they're single-core and * single-threaded. - * - * This does not handle HOTPLUG_CPU yet. 
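register_callback() above collapses four near-identical registrations (event, failsafe, sysenter and, on 64-bit, syscall) into one wrapper around HYPERVISOR_callback_op(). For the optional fast-system-call entries, a registration failure simply degrades the feature rather than being fatal. A usage fragment in the shape of xen_enable_sysenter() above:

    /* Optional fast path: if Xen refuses the callback, fall back to
       int 0x80 by clearing the feature bit (32-bit case shown). */
    if (boot_cpu_has(X86_FEATURE_SEP) &&
        register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target) != 0)
            setup_clear_cpu_cap(X86_FEATURE_SEP);

The event and failsafe callbacks get the opposite treatment in xen_arch_setup(): there is no fallback for those, so a failure is a BUG().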
*/ #include <linux/sched.h> #include <linux/err.h> @@ -61,22 +59,37 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static __cpuinit void cpu_bringup_and_idle(void) +static __cpuinit void cpu_bringup(void) { int cpu = smp_processor_id(); cpu_init(); + touch_softlockup_watchdog(); + preempt_disable(); + xen_enable_sysenter(); + xen_enable_syscall(); - preempt_disable(); - per_cpu(cpu_state, cpu) = CPU_ONLINE; + cpu = smp_processor_id(); + smp_store_cpu_info(cpu); + cpu_data(cpu).x86_max_cores = 1; + set_cpu_sibling_map(cpu); xen_setup_cpu_clockevents(); + cpu_set(cpu, cpu_online_map); + x86_write_percpu(cpu_state, CPU_ONLINE); + wmb(); + /* We can take interrupts now: we're officially "up". */ local_irq_enable(); wmb(); /* make sure everything is out */ +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + cpu_bringup(); cpu_idle(); } @@ -141,56 +154,39 @@ static int xen_smp_intr_init(unsigned int cpu) return rc; } -void __init xen_fill_possible_map(void) +static void __init xen_fill_possible_map(void) { int i, rc; for (i = 0; i < NR_CPUS; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) + if (rc >= 0) { + num_processors++; cpu_set(i, cpu_possible_map); + } } } -void __init xen_smp_prepare_boot_cpu(void) +static void __init xen_smp_prepare_boot_cpu(void) { - int cpu; - BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu__gdt_page); - - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_map lives in a per cpu area that is cleared - * when the per cpu array is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } + make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); xen_setup_vcpu_info_placement(); } -void __init xen_smp_prepare_cpus(unsigned int max_cpus) +static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_ map will be zeroed when the per - * cpu area is allocated. 
- * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } + xen_init_lock_cpu(0); smp_store_cpu_info(0); + cpu_data(0).x86_max_cores = 1; set_cpu_sibling_map(0); if (xen_smp_intr_init(0)) @@ -217,15 +213,13 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus) cpu_set(cpu, cpu_present_map); } - - //init_xenbus_allowed_cpumask(); } static __cpuinit int cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; - struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + struct desc_struct *gdt; if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) return 0; @@ -234,12 +228,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) if (ctxt == NULL) return -ENOMEM; + gdt = get_cpu_gdt_table(cpu); + ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = 0; ctxt->user_regs.ss = __KERNEL_DS; +#ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; +#endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ @@ -249,11 +246,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); - make_lowmem_page_readonly(gdt->gdt); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt); - ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); - ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); @@ -261,9 +258,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->kernel_ss = __KERNEL_DS; ctxt->kernel_sp = idle->thread.sp0; +#ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; - ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_cs = __KERNEL_CS; +#endif + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); @@ -276,21 +275,33 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -int __cpuinit xen_cpu_up(unsigned int cpu) +static int __cpuinit xen_cpu_up(unsigned int cpu) { struct task_struct *idle = idle_task(cpu); int rc; -#if 0 - rc = cpu_up_check(cpu); - if (rc) - return rc; +#ifdef CONFIG_X86_64 + /* Allocate node local memory for AP pdas */ + WARN_ON(cpu == 0); + if (cpu > 0) { + rc = get_local_pda(cpu); + if (rc) + return rc; + } #endif +#ifdef CONFIG_X86_32 init_gdt(cpu); per_cpu(current_task, cpu) = idle; irq_ctx_init(cpu); +#else + cpu_pda(cpu)->pcurrent = idle; + clear_tsk_thread_flag(idle, TIF_FORK); +#endif xen_setup_timer(cpu); + xen_init_lock_cpu(cpu); + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -306,23 +317,75 @@ int __cpuinit xen_cpu_up(unsigned int cpu) if (rc) return rc; - smp_store_cpu_info(cpu); - set_cpu_sibling_map(cpu); - /* This must be done before setting cpu_online_map */ - wmb(); - - cpu_set(cpu, cpu_online_map); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); + while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + barrier(); + } + + return 0; +} + +static void xen_smp_cpus_done(unsigned int max_cpus) +{ +} + +#ifdef CONFIG_HOTPLUG_CPU +static int 
xen_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); return 0; } -void xen_smp_cpus_done(unsigned int max_cpus) +static void xen_cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + + if (num_online_cpus() == 1) + alternatives_smp_switch(0); +} + +static void xen_play_dead(void) +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + cpu_bringup(); +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static int xen_cpu_disable(void) +{ + return -ENOSYS; +} + +static void xen_cpu_die(unsigned int cpu) { + BUG(); } +static void xen_play_dead(void) +{ + BUG(); +} + +#endif static void stop_self(void *v) { int cpu = smp_processor_id(); @@ -335,12 +398,12 @@ static void stop_self(void *v) BUG(); } -void xen_smp_send_stop(void) +static void xen_smp_send_stop(void) { smp_call_function(stop_self, NULL, 0); } -void xen_smp_send_reschedule(int cpu) +static void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } @@ -351,18 +414,18 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) cpus_and(mask, mask, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) xen_send_IPI_one(cpu, vector); } -void xen_smp_send_call_function_ipi(cpumask_t mask) +static void xen_smp_send_call_function_ipi(cpumask_t mask) { int cpu; xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); /* Make sure other vcpus get a chance to run if they need to. 
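Note the bringup handshake between the hunks above: the boot CPU marks the target CPU_UP_PREPARE, kicks the vcpu with VCPUOP_up, then yields until cpu_bringup() on the new vcpu publishes CPU_ONLINE; xen_play_dead() closes the hotplug loop by parking the vcpu with VCPUOP_down and rerunning cpu_bringup() when the vcpu is next brought up. Condensed, with both sides taken from the code above:

    /* boot-CPU side, from xen_cpu_up() */
    per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
    BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
    while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
            HYPERVISOR_sched_op(SCHEDOP_yield, 0);
            barrier();
    }

    /* offlined-CPU side, from xen_play_dead() */
    play_dead_common();
    HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
    cpu_bringup();      /* resumes here when the vcpu is re-upped */

Splitting cpu_bringup() out of cpu_bringup_and_idle() is what makes this re-entry possible: the revived vcpu re-initialises itself and falls back into its existing idle loop rather than starting a fresh one.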
*/ - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { if (xen_vcpu_stolen(cpu)) { HYPERVISOR_sched_op(SCHEDOP_yield, 0); break; @@ -370,7 +433,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask) } } -void xen_smp_send_call_function_single_ipi(int cpu) +static void xen_smp_send_call_function_single_ipi(int cpu) { xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); } @@ -379,7 +442,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; @@ -389,8 +456,36 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_single_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; } + +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .smp_cpus_done = xen_smp_cpus_done, + + .cpu_up = xen_cpu_up, + .cpu_die = xen_cpu_die, + .cpu_disable = xen_cpu_disable, + .play_dead = xen_play_dead, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, +}; + +void __init xen_smp_init(void) +{ + smp_ops = xen_smp_ops; + xen_fill_possible_map(); + xen_init_spinlocks(); +} diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c new file mode 100644 index 000000000000..5601506f2dd9 --- /dev/null +++ b/arch/x86/xen/spinlock.c @@ -0,0 +1,428 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. 
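Making all of these functions static and exporting only xen_smp_init() works because generic x86 code never calls them by name: it dispatches through the smp_ops table (and, for the new lock code, through pv_lock_ops). A sketch of that dispatch, assuming the 2.6.27-era smp_ops hook layout:

    /* Sketch: the hotplug core's __cpu_up() indirects through smp_ops,
       so after xen_smp_init() it lands in xen_cpu_up() above. */
    static inline int cpu_up_dispatch(unsigned int cpu)
    {
            return smp_ops.cpu_up(cpu);     /* == xen_cpu_up under Xen */
    }

xen_init_spinlocks(), called at the end of xen_smp_init(), does the same kind of rewiring for spinlocks via pv_lock_ops, as the new spinlock.c below shows.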
+ */ +#include <linux/kernel_stat.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/log2.h> + +#include <asm/paravirt.h> + +#include <xen/interface/xen.h> +#include <xen/events.h> + +#include "xen-ops.h" +#include "debugfs.h" + +#ifdef CONFIG_XEN_DEBUG_FS +static struct xen_spinlock_stats +{ + u64 taken; + u32 taken_slow; + u32 taken_slow_nested; + u32 taken_slow_pickup; + u32 taken_slow_spurious; + u32 taken_slow_irqenable; + + u64 released; + u32 released_slow; + u32 released_slow_kicked; + +#define HISTO_BUCKETS 30 + u32 histo_spin_total[HISTO_BUCKETS+1]; + u32 histo_spin_spinning[HISTO_BUCKETS+1]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + + u64 time_total; + u64 time_spinning; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static unsigned lock_timeout = 1 << 10; +#define TIMEOUT lock_timeout + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); spinlock_stats.elem += (val); } while(0) + +static inline u64 spin_time_start(void) +{ + return xen_clocksource_read(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index = ilog2(delta); + + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_spinning(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); + spinlock_stats.time_spinning += delta; +} + +static inline void spin_time_accum_total(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_total); + spinlock_stats.time_total += delta; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} +#else /* !CONFIG_XEN_DEBUG_FS */ +#define TIMEOUT (1 << 10) +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_total(u64 start) +{ +} +static inline void spin_time_accum_spinning(u64 start) +{ +} +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + +struct xen_spinlock { + unsigned char lock; /* 0 -> free; 1 -> locked */ + unsigned short spinners; /* count of waiting cpus */ +}; + +static int xen_spin_is_locked(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + return xl->lock != 0; +} + +static int xen_spin_is_contended(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + /* Not strictly true; this is only the count of contended + lock-takers entering the slow path. */ + return xl->spinners != 0; +} + +static int xen_spin_trylock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + u8 old = 1; + + asm("xchgb %b0,%1" + : "+q" (old), "+m" (xl->lock) : : "memory"); + + return old == 0; +} + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); + +/* + * Mark a cpu as interested in a lock. Returns the CPU's previous + * lock of interest, in case we got preempted by an interrupt. 
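The lock itself is a byte (0 free, 1 taken) plus an advisory count of slow-path waiters; xen_spin_trylock() is a single locked xchgb. A portable C model of the fast path, using a GCC builtin in place of the inline asm purely for illustration (this is not the kernel code):

    struct byte_lock_model {
            unsigned char lock;        /* 0 -> free; 1 -> locked */
            unsigned short spinners;   /* waiters blocked in Xen */
    };

    static int trylock_model(struct byte_lock_model *xl)
    {
            /* atomic swap-in of 1; we own the lock iff the old value was 0 */
            return __sync_lock_test_and_set(&xl->lock, 1) == 0;
    }

    static void unlock_model(struct byte_lock_model *xl)
    {
            __sync_lock_release(&xl->lock);   /* store 0, release ordering */
            if (xl->spinners) {
                    /* someone is blocked on the kicker irq: send
                       XEN_SPIN_UNLOCK_VECTOR, as xen_spin_unlock_slow()
                       does below */
            }
    }

As the xen_spin_is_contended() comment notes, spinners only counts waiters that reached the slow path, so it understates contention; it exists mainly so an unlock can tell whether anyone needs kicking at all.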
+ */ +static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) +{ + struct xen_spinlock *prev; + + prev = __get_cpu_var(lock_spinners); + __get_cpu_var(lock_spinners) = xl; + + wmb(); /* set lock of interest before count */ + + asm(LOCK_PREFIX " incw %0" + : "+m" (xl->spinners) : : "memory"); + + return prev; +} + +/* + * Mark a cpu as no longer interested in a lock. Restores previous + * lock of interest (NULL for none). + */ +static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) +{ + asm(LOCK_PREFIX " decw %0" + : "+m" (xl->spinners) : : "memory"); + wmb(); /* decrement count before restoring lock */ + __get_cpu_var(lock_spinners) = prev; +} + +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + struct xen_spinlock *prev; + int irq = __get_cpu_var(lock_kicker_irq); + int ret; + unsigned long flags; + u64 start; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return 0; + + start = spin_time_start(); + + /* announce we're spinning */ + prev = spinning_lock(xl); + + flags = __raw_local_save_flags(); + if (irq_enable) { + ADD_STATS(taken_slow_irqenable, 1); + raw_local_irq_enable(); + } + + ADD_STATS(taken_slow, 1); + ADD_STATS(taken_slow_nested, prev != NULL); + + do { + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) { + ADD_STATS(taken_slow_pickup, 1); + + /* + * If we interrupted another spinlock while it + * was blocking, make sure it doesn't block + * without rechecking the lock. + */ + if (prev != NULL) + xen_set_irq_pending(irq); + goto out; + } + + /* + * Block until irq becomes pending. If we're + * interrupted at this point (after the trylock but + * before entering the block), then the nested lock + * handler guarantees that the irq will be left + * pending if there's any chance the lock became free; + * xen_poll_irq() returns immediately if the irq is + * pending. 
+ */ + xen_poll_irq(irq); + ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); + } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ + + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + +out: + raw_local_irq_restore(flags); + unspinning_lock(xl, prev); + spin_time_accum_blocked(start); + + return ret; +} + +static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + unsigned timeout; + u8 oldval; + u64 start_spin; + + ADD_STATS(taken, 1); + + start_spin = spin_time_start(); + + do { + u64 start_spin_fast = spin_time_start(); + + timeout = TIMEOUT; + + asm("1: xchgb %1,%0\n" + " testb %1,%1\n" + " jz 3f\n" + "2: rep;nop\n" + " cmpb $0,%0\n" + " je 1b\n" + " dec %2\n" + " jnz 2b\n" + "3:\n" + : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) + : "1" (1) + : "memory"); + + spin_time_accum_spinning(start_spin_fast); + + } while (unlikely(oldval != 0 && + (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); + + spin_time_accum_total(start_spin); +} + +static void xen_spin_lock(struct raw_spinlock *lock) +{ + __xen_spin_lock(lock, false); +} + +static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); +} + +static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +{ + int cpu; + + ADD_STATS(released_slow, 1); + + for_each_online_cpu(cpu) { + /* XXX should mix up next cpu selection */ + if (per_cpu(lock_spinners, cpu) == xl) { + ADD_STATS(released_slow_kicked, 1); + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static void xen_spin_unlock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + ADD_STATS(released, 1); + + smp_wmb(); /* make sure no writes get moved after unlock */ + xl->lock = 0; /* release lock */ + + /* make sure unlock happens before kick */ + barrier(); + + if (unlikely(xl->spinners)) + xen_spin_unlock_slow(xl); +} + +static irqreturn_t dummy_handler(int irq, void *dev_id) +{ + BUG(); + return IRQ_HANDLED; +} + +void __cpuinit xen_init_lock_cpu(int cpu) +{ + int irq; + const char *name; + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + dummy_handler, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +void xen_uninit_lock_cpu(int cpu) +{ + unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); +} + +void __init xen_init_spinlocks(void) +{ + pv_lock_ops.spin_is_locked = xen_spin_is_locked; + pv_lock_ops.spin_is_contended = xen_spin_is_contended; + pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; + pv_lock_ops.spin_trylock = xen_spin_trylock; + pv_lock_ops.spin_unlock = xen_spin_unlock; +} + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_spin_debug; + +static int __init xen_spinlock_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); + + debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); + debugfs_create_u32("taken_slow", 0444, 
d_spin_debug, + &spinlock_stats.taken_slow); + debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, + &spinlock_stats.taken_slow_nested); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.taken_slow_pickup); + debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, + &spinlock_stats.taken_slow_spurious); + debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, + &spinlock_stats.taken_slow_irqenable); + + debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.released_slow); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.released_slow_kicked); + + debugfs_create_u64("time_spinning", 0444, d_spin_debug, + &spinlock_stats.time_spinning); + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + debugfs_create_u64("time_total", 0444, d_spin_debug, + &spinlock_stats.time_total); + + xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(xen_spinlock_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 251669a932d4..2a234db5949b 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled) xen_cpu_initialized_map = cpu_online_map; #endif xen_vcpu_restore(); - xen_timer_resume(); } } +void xen_arch_resume(void) +{ + /* nothing */ +} diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 685b77470fc3..c9f7cda48ed7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -30,8 +30,6 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -static cycle_t xen_clocksource_read(void); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -200,20 +198,13 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { - u64 xen_khz = 1000000ULL << 32; - const struct pvclock_vcpu_time_info *info = + struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(xen_khz, info->tsc_to_system_mul); - if (info->tsc_shift < 0) - xen_khz <<= -info->tsc_shift; - else - xen_khz >>= info->tsc_shift; - - return xen_khz; + return pvclock_tsc_khz(info); } -static cycle_t xen_clocksource_read(void) +cycle_t xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; @@ -452,6 +443,14 @@ void xen_setup_timer(int cpu) setup_runstate_info(cpu); } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu); + unbind_from_irqhandler(evt->irq, NULL); +} + void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S index 2497a30f41de..42786f59d9c0 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm_32.S @@ -298,7 +298,7 @@ check_events: push %eax push %ecx push %edx - call force_evtchn_callback + call xen_force_evtchn_callback pop %edx pop %ecx pop %eax diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S new file mode 100644 
index 000000000000..05794c566e87 --- /dev/null +++ b/arch/x86/xen/xen-asm_64.S @@ -0,0 +1,285 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include <linux/linkage.h> + +#include <asm/asm-offsets.h> +#include <asm/processor-flags.h> +#include <asm/errno.h> +#include <asm/segment.h> + +#include <xen/interface/xen.h> + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +#if 1 +/* + x86-64 does not yet support direct access to percpu variables + via a segment override, so we just need to make sure this code + never gets used + */ +#define BUG ud2a +#define PER_CPU_VAR(var, off) 0xdeadbeef +#endif + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + BUG + + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f + +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + BUG + + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + BUG + + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + BUG + + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. 
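In C terms, the save/restore pair maps Xen's per-vcpu event mask (1 means events blocked) onto the X86_EFLAGS_IF convention, inverting the sense exactly as the comment above describes; a restore that unmasks must also check for events that became pending while blocked. An illustrative C model (the real versions must stay as patchable asm that touches only the permitted registers):

    /* model of xen_save_fl_direct / xen_restore_fl_direct (sketch) */
    static unsigned long save_fl_model(struct vcpu_info *v)
    {
            /* mask set -> "interrupts off" -> IF clear */
            return v->evtchn_upcall_mask ? 0 : X86_EFLAGS_IF;
    }

    static void restore_fl_model(struct vcpu_info *v, unsigned long flags)
    {
            v->evtchn_upcall_mask = (flags & X86_EFLAGS_IF) ? 0 : 1;
            barrier();
            if (!v->evtchn_upcall_mask && v->evtchn_upcall_pending)
                    xen_force_evtchn_callback();   /* what check_events does */
    }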
*/ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call xen_force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax + ret + +ENTRY(xen_adjust_exception_frame) + mov 8+0(%rsp),%rcx + mov 8+8(%rsp),%r11 + ret $16 + +hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 +/* + Xen64 iret frame: + + ss + rsp + rflags + cs + rip <-- standard iret frame + + flags + + rcx } + r11 }<-- pushed by hypercall page +rsp -> rax } + */ +ENTRY(xen_iret) + pushq $0 +1: jmp hypercall_iret +ENDPATCH(xen_iret) +RELOC(xen_iret, 1b+1) + +/* + sysexit is not used for 64-bit processes, so it's + only ever used to return to 32-bit compat userspace. + */ +ENTRY(xen_sysexit) + pushq $__USER32_DS + pushq %rcx + pushq $X86_EFLAGS_IF + pushq $__USER32_CS + pushq %rdx + + pushq $0 +1: jmp hypercall_iret +ENDPATCH(xen_sysexit) +RELOC(xen_sysexit, 1b+1) + +ENTRY(xen_sysret64) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack,%rsp + + pushq $__USER_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret64) +RELOC(xen_sysret64, 1b+1) + +ENTRY(xen_sysret32) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack, %rsp + + pushq $__USER32_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER32_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret32) +RELOC(xen_sysret32, 1b+1) + +/* + Xen handles syscall callbacks much like ordinary exceptions, + which means we have: + - kernel gs + - kernel rsp + - an iret-like stack frame on the stack (including rcx and r11): + ss + rsp + rflags + cs + rip + r11 + rsp-> rcx + + In all the entrypoints, we undo all that to make it look + like a CPU-generated syscall/sysenter and jump to the normal + entrypoint. 
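The frame diagram above, transcribed as a struct for readability (lowest address first; illustrative only, hypercall_iret consumes this directly off the stack):

    struct xen64_iret_frame {
            uint64_t rax, r11, rcx;   /* pushed by the hypercall page */
            uint64_t flags;           /* VGCF_in_syscall, or 0 from xen_iret */
            uint64_t rip;             /* standard iret frame from here up */
            uint64_t cs;
            uint64_t rflags;
            uint64_t rsp;
            uint64_t ss;
    };

This also explains the sysret paths above: syscall stashed the user rip in %rcx and rflags in %r11, so xen_sysret64/xen_sysret32 push %rcx and %r11 straight into the rip and rflags slots before jumping to hypercall_iret.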
+ */ + +.macro undo_xen_syscall + mov 0*8(%rsp),%rcx + mov 1*8(%rsp),%r11 + mov 5*8(%rsp),%rsp +.endm + +/* Normal 64-bit system call target */ +ENTRY(xen_syscall_target) + undo_xen_syscall + jmp system_call_after_swapgs +ENDPROC(xen_syscall_target) + +#ifdef CONFIG_IA32_EMULATION + +/* 32-bit compat syscall target */ +ENTRY(xen_syscall32_target) + undo_xen_syscall + jmp ia32_cstar_target +ENDPROC(xen_syscall32_target) + +/* 32-bit compat sysenter target */ +ENTRY(xen_sysenter_target) + undo_xen_syscall + jmp ia32_sysenter_target +ENDPROC(xen_sysenter_target) + +#else /* !CONFIG_IA32_EMULATION */ + +ENTRY(xen_syscall32_target) +ENTRY(xen_sysenter_target) + lea 16(%rsp), %rsp /* strip %rcx,%r11 */ + mov $-ENOSYS, %rax + pushq $VGCF_in_syscall + jmp hypercall_iret +ENDPROC(xen_syscall32_target) +ENDPROC(xen_sysenter_target) + +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7c0cf6320a0a..63d49a523ed3 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -5,15 +5,24 @@ #include <linux/elfnote.h> #include <linux/init.h> + #include <asm/boot.h> +#include <asm/asm.h> +#include <asm/page.h> + #include <xen/interface/elfnote.h> #include <asm/xen/interface.h> __INIT ENTRY(startup_xen) - movl %esi,xen_start_info cld - movl $(init_thread_union+THREAD_SIZE),%esp +#ifdef CONFIG_X86_32 + mov %esi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%esp +#else + mov %rsi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%rsp +#endif jmp xen_start_kernel __FINIT @@ -21,21 +30,26 @@ ENTRY(startup_xen) .pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) - .skip 0x1000 + .skip PAGE_SIZE_asm .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) +#ifdef CONFIG_X86_32 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) #endif /*CONFIG_XEN */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 6f4b1045c1c2..d7422dc2a55c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,7 @@ #define XEN_OPS_H #include <linux/init.h> +#include <linux/clocksource.h> #include <linux/irqreturn.h> #include <xen/xen-ops.h> @@ -26,18 +27,21 @@ char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void xen_enable_syscall(void); void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); +void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_teardown_timer(int cpu); +cycle_t 
xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); -void xen_timer_resume(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); @@ -45,20 +49,19 @@ bool xen_vcpu_stolen(int vcpu); void xen_mark_init_mm_pinned(void); -void __init xen_fill_possible_map(void); - void __init xen_setup_vcpu_info_placement(void); -void xen_smp_prepare_boot_cpu(void); -void xen_smp_prepare_cpus(unsigned int max_cpus); -int xen_cpu_up(unsigned int cpu); -void xen_smp_cpus_done(unsigned int max_cpus); -void xen_smp_send_stop(void); -void xen_smp_send_reschedule(int cpu); -void xen_smp_send_call_function_ipi(cpumask_t mask); -void xen_smp_send_call_function_single_ipi(int cpu); +#ifdef CONFIG_SMP +void xen_smp_init(void); + +void __init xen_init_spinlocks(void); +__cpuinit void xen_init_lock_cpu(int cpu); +void xen_uninit_lock_cpu(int cpu); extern cpumask_t xen_cpu_initialized_map; +#else +static inline void xen_smp_init(void) {} +#endif /* Declare an asm function, along with symbols needed to make it @@ -73,7 +76,11 @@ DECL_ASM(void, xen_irq_disable_direct, void); DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); +/* These are not functions, and cannot be called normally */ void xen_iret(void); void xen_sysexit(void); +void xen_sysret32(void); +void xen_sysret64(void); +void xen_adjust_exception_frame(void); #endif /* XEN_OPS_H */
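The header reorganisation follows the usual kernel stub pattern: the SMP-only declarations move under #ifdef CONFIG_SMP, with an empty inline xen_smp_init() on the #else branch so call sites need no conditionals. A hypothetical call-site fragment:

    /* fragment: the boot path (xen_start_kernel in this series) can call
       this unconditionally; on UP builds it compiles away entirely */
    static void __init bring_up_secondary_cpus(void)
    {
            xen_smp_init();
    }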