From d7e5a5462f68270ed66efff22b1981be57a28c19 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Wed, 2 May 2007 12:18:41 +0200 Subject: [RSLIB] Support non-canonical GF representations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the CAFÉ NAND controller, we need to support non-canonical representations of the Galois field. Allow the caller to provide its own function for generating the field, and CAFÉ can use rslib instead of its own implementation. Signed-off-by: Segher Boessenkool Signed-off-by: David Woodhouse --- include/linux/rslib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rslib.h b/include/linux/rslib.h index ace25acfdc97..746580c1939c 100644 --- a/include/linux/rslib.h +++ b/include/linux/rslib.h @@ -34,6 +34,7 @@ * @prim: Primitive element, index form * @iprim: prim-th root of 1, index form * @gfpoly: The primitive generator polynominal + * @gffunc: Function to generate the field, if non-canonical representation * @users: Users of this structure * @list: List entry for the rs control list */ @@ -48,6 +49,7 @@ struct rs_control { int prim; int iprim; int gfpoly; + int (*gffunc)(int); int users; struct list_head list; }; @@ -77,6 +79,8 @@ int decode_rs16(struct rs_control *rs, uint16_t *data, uint16_t *par, int len, /* Create or get a matching rs control structure */ struct rs_control *init_rs(int symsize, int gfpoly, int fcr, int prim, int nroots); +struct rs_control *init_rs_non_canonical(int symsize, int (*func)(int), + int fcr, int prim, int nroots); /* Release a rs control structure */ void free_rs(struct rs_control *rs); -- cgit v1.2.3 From 972edcb79ec8c8512ed5b29ca6718065328d6992 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Sun, 6 May 2007 18:46:57 +0400 Subject: [MTD] [NAND] platform NAND driver: update header This patch extends nand.h in order to enable platform NAND driver. Signed-off-by: Vitaly Wool Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse --- include/linux/mtd/nand.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index cf197ad62da6..d2365c8dcacc 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -560,6 +560,7 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, * @chip_delay: R/B delay value in us * @options: Option flags, e.g. 16bit buswidth * @ecclayout: ecc layout info structure + * @part_probe_types: NULL-terminated array of probe types * @priv: hardware controller specific settings */ struct platform_nand_chip { @@ -570,6 +571,7 @@ struct platform_nand_chip { struct nand_ecclayout *ecclayout; int chip_delay; unsigned int options; + const char **part_probe_types; void *priv; }; @@ -578,6 +580,8 @@ struct platform_nand_chip { * @hwcontrol: platform specific hardware control structure * @dev_ready: platform specific function to read ready/busy pin * @select_chip: platform specific chip select function + * @cmd_ctrl: platform specific function for controlling + * ALE/CLE/nCE. 
Also used to write command and address * @priv: private data to transport driver specific settings * * All fields are optional and depend on the hardware driver requirements @@ -586,9 +590,21 @@ struct platform_nand_ctrl { void (*hwcontrol)(struct mtd_info *mtd, int cmd); int (*dev_ready)(struct mtd_info *mtd); void (*select_chip)(struct mtd_info *mtd, int chip); + void (*cmd_ctrl)(struct mtd_info *mtd, int dat, + unsigned int ctrl); void *priv; }; +/** + * struct platform_nand_data - container structure for platform-specific data + * @chip: chip level chip structure + * @ctrl: controller level device structure + */ +struct platform_nand_data { + struct platform_nand_chip chip; + struct platform_nand_ctrl ctrl; +}; + /* Some helpers to access the data structures */ static inline struct platform_nand_chip *get_platform_nandchip(struct mtd_info *mtd) -- cgit v1.2.3 From 225c7b1feef1b41170f7037a5b10a65cd8a42c54 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 8 May 2007 18:00:38 -0700 Subject: IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters Add an InfiniBand driver for Mellanox ConnectX adapters. Because these adapters can also be used as ethernet NICs and Fibre Channel HBAs, the driver is split into two modules: mlx4_core: Handles low-level things like device initialization and processing firmware commands. Also controls resource allocation so that the InfiniBand, ethernet and FC functions can share a device without stepping on each other. mlx4_ib: Handles InfiniBand-specific things; plugs into the InfiniBand midlayer. Signed-off-by: Roland Dreier --- include/linux/mlx4/cmd.h | 178 +++++++++++++++++++++++ include/linux/mlx4/cq.h | 123 ++++++++++++++++ include/linux/mlx4/device.h | 331 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mlx4/doorbell.h | 97 +++++++++++++ include/linux/mlx4/driver.h | 59 ++++++++ include/linux/mlx4/qp.h | 288 ++++++++++++++++++++++++++++++++++++ include/linux/mlx4/srq.h | 42 ++++++ 7 files changed, 1118 insertions(+) create mode 100644 include/linux/mlx4/cmd.h create mode 100644 include/linux/mlx4/cq.h create mode 100644 include/linux/mlx4/device.h create mode 100644 include/linux/mlx4/doorbell.h create mode 100644 include/linux/mlx4/driver.h create mode 100644 include/linux/mlx4/qp.h create mode 100644 include/linux/mlx4/srq.h (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h new file mode 100644 index 000000000000..4fb552d12f7a --- /dev/null +++ b/include/linux/mlx4/cmd.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_CMD_H +#define MLX4_CMD_H + +#include + +enum { + /* initialization and general commands */ + MLX4_CMD_SYS_EN = 0x1, + MLX4_CMD_SYS_DIS = 0x2, + MLX4_CMD_MAP_FA = 0xfff, + MLX4_CMD_UNMAP_FA = 0xffe, + MLX4_CMD_RUN_FW = 0xff6, + MLX4_CMD_MOD_STAT_CFG = 0x34, + MLX4_CMD_QUERY_DEV_CAP = 0x3, + MLX4_CMD_QUERY_FW = 0x4, + MLX4_CMD_ENABLE_LAM = 0xff8, + MLX4_CMD_DISABLE_LAM = 0xff7, + MLX4_CMD_QUERY_DDR = 0x5, + MLX4_CMD_QUERY_ADAPTER = 0x6, + MLX4_CMD_INIT_HCA = 0x7, + MLX4_CMD_CLOSE_HCA = 0x8, + MLX4_CMD_INIT_PORT = 0x9, + MLX4_CMD_CLOSE_PORT = 0xa, + MLX4_CMD_QUERY_HCA = 0xb, + MLX4_CMD_SET_PORT = 0xc, + MLX4_CMD_ACCESS_DDR = 0x2e, + MLX4_CMD_MAP_ICM = 0xffa, + MLX4_CMD_UNMAP_ICM = 0xff9, + MLX4_CMD_MAP_ICM_AUX = 0xffc, + MLX4_CMD_UNMAP_ICM_AUX = 0xffb, + MLX4_CMD_SET_ICM_SIZE = 0xffd, + + /* TPT commands */ + MLX4_CMD_SW2HW_MPT = 0xd, + MLX4_CMD_QUERY_MPT = 0xe, + MLX4_CMD_HW2SW_MPT = 0xf, + MLX4_CMD_READ_MTT = 0x10, + MLX4_CMD_WRITE_MTT = 0x11, + MLX4_CMD_SYNC_TPT = 0x2f, + + /* EQ commands */ + MLX4_CMD_MAP_EQ = 0x12, + MLX4_CMD_SW2HW_EQ = 0x13, + MLX4_CMD_HW2SW_EQ = 0x14, + MLX4_CMD_QUERY_EQ = 0x15, + + /* CQ commands */ + MLX4_CMD_SW2HW_CQ = 0x16, + MLX4_CMD_HW2SW_CQ = 0x17, + MLX4_CMD_QUERY_CQ = 0x18, + MLX4_CMD_RESIZE_CQ = 0x2c, + + /* SRQ commands */ + MLX4_CMD_SW2HW_SRQ = 0x35, + MLX4_CMD_HW2SW_SRQ = 0x36, + MLX4_CMD_QUERY_SRQ = 0x37, + MLX4_CMD_ARM_SRQ = 0x40, + + /* QP/EE commands */ + MLX4_CMD_RST2INIT_QP = 0x19, + MLX4_CMD_INIT2RTR_QP = 0x1a, + MLX4_CMD_RTR2RTS_QP = 0x1b, + MLX4_CMD_RTS2RTS_QP = 0x1c, + MLX4_CMD_SQERR2RTS_QP = 0x1d, + MLX4_CMD_2ERR_QP = 0x1e, + MLX4_CMD_RTS2SQD_QP = 0x1f, + MLX4_CMD_SQD2SQD_QP = 0x38, + MLX4_CMD_SQD2RTS_QP = 0x20, + MLX4_CMD_2RST_QP = 0x21, + MLX4_CMD_QUERY_QP = 0x22, + MLX4_CMD_INIT2INIT_QP = 0x2d, + MLX4_CMD_SUSPEND_QP = 0x32, + MLX4_CMD_UNSUSPEND_QP = 0x33, + /* special QP and management commands */ + MLX4_CMD_CONF_SPECIAL_QP = 0x23, + MLX4_CMD_MAD_IFC = 0x24, + + /* multicast commands */ + MLX4_CMD_READ_MCG = 0x25, + MLX4_CMD_WRITE_MCG = 0x26, + MLX4_CMD_MGID_HASH = 0x27, + + /* miscellaneous commands */ + MLX4_CMD_DIAG_RPRT = 0x30, + MLX4_CMD_NOP = 0x31, + + /* debug commands */ + MLX4_CMD_QUERY_DEBUG_MSG = 0x2a, + MLX4_CMD_SET_DEBUG_MSG = 0x2b, +}; + +enum { + MLX4_CMD_TIME_CLASS_A = 10000, + MLX4_CMD_TIME_CLASS_B = 10000, + MLX4_CMD_TIME_CLASS_C = 10000, +}; + +enum { + MLX4_MAILBOX_SIZE = 4096 +}; + +struct mlx4_dev; + +struct mlx4_cmd_mailbox { + void *buf; + dma_addr_t dma; +}; + +int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout); + +/* Invoke a command with no output parameter */ +static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier, + u8 op_modifier, u16 op, unsigned long timeout) +{ + return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier, + op_modifier, op, timeout); +} + +/* Invoke a command with an output mailbox */ +static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param, + 
u32 in_modifier, u8 op_modifier, u16 op, + unsigned long timeout) +{ + return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier, + op_modifier, op, timeout); +} + +/* + * Invoke a command with an immediate output parameter (and copy the + * output into the caller's out_param pointer after the command + * executes). + */ +static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + u32 in_modifier, u8 op_modifier, u16 op, + unsigned long timeout) +{ + return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier, + op_modifier, op, timeout); +} + +struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev); +void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox); + +#endif /* MLX4_CMD_H */ diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h new file mode 100644 index 000000000000..0181e0a57cbf --- /dev/null +++ b/include/linux/mlx4/cq.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_CQ_H +#define MLX4_CQ_H + +#include + +#include +#include + +struct mlx4_cqe { + __be32 my_qpn; + __be32 immed_rss_invalid; + __be32 g_mlpath_rqpn; + u8 sl; + u8 reserved1; + __be16 rlid; + u32 reserved2; + __be32 byte_cnt; + __be16 wqe_index; + __be16 checksum; + u8 reserved3[3]; + u8 owner_sr_opcode; +}; + +struct mlx4_err_cqe { + __be32 my_qpn; + u32 reserved1[5]; + __be16 wqe_index; + u8 vendor_err_syndrome; + u8 syndrome; + u8 reserved2[3]; + u8 owner_sr_opcode; +}; + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd, + void __iomem *uar_page, + spinlock_t *doorbell_lock) +{ + __be32 doorbell[2]; + u32 sn; + u32 ci; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + + *cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + wmb(); + + doorbell[0] = cpu_to_be32(sn << 28 | cmd | cq->cqn); + doorbell[1] = cpu_to_be32(ci); + + mlx4_write64(doorbell, uar_page + MLX4_CQ_DOORBELL, doorbell_lock); +} + +static inline void mlx4_cq_set_ci(struct mlx4_cq *cq) +{ + *cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff); +} + +enum { + MLX4_CQ_DB_REQ_NOT_SOL = 1 << 24, + MLX4_CQ_DB_REQ_NOT = 2 << 24 +}; + +#endif /* MLX4_CQ_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h new file mode 100644 index 000000000000..8c5f8fd86841 --- /dev/null +++ b/include/linux/mlx4/device.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DEVICE_H +#define MLX4_DEVICE_H + +#include +#include +#include + +#include + +enum { + MLX4_FLAG_MSI_X = 1 << 0, +}; + +enum { + MLX4_MAX_PORTS = 2 +}; + +enum { + MLX4_DEV_CAP_FLAG_RC = 1 << 0, + MLX4_DEV_CAP_FLAG_UC = 1 << 1, + MLX4_DEV_CAP_FLAG_UD = 1 << 2, + MLX4_DEV_CAP_FLAG_SRQ = 1 << 6, + MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1 << 7, + MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 << 8, + MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 << 9, + MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1 << 16, + MLX4_DEV_CAP_FLAG_APM = 1 << 17, + MLX4_DEV_CAP_FLAG_ATOMIC = 1 << 18, + MLX4_DEV_CAP_FLAG_RAW_MCAST = 1 << 19, + MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1 << 20, + MLX4_DEV_CAP_FLAG_UD_MCAST = 1 << 21 +}; + +enum mlx4_event { + MLX4_EVENT_TYPE_COMP = 0x00, + MLX4_EVENT_TYPE_PATH_MIG = 0x01, + MLX4_EVENT_TYPE_COMM_EST = 0x02, + MLX4_EVENT_TYPE_SQ_DRAINED = 0x03, + MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13, + MLX4_EVENT_TYPE_SRQ_LIMIT = 0x14, + MLX4_EVENT_TYPE_CQ_ERROR = 0x04, + MLX4_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MLX4_EVENT_TYPE_EEC_CATAS_ERROR = 0x06, + MLX4_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MLX4_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MLX4_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08, + MLX4_EVENT_TYPE_PORT_CHANGE = 0x09, + MLX4_EVENT_TYPE_EQ_OVERFLOW = 0x0f, + MLX4_EVENT_TYPE_ECC_DETECT = 0x0e, + MLX4_EVENT_TYPE_CMD = 0x0a +}; + +enum { + MLX4_PORT_CHANGE_SUBTYPE_DOWN = 1, + MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 +}; + +enum { + MLX4_PERM_LOCAL_READ = 1 << 10, + MLX4_PERM_LOCAL_WRITE = 1 << 11, + MLX4_PERM_REMOTE_READ = 1 << 12, + MLX4_PERM_REMOTE_WRITE = 1 << 13, + MLX4_PERM_ATOMIC = 1 << 14 +}; + +enum { + MLX4_OPCODE_NOP = 0x00, + MLX4_OPCODE_SEND_INVAL = 0x01, + MLX4_OPCODE_RDMA_WRITE = 0x08, + MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX4_OPCODE_SEND = 0x0a, + MLX4_OPCODE_SEND_IMM = 0x0b, + MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_RDMA_READ = 0x10, + MLX4_OPCODE_ATOMIC_CS = 0x11, + MLX4_OPCODE_ATOMIC_FA = 0x12, + MLX4_OPCODE_ATOMIC_MASK_CS = 0x14, + MLX4_OPCODE_ATOMIC_MASK_FA = 0x15, + MLX4_OPCODE_BIND_MW = 0x18, + MLX4_OPCODE_FMR = 0x19, + MLX4_OPCODE_LOCAL_INVAL = 0x1b, + MLX4_OPCODE_CONFIG_CMD = 0x1f, + + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX4_RECV_OPCODE_SEND = 0x01, + MLX4_RECV_OPCODE_SEND_IMM = 0x02, + MLX4_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +enum { + MLX4_STAT_RATE_OFFSET = 5 +}; + +struct mlx4_caps { + u64 fw_ver; + int num_ports; + int vl_cap; + int mtu_cap; + int gid_table_len; + int pkey_table_len; + int local_ca_ack_delay; + int num_uars; + int bf_reg_size; + int bf_regs_per_page; + int max_sq_sg; + int max_rq_sg; + int num_qps; + int max_wqes; + int max_sq_desc_sz; + int max_rq_desc_sz; + int max_qp_init_rdma; + int max_qp_dest_rdma; + int reserved_qps; + int sqp_start; + int num_srqs; + int max_srq_wqes; + int max_srq_sge; + int reserved_srqs; + int num_cqs; + int max_cqes; + int reserved_cqs; + int num_eqs; + int reserved_eqs; + int num_mpts; + int num_mtt_segs; + int fmr_reserved_mtts; + int reserved_mtts; + int reserved_mrws; + int reserved_uars; + int num_mgms; + int num_amgms; + int reserved_mcgs; + int num_qp_per_mgm; + int num_pds; + int reserved_pds; + int mtt_entry_sz; + 
u32 page_size_cap; + u32 flags; + u16 stat_rate_support; + u8 port_width_cap; +}; + +struct mlx4_buf_list { + void *buf; + dma_addr_t map; +}; + +struct mlx4_buf { + union { + struct mlx4_buf_list direct; + struct mlx4_buf_list *page_list; + } u; + int nbufs; + int npages; + int page_shift; +}; + +struct mlx4_mtt { + u32 first_seg; + int order; + int page_shift; +}; + +struct mlx4_mr { + struct mlx4_mtt mtt; + u64 iova; + u64 size; + u32 key; + u32 pd; + u32 access; + int enabled; +}; + +struct mlx4_uar { + unsigned long pfn; + int index; +}; + +struct mlx4_cq { + void (*comp) (struct mlx4_cq *); + void (*event) (struct mlx4_cq *, enum mlx4_event); + + struct mlx4_uar *uar; + + u32 cons_index; + + __be32 *set_ci_db; + __be32 *arm_db; + int arm_sn; + + int cqn; + + atomic_t refcount; + struct completion free; +}; + +struct mlx4_qp { + void (*event) (struct mlx4_qp *, enum mlx4_event); + + int qpn; + + atomic_t refcount; + struct completion free; +}; + +struct mlx4_srq { + void (*event) (struct mlx4_srq *, enum mlx4_event); + + int srqn; + int max; + int max_gs; + int wqe_shift; + + atomic_t refcount; + struct completion free; +}; + +struct mlx4_av { + __be32 port_pd; + u8 reserved1; + u8 g_slid; + __be16 dlid; + u8 reserved2; + u8 gid_index; + u8 stat_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 dgid[16]; +}; + +struct mlx4_dev { + struct pci_dev *pdev; + unsigned long flags; + struct mlx4_caps caps; + struct radix_tree_root qp_table_tree; +}; + +struct mlx4_init_port_param { + int set_guid0; + int set_node_guid; + int set_si_guid; + u16 mtu; + int port_width_cap; + u16 vl_cap; + u16 max_gid; + u16 max_pkey; + u64 guid0; + u64 node_guid; + u64 si_guid; +}; + +int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, + struct mlx4_buf *buf); +void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); + +int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn); +void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); + +int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar); +void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar); + +int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, + struct mlx4_mtt *mtt); +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt); +u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt); + +int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, + int npages, int page_shift, struct mlx4_mr *mr); +void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list); +int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_buf *buf); + +int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, + struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq); +void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); + +int mlx4_qp_alloc(struct mlx4_dev *dev, int sqpn, struct mlx4_qp *qp); +void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); + +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, + u64 db_rec, struct mlx4_srq *srq); +void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq); +int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark); + +int mlx4_INIT_PORT(struct mlx4_dev *dev, struct mlx4_init_port_param *param, int port); +int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); + +int 
mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); + +#endif /* MLX4_DEVICE_H */ diff --git a/include/linux/mlx4/doorbell.h b/include/linux/mlx4/doorbell.h new file mode 100644 index 000000000000..3f2da442d7cb --- /dev/null +++ b/include/linux/mlx4/doorbell.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DOORBELL_H +#define MLX4_DOORBELL_H + +#include +#include + +#define MLX4_SEND_DOORBELL 0x14 +#define MLX4_CQ_DOORBELL 0x20 + +#if BITS_PER_LONG == 64 +/* + * Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MLX4_DECLARE_DOORBELL_LOCK(name) +#define MLX4_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MLX4_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mlx4_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writeq((__force u64) val, dest); +} + +static inline void mlx4_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq(*(u64 *) val, dest); +} + +#else + +/* + * Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. 
+ */ + +#define MLX4_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MLX4_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MLX4_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mlx4_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writel(((__force u32 *) &val)[0], dest); + __raw_writel(((__force u32 *) &val)[1], dest + 4); +} + +static inline void mlx4_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel((__force u32) val[0], dest); + __raw_writel((__force u32) val[1], dest + 4); + spin_unlock_irqrestore(doorbell_lock, flags); +} + +#endif + +#endif /* MLX4_DOORBELL_H */ diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h new file mode 100644 index 000000000000..1b835ca49df1 --- /dev/null +++ b/include/linux/mlx4/driver.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DRIVER_H +#define MLX4_DRIVER_H + +#include + +struct mlx4_dev; + +enum mlx4_dev_event { + MLX4_DEV_EVENT_CATASTROPHIC_ERROR, + MLX4_DEV_EVENT_PORT_UP, + MLX4_DEV_EVENT_PORT_DOWN, + MLX4_DEV_EVENT_PORT_REINIT, +}; + +struct mlx4_interface { + void * (*add) (struct mlx4_dev *dev); + void (*remove)(struct mlx4_dev *dev, void *context); + void (*event) (struct mlx4_dev *dev, void *context, + enum mlx4_dev_event event, int subtype, + int port); + struct list_head list; +}; + +int mlx4_register_interface(struct mlx4_interface *intf); +void mlx4_unregister_interface(struct mlx4_interface *intf); + +#endif /* MLX4_DRIVER_H */ diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h new file mode 100644 index 000000000000..9eeb61adf6a3 --- /dev/null +++ b/include/linux/mlx4/qp.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_QP_H +#define MLX4_QP_H + +#include + +#include + +#define MLX4_INVALID_LKEY 0x100 + +enum mlx4_qp_optpar { + MLX4_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MLX4_QP_OPTPAR_RRE = 1 << 1, + MLX4_QP_OPTPAR_RAE = 1 << 2, + MLX4_QP_OPTPAR_RWE = 1 << 3, + MLX4_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MLX4_QP_OPTPAR_Q_KEY = 1 << 5, + MLX4_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MLX4_QP_OPTPAR_SRA_MAX = 1 << 8, + MLX4_QP_OPTPAR_RRA_MAX = 1 << 9, + MLX4_QP_OPTPAR_PM_STATE = 1 << 10, + MLX4_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MLX4_QP_OPTPAR_RNR_RETRY = 1 << 13, + MLX4_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MLX4_QP_OPTPAR_SCHED_QUEUE = 1 << 16 +}; + +enum mlx4_qp_state { + MLX4_QP_STATE_RST = 0, + MLX4_QP_STATE_INIT = 1, + MLX4_QP_STATE_RTR = 2, + MLX4_QP_STATE_RTS = 3, + MLX4_QP_STATE_SQER = 4, + MLX4_QP_STATE_SQD = 5, + MLX4_QP_STATE_ERR = 6, + MLX4_QP_STATE_SQ_DRAINING = 7, + MLX4_QP_NUM_STATE +}; + +enum { + MLX4_QP_ST_RC = 0x0, + MLX4_QP_ST_UC = 0x1, + MLX4_QP_ST_RD = 0x2, + MLX4_QP_ST_UD = 0x3, + MLX4_QP_ST_MLX = 0x7 +}; + +enum { + MLX4_QP_PM_MIGRATED = 0x3, + MLX4_QP_PM_ARMED = 0x0, + MLX4_QP_PM_REARM = 0x1 +}; + +enum { + /* params1 */ + MLX4_QP_BIT_SRE = 1 << 15, + MLX4_QP_BIT_SWE = 1 << 14, + MLX4_QP_BIT_SAE = 1 << 13, + /* params2 */ + MLX4_QP_BIT_RRE = 1 << 15, + MLX4_QP_BIT_RWE = 1 << 14, + MLX4_QP_BIT_RAE = 1 << 13, + MLX4_QP_BIT_RIC = 1 << 4, +}; + +struct mlx4_qp_path { + u8 fl; + u8 reserved1[2]; + u8 pkey_index; + u8 reserved2; + u8 grh_mylmc; + __be16 rlid; + u8 ackto; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 tclass_flowlabel; + u8 rgid[16]; + u8 sched_queue; + u8 snooper_flags; + u8 reserved3[2]; + u8 counter_index; + u8 reserved4[7]; +}; + +struct mlx4_qp_context { + __be32 flags; + __be32 pd; + u8 mtu_msgmax; + u8 rq_size_stride; + u8 sq_size_stride; + u8 rlkey; + __be32 usr_page; + __be32 local_qpn; + __be32 remote_qpn; + struct mlx4_qp_path pri_path; + struct mlx4_qp_path alt_path; + __be32 params1; + u32 reserved1; + __be32 next_send_psn; + __be32 cqn_send; + u32 reserved2[2]; + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 rnr_nextrecvpsn; + __be32 srcd; + __be32 cqn_recv; + __be64 db_rec_addr; + __be32 qkey; + __be32 srqn; + __be32 msn; + __be16 rq_wqe_counter; + __be16 
sq_wqe_counter; + u32 reserved3[2]; + __be32 param3; + __be32 nummmcpeers_basemkey; + u8 log_page_size; + u8 reserved4[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + u32 reserved5[10]; +}; + +enum { + MLX4_WQE_CTRL_FENCE = 1 << 6, + MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, + MLX4_WQE_CTRL_SOLICITED = 1 << 1, +}; + +struct mlx4_wqe_ctrl_seg { + __be32 owner_opcode; + u8 reserved2[3]; + u8 fence_size; + /* + * High 24 bits are SRC remote buffer; low 8 bits are flags: + * [7] SO (strong ordering) + * [5] TCP/UDP checksum + * [4] IP checksum + * [3:2] C (generate completion queue entry) + * [1] SE (solicited event) + */ + __be32 srcrb_flags; + /* + * imm is immediate data for send/RDMA write w/ immediate; + * also invalidation key for send with invalidate; input + * modifier for WQEs on CCQs. + */ + __be32 imm; +}; + +enum { + MLX4_WQE_MLX_VL15 = 1 << 17, + MLX4_WQE_MLX_SLR = 1 << 16 +}; + +struct mlx4_wqe_mlx_seg { + u8 owner; + u8 reserved1[2]; + u8 opcode; + u8 reserved2[3]; + u8 size; + /* + * [17] VL15 + * [16] SLR + * [15:12] static rate + * [11:8] SL + * [4] ICRC + * [3:2] C + * [0] FL (force loopback) + */ + __be32 flags; + __be16 rlid; + u16 reserved3; +}; + +struct mlx4_wqe_datagram_seg { + __be32 av[8]; + __be32 dqpn; + __be32 qkey; + __be32 reservd[2]; +}; + +struct mlx4_wqe_bind_seg { + __be32 flags1; + __be32 flags2; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +struct mlx4_wqe_fmr_seg { + __be32 flags; + __be32 mem_key; + __be64 buf_list; + __be64 start_addr; + __be64 reg_len; + __be32 offset; + __be32 page_size; + u32 reserved[2]; +}; + +struct mlx4_wqe_fmr_ext_seg { + u8 flags; + u8 reserved; + __be16 app_mask; + __be16 wire_app_tag; + __be16 mem_app_tag; + __be32 wire_ref_tag_base; + __be32 mem_ref_tag_base; +}; + +struct mlx4_wqe_local_inval_seg { + u8 flags; + u8 reserved1[3]; + __be32 mem_key; + u8 reserved2[3]; + u8 guest_id; + __be64 pa; +}; + +struct mlx4_wqe_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mlx4_wqe_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mlx4_wqe_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mlx4_wqe_inline_seg { + __be32 byte_count; +}; + +int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp); + +static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) +{ + return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1)); +} + +void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); + +#endif /* MLX4_QP_H */ diff --git a/include/linux/mlx4/srq.h b/include/linux/mlx4/srq.h new file mode 100644 index 000000000000..799a0697a383 --- /dev/null +++ b/include/linux/mlx4/srq.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_SRQ_H +#define MLX4_SRQ_H + +struct mlx4_wqe_srq_next_seg { + u16 reserved1; + __be16 next_wqe_index; + u32 reserved2[3]; +}; + +#endif /* MLX4_SRQ_H */ -- cgit v1.2.3 From beb7dd86a101263bf63a78c7c6d4da3849b35bd6 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 9 May 2007 07:14:03 +0200 Subject: Fix misspellings collected by members of KJ list. Fix the misspellings of "propogate", "writting" and (oh, the shame :-) "kenrel" in the source tree. Signed-off-by: Robert P. J. Day Signed-off-by: Adrian Bunk --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index dab69afee2fa..6d3047d8c91c 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -33,7 +33,7 @@ struct mnt_namespace; #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ -#define MNT_PNODE_MASK 0x3000 /* propogation flag mask */ +#define MNT_PNODE_MASK 0x3000 /* propagation flag mask */ struct vfsmount { struct list_head mnt_hash; -- cgit v1.2.3 From 59c51591a0ac7568824f541f57de967e88adaa07 Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Wed, 9 May 2007 08:57:56 +0200 Subject: Fix occurrences of "the the " Signed-off-by: Michael Opdenacker Signed-off-by: Adrian Bunk --- include/linux/ext3_fs_i.h | 2 +- include/linux/ext4_fs_i.h | 2 +- include/linux/radix-tree.h | 4 ++-- include/linux/security.h | 2 +- include/linux/usb.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 4395e5206746..7894dd0f3b77 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -54,7 +54,7 @@ struct ext3_block_alloc_info { /* * Was i_next_alloc_goal in ext3_inode_info * is the *physical* companion to i_next_alloc_block. - * it the the physical block number of the block which was most-recentl + * it the physical block number of the block which was most-recentl * allocated to this file. This give us the goal (target) for the next * allocation when we detect linearly ascending requests. 
*/ diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h index bb42379cb7fd..d5b177e5b395 100644 --- a/include/linux/ext4_fs_i.h +++ b/include/linux/ext4_fs_i.h @@ -52,7 +52,7 @@ struct ext4_block_alloc_info { /* * Was i_next_alloc_goal in ext4_inode_info * is the *physical* companion to i_next_alloc_block. - * it the the physical block number of the block which was most-recentl + * it the physical block number of the block which was most-recentl * allocated to this file. This give us the goal (target) for the next * allocation when we detect linearly ascending requests. */ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 0deb842541ac..f9e77d2ee320 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -87,10 +87,10 @@ do { \ * management of their lifetimes must be completely managed by API users. * * For API usage, in general, - * - any function _modifying_ the the tree or tags (inserting or deleting + * - any function _modifying_ the tree or tags (inserting or deleting * items, setting or clearing tags must exclude other modifications, and * exclude any functions reading the tree. - * - any function _reading_ the the tree or tags (looking up items or tags, + * - any function _reading_ the tree or tags (looking up items or tags, * gang lookups) must exclude modifications to the tree, but may occur * concurrently with other readers. * diff --git a/include/linux/security.h b/include/linux/security.h index 47e82c120f9a..9eb9e0fe0331 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -322,7 +322,7 @@ struct request_sock; * @dir contains the inode structure of parent of the new file. * @dentry contains the dentry structure of the new file. * @mode contains the mode of the new file. - * @dev contains the the device number. + * @dev contains the device number. * Return 0 if permission is granted. * @inode_rename: * Check for permission to rename a file or directory. diff --git a/include/linux/usb.h b/include/linux/usb.h index cfbd2bb8fa2c..94bd38a6d947 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -126,7 +126,7 @@ enum usb_interface_condition { * Each interface may have alternate settings. The initial configuration * of a device sets altsetting 0, but the device driver can change * that setting using usb_set_interface(). Alternate settings are often - * used to control the the use of periodic endpoints, such as by having + * used to control the use of periodic endpoints, such as by having * different endpoints use different amounts of reserved USB bandwidth. * All standards-conformant USB devices that use isochronous endpoints * will use them in non-default settings. -- cgit v1.2.3 From 5886269962f94fa9185c32db3ec936c612503235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 9 May 2007 07:51:49 +0200 Subject: fix file specification in comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many files include the filename at the beginning, serveral used a wrong one. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Adrian Bunk --- include/linux/generic_acl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 80764f40be75..886f5faa08cb 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h @@ -1,5 +1,5 @@ /* - * fs/generic_acl.c + * include/linux/generic_acl.h * * (C) 2005 Andreas Gruenbacher * -- cgit v1.2.3 From 121e70b69aef898a3c02fa90d0a2108381bcf975 Mon Sep 17 00:00:00 2001 From: John Anthony Kazos Jr Date: Wed, 9 May 2007 08:30:57 +0200 Subject: include files: convert "include" subdirectory to UTF-8 Convert the "include" subdirectory to UTF-8. Signed-off-by: John Anthony Kazos Jr. Signed-off-by: Adrian Bunk --- include/linux/i2c-algo-bit.h | 2 +- include/linux/i2c-algo-pcf.h | 2 +- include/linux/irda.h | 2 +- include/linux/meye.h | 2 +- include/linux/sonypi.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c-algo-bit.h b/include/linux/i2c-algo-bit.h index 9ee0f800592f..111334f5b922 100644 --- a/include/linux/i2c-algo-bit.h +++ b/include/linux/i2c-algo-bit.h @@ -18,7 +18,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ------------------------------------------------------------------------- */ -/* With some changes from Kyösti Mälkki and even +/* With some changes from Kyösti Mälkki and even Frodo Looijaard */ #ifndef _LINUX_I2C_ALGO_BIT_H diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h index 994eb86f882c..77afbb60fd11 100644 --- a/include/linux/i2c-algo-pcf.h +++ b/include/linux/i2c-algo-pcf.h @@ -19,7 +19,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ------------------------------------------------------------------------- */ -/* With some changes from Kyösti Mälkki and even +/* With some changes from Kyösti Mälkki and even Frodo Looijaard */ #ifndef _LINUX_I2C_ALGO_PCF_H diff --git a/include/linux/irda.h b/include/linux/irda.h index 09d8f105a5a8..945ba3110874 100644 --- a/include/linux/irda.h +++ b/include/linux/irda.h @@ -16,7 +16,7 @@ * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * - * Neither Dag Brattli nor University of Tromsø admit liability nor + * Neither Dag Brattli nor University of Tromsø admit liability nor * provide warranty for any of this software. This material is * provided "AS-IS" and at no charge. * diff --git a/include/linux/meye.h b/include/linux/meye.h index 11ec45e9a132..39fd9c8ddd4b 100644 --- a/include/linux/meye.h +++ b/include/linux/meye.h @@ -3,7 +3,7 @@ * * Copyright (C) 2001-2003 Stelian Pop * - * Copyright (C) 2001-2002 Alcôve + * Copyright (C) 2001-2002 Alcôve * * Copyright (C) 2000 Andrew Tridgell * diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index f56d24734950..34d4b075f7b8 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -5,7 +5,7 @@ * * Copyright (C) 2005 Narayanan R S - * Copyright (C) 2001-2002 Alcôve + * Copyright (C) 2001-2002 Alcôve * * Copyright (C) 2001 Michael Ashley * -- cgit v1.2.3 From 36200b76008d52d16b170d4f7dae9cfe00f5eb2b Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Thu, 3 May 2007 15:58:49 -0400 Subject: [MTD] Remove unnecessary user space check from mtd.h. Since the header file include/linux/mtd/mtd.h is not exported to user space, remove the user space check and error. Signed-off-by: Robert P. J. 
Day Signed-off-by: David Woodhouse --- include/linux/mtd/mtd.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 45d482ce8397..12a9a18f6e16 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -9,10 +9,6 @@ #ifndef __MTD_MTD_H__ #define __MTD_MTD_H__ -#ifndef __KERNEL__ -#error This is a kernel header. Perhaps include mtd-user.h instead? -#endif - #include #include #include -- cgit v1.2.3 From 42f209d3c94516affeb5e578fae62925f531a2d9 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Fri, 4 May 2007 15:49:38 -0400 Subject: [MTD] Delete allegedly obsolete "bank_size" field of mtd_info. Delete the allegedly obsolete "bank_size" member of struct mtd_info. Signed-off-by: Robert P. J. Day Signed-off-by: David Woodhouse --- include/linux/mtd/mtd.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 12a9a18f6e16..fd64ccfbce02 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -133,9 +133,6 @@ struct mtd_info { int numeraseregions; struct mtd_erase_region_info *eraseregions; - /* This really shouldn't be here. It can go away in 2.5 */ - u_int32_t bank_size; - int (*erase) (struct mtd_info *mtd, struct erase_info *instr); /* This stuff for eXecute-In-Place */ -- cgit v1.2.3 From 97416ce82e20a9511ec369822098a8d20998398a Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 9 May 2007 02:32:35 -0700 Subject: Declare {compat_}sys_utimensat This is needed before Powerpc can wire up the syscall. Signed-off-by: Stephen Rothwell Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 3 +++ include/linux/syscalls.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index ccd863dd77fa..70a157a130bb 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -253,5 +253,8 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); +asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, + struct compat_timespec __user *t, int flags); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1912c6cbef55..3139f4412297 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -576,6 +576,8 @@ asmlinkage long sys_fstatat64(int dfd, char __user *filename, struct stat64 __user *statbuf, int flag); asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, int bufsiz); +asmlinkage long sys_utimensat(int dfd, char __user *filename, + struct timespec __user *utimes, int flags); asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t); asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, -- cgit v1.2.3 From a3d25c275d383975504dc53c25b691df59bd3c48 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 May 2007 02:33:18 -0700 Subject: PM: Separate hibernation code from suspend code [ With Johannes Berg ] Separate the hibernation (aka suspend to disk code) from the other suspend code. 
In particular: * Remove the definitions related to hibernation from include/linux/pm.h * Introduce struct hibernation_ops and a new hibernate() function to hibernate the system, defined in include/linux/suspend.h * Separate suspend code in kernel/power/main.c from hibernation-related code in kernel/power/disk.c and kernel/power/user.c (with the help of hibernation_ops) * Switch ACPI (the only user of pm_ops.pm_disk_mode) to hibernation_ops Signed-off-by: Rafael J. Wysocki Cc: Greg KH Cc: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pm.h | 31 +------------------------------ include/linux/suspend.h | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 6e8fa3049e5d..87545e0f0b58 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -107,26 +107,11 @@ typedef int __bitwise suspend_state_t; #define PM_SUSPEND_ON ((__force suspend_state_t) 0) #define PM_SUSPEND_STANDBY ((__force suspend_state_t) 1) #define PM_SUSPEND_MEM ((__force suspend_state_t) 3) -#define PM_SUSPEND_DISK ((__force suspend_state_t) 4) -#define PM_SUSPEND_MAX ((__force suspend_state_t) 5) - -typedef int __bitwise suspend_disk_method_t; - -/* invalid must be 0 so struct pm_ops initialisers can leave it out */ -#define PM_DISK_INVALID ((__force suspend_disk_method_t) 0) -#define PM_DISK_PLATFORM ((__force suspend_disk_method_t) 1) -#define PM_DISK_SHUTDOWN ((__force suspend_disk_method_t) 2) -#define PM_DISK_REBOOT ((__force suspend_disk_method_t) 3) -#define PM_DISK_TEST ((__force suspend_disk_method_t) 4) -#define PM_DISK_TESTPROC ((__force suspend_disk_method_t) 5) -#define PM_DISK_MAX ((__force suspend_disk_method_t) 6) +#define PM_SUSPEND_MAX ((__force suspend_state_t) 4) /** * struct pm_ops - Callbacks for managing platform dependent suspend states. * @valid: Callback to determine whether the given state can be entered. - * If %CONFIG_SOFTWARE_SUSPEND is set then %PM_SUSPEND_DISK is - * always valid and never passed to this call. If not assigned, - * no suspend states are valid. * Valid states are advertised in /sys/power/state but can still * be rejected by prepare or enter if the conditions aren't right. * There is a %pm_valid_only_mem function available that can be assigned @@ -140,24 +125,12 @@ typedef int __bitwise suspend_disk_method_t; * * @finish: Called when the system has left the given state and all devices * are resumed. The return value is ignored. - * - * @pm_disk_mode: The generic code always allows one of the shutdown methods - * %PM_DISK_SHUTDOWN, %PM_DISK_REBOOT, %PM_DISK_TEST and - * %PM_DISK_TESTPROC. If this variable is set, the mode it is set - * to is allowed in addition to those modes and is also made default. - * When this mode is sent selected, the @prepare call will be called - * before suspending to disk (if present), the @enter call should be - * present and will be called after all state has been saved and the - * machine is ready to be powered off; the @finish callback is called - * after state has been restored. All these calls are called with - * %PM_SUSPEND_DISK as the state. 
*/ struct pm_ops { int (*valid)(suspend_state_t state); int (*prepare)(suspend_state_t state); int (*enter)(suspend_state_t state); int (*finish)(suspend_state_t state); - suspend_disk_method_t pm_disk_mode; }; /** @@ -276,8 +249,6 @@ extern void device_power_up(void); extern void device_resume(void); #ifdef CONFIG_PM -extern suspend_disk_method_t pm_disk_mode; - extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 9d2aa1a12aa0..d74da9122b60 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -32,6 +32,24 @@ static inline int pm_prepare_console(void) { return 0; } static inline void pm_restore_console(void) {} #endif +/** + * struct hibernation_ops - hibernation platform support + * + * The methods in this structure allow a platform to override the default + * mechanism of shutting down the machine during a hibernation transition. + * + * All three methods must be assigned. + * + * @prepare: prepare system for hibernation + * @enter: shut down system after state has been saved to disk + * @finish: finish/clean up after state has been reloaded + */ +struct hibernation_ops { + int (*prepare)(void); + int (*enter)(void); + void (*finish)(void); +}; + #if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) /* kernel/power/snapshot.c */ extern void __init register_nosave_region(unsigned long, unsigned long); @@ -39,11 +57,17 @@ extern int swsusp_page_is_forbidden(struct page *); extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); extern unsigned long get_safe_page(gfp_t gfp_mask); + +extern void hibernation_set_ops(struct hibernation_ops *ops); +extern int hibernate(void); #else static inline void register_nosave_region(unsigned long b, unsigned long e) {} static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} + +static inline void hibernation_set_ops(struct hibernation_ops *ops) {} +static inline int hibernate(void) { return -ENOSYS; } #endif /* defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) */ void save_processor_state(void); -- cgit v1.2.3 From dd2a345f8f002845636dbf5d2d768bb5cd8a5f59 Mon Sep 17 00:00:00 2001 From: Dave Gilbert Date: Wed, 9 May 2007 02:33:24 -0700 Subject: Display all possible partitions when the root filesystem failed to mount Display all possible partitions when the root filesystem is not mounted. This helps to track spell'o's and missing drivers. Updated to work with newer kernels. 
Example output: VFS: Cannot open root device "foobar" or unknown-block(0,0) Please append a correct "root=" boot option; here are the available partitions: 0800 8388608 sda driver: sd 0801 192748 sda1 0802 8193150 sda2 0810 4194304 sdb driver: sd Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(0,0) [akpm@linux-foundation.org: cleanups, fix printk warnings] Signed-off-by: Jan Engelhardt Cc: Dave Gilbert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 2c65da7cabb2..f589559cf070 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -413,6 +413,7 @@ char *disk_name (struct gendisk *hd, int part, char *buf); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern void add_partition(struct gendisk *, int, sector_t, sector_t, int); extern void delete_partition(struct gendisk *, int); +extern void printk_all_partitions(void); extern struct gendisk *alloc_disk_node(int minors, int node_id); extern struct gendisk *alloc_disk(int minors); -- cgit v1.2.3 From 2f4dfe206a2fc07099dfad77a8ea2f4b4ae2140f Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Wed, 9 May 2007 02:33:25 -0700 Subject: Remove hardcoding of hard_smp_processor_id on UP systems With the advent of kdump, the assumption that the boot CPU when booting an UP kernel is always the CPU with a particular hardware ID (often 0) (usually referred to as BSP on some architectures) is not valid anymore. The reason being that the dump capture kernel boots on the crashed CPU (the CPU that invoked crash_kexec), which may be or may not be that particular CPU. Move definition of hard_smp_processor_id for the UP case to architecture-specific code ("asm/smp.h") where it belongs, so that each architecture can provide its own implementation. Signed-off-by: Fernando Luis Vazquez Cao Cc: "Luck, Tony" Acked-by: Andi Kleen Cc: "Eric W. Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 7ba23ec8211b..3f70149eabbb 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -83,7 +83,6 @@ void smp_prepare_boot_cpu(void); * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 -#define hard_smp_processor_id() 0 static inline int up_smp_call_function(void) { return 0; -- cgit v1.2.3 From 8813d1c00ca923c1683da625ff85959be1db9a49 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 9 May 2007 02:33:30 -0700 Subject: mca: add integrated device bus matching The MCA bus has a few "integrated" functions, which are effectively virtual slots on the bus. The problem is that these special functions don't have dedicated pos IDs, so we have to manufacture ids for them outside the pos space ... and these ids can't be matched by the standard matching function, so add a special registration that requests a list of pos ids or a particular integrated function. 
Signed-off-by: James Bottomley Signed-off-by: Linus Torvalds --- include/linux/mca.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mca.h b/include/linux/mca.h index 5cff2923092b..37972704617f 100644 --- a/include/linux/mca.h +++ b/include/linux/mca.h @@ -94,6 +94,7 @@ struct mca_bus { struct mca_driver { const short *id_table; void *driver_data; + int integrated_id; struct device_driver driver; }; #define to_mca_driver(mdriver) container_of(mdriver, struct mca_driver, driver) @@ -125,6 +126,7 @@ extern enum MCA_AdapterStatus mca_device_status(struct mca_device *mca_dev); extern struct bus_type mca_bus_type; extern int mca_register_driver(struct mca_driver *drv); +extern int mca_register_driver_integrated(struct mca_driver *, int); extern void mca_unregister_driver(struct mca_driver *drv); /* WARNING: only called by the boot time device setup */ -- cgit v1.2.3 From 55c0d1f83e481dd6c77f52f7dcfeb043b8b740fa Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 May 2007 02:33:37 -0700 Subject: Move sig_kernel_* et al macros to linux/signal.h This patch moves the sig_kernel_* and related macros from kernel/signal.c to linux/signal.h, and cleans them up slightly. I need the sig_kernel_* macros for default signal behavior in the utrace code, and want to avoid duplication or overhead to share the knowledge. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 125 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 14749056dd63..3fa0fab4a04b 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -243,6 +243,131 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, extern struct kmem_cache *sighand_cachep; +/* + * In POSIX a signal is sent either to a specific thread (Linux task) + * or to the process as a whole (Linux thread group). How the signal + * is sent determines whether it's to one thread or the whole group, + * which determines which signal mask(s) are involved in blocking it + * from being delivered until later. When the signal is delivered, + * either it's caught or ignored by a user handler or it has a default + * effect that applies to the whole thread group (POSIX process). + * + * The possible effects an unblocked signal set to SIG_DFL can have are: + * ignore - Nothing Happens + * terminate - kill the process, i.e. all threads in the group, + * similar to exit_group. The group leader (only) reports + * WIFSIGNALED status to its parent. + * coredump - write a core dump file describing all threads using + * the same mm and then kill all those threads + * stop - stop all the threads in the group, i.e. TASK_STOPPED state + * + * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. + * Other signals when not blocked and set to SIG_DFL behaves as follows. + * The job control signals also have other special effects. 
+ * + * +--------------------+------------------+ + * | POSIX signal | default action | + * +--------------------+------------------+ + * | SIGHUP | terminate | + * | SIGINT | terminate | + * | SIGQUIT | coredump | + * | SIGILL | coredump | + * | SIGTRAP | coredump | + * | SIGABRT/SIGIOT | coredump | + * | SIGBUS | coredump | + * | SIGFPE | coredump | + * | SIGKILL | terminate(+) | + * | SIGUSR1 | terminate | + * | SIGSEGV | coredump | + * | SIGUSR2 | terminate | + * | SIGPIPE | terminate | + * | SIGALRM | terminate | + * | SIGTERM | terminate | + * | SIGCHLD | ignore | + * | SIGCONT | ignore(*) | + * | SIGSTOP | stop(*)(+) | + * | SIGTSTP | stop(*) | + * | SIGTTIN | stop(*) | + * | SIGTTOU | stop(*) | + * | SIGURG | ignore | + * | SIGXCPU | coredump | + * | SIGXFSZ | coredump | + * | SIGVTALRM | terminate | + * | SIGPROF | terminate | + * | SIGPOLL/SIGIO | terminate | + * | SIGSYS/SIGUNUSED | coredump | + * | SIGSTKFLT | terminate | + * | SIGWINCH | ignore | + * | SIGPWR | terminate | + * | SIGRTMIN-SIGRTMAX | terminate | + * +--------------------+------------------+ + * | non-POSIX signal | default action | + * +--------------------+------------------+ + * | SIGEMT | coredump | + * +--------------------+------------------+ + * + * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". + * (*) Special job control effects: + * When SIGCONT is sent, it resumes the process (all threads in the group) + * from TASK_STOPPED state and also clears any pending/queued stop signals + * (any of those marked with "stop(*)"). This happens regardless of blocking, + * catching, or ignoring SIGCONT. When any stop signal is sent, it clears + * any pending/queued SIGCONT signals; this happens regardless of blocking, + * catching, or ignored the stop signal, though (except for SIGSTOP) the + * default action of stopping the process may happen later or never. 
+ */ + +#ifdef SIGEMT +#define SIGEMT_MASK rt_sigmask(SIGEMT) +#else +#define SIGEMT_MASK 0 +#endif + +#if SIGRTMIN > BITS_PER_LONG +#define rt_sigmask(sig) (1ULL << ((sig)-1)) +#else +#define rt_sigmask(sig) sigmask(sig) +#endif +#define siginmask(sig, mask) (rt_sigmask(sig) & (mask)) + +#define SIG_KERNEL_ONLY_MASK (\ + rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP)) + +#define SIG_KERNEL_STOP_MASK (\ + rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \ + rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) ) + +#define SIG_KERNEL_COREDUMP_MASK (\ + rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \ + rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \ + rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \ + rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \ + rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \ + SIGEMT_MASK ) + +#define SIG_KERNEL_IGNORE_MASK (\ + rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ + rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) + +#define sig_kernel_only(sig) \ + (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK)) +#define sig_kernel_coredump(sig) \ + (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK)) +#define sig_kernel_ignore(sig) \ + (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_IGNORE_MASK)) +#define sig_kernel_stop(sig) \ + (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK)) + +#define sig_needs_tasklist(sig) ((sig) == SIGCONT) + +#define sig_user_defined(t, signr) \ + (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ + ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) + +#define sig_fatal(t, signr) \ + (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ + (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) + #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNAL_H */ -- cgit v1.2.3 From 18d8362d517cb2bd97761294924fe6c2a6ee5e3c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 9 May 2007 02:33:39 -0700 Subject: mutex_lock_interruptible(): add __must_check It's not sane to use mutex_lock_interruptible() and to then ignore the result. Ditto down_interruptible(), but I'm lazy. Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mutex.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b81bc2adaeff..0d50ea3df689 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -121,11 +121,12 @@ static inline int fastcall mutex_is_locked(struct mutex *lock) * Also see Documentation/mutex-design.txt. */ extern void fastcall mutex_lock(struct mutex *lock); -extern int fastcall mutex_lock_interruptible(struct mutex *lock); +extern int __must_check fastcall mutex_lock_interruptible(struct mutex *lock); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); -extern int mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); +extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass); #else # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) -- cgit v1.2.3 From b89deed32ccc96098bd6bc953c64bba6b847774f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:33:52 -0700 Subject: implement flush_work() A basic problem with flush_scheduled_work() is that it blocks behind _all_ presently-queued works, rather than just the work whcih the caller wants to flush. 
If the caller holds some lock, and if one of the queued work happens to want that lock as well then accidental deadlocks can occur. One example of this is the phy layer: it wants to flush work while holding rtnl_lock(). But if a linkwatch event happens to be queued, the phy code will deadlock because the linkwatch callback function takes rtnl_lock. So we implement a new function which will flush a *single* work - just the one which the caller wants to free up. Thus we avoid the accidental deadlocks which can arise from unrelated subsystems' callbacks taking shared locks. flush_work() non-blockingly dequeues the work_struct which we want to kill, then it waits for its handler to complete on all CPUs. Add ->current_work to the "struct cpu_workqueue_struct", it points to currently running "struct work_struct". When flush_work(work) detects ->current_work == work, it inserts a barrier at the _head_ of ->worklist (and thus right _after_ that work) and waits for completition. This means that the next work fired on that CPU will be this barrier, or another barrier queued by concurrent flush_work(), so the caller of flush_work() will be woken before any "regular" work has a chance to run. When wait_on_work() unlocks workqueue_mutex (or whatever we choose to protect against CPU hotplug), CPU may go away. But in that case take_over_work() will move a barrier we queued to another CPU, it will be fired sometime, and wait_on_work() will be woken. Actually, we are doing cleanup_workqueue_thread()->kthread_stop() before take_over_work(), so cwq->thread should complete its ->worklist (and thus the barrier), because currently we don't check kthread_should_stop() in run_workqueue(). But even if we did, everything should be ok. [akpm@osdl.org: cleanup] [akpm@osdl.org: add flush_work_keventd() wrapper] Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f16ba1e0687d..26a70992dec8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -178,6 +178,8 @@ extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct delay extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq)); +extern void flush_work(struct workqueue_struct *wq, struct work_struct *work); +extern void flush_work_keventd(struct work_struct *work); extern int FASTCALL(schedule_work(struct work_struct *work)); extern int FASTCALL(run_scheduled_work(struct work_struct *work)); @@ -199,7 +201,7 @@ int execute_in_process_context(work_func_t fn, struct execute_work *); * Kill off a pending schedule_delayed_work(). Note that the work callback * function may still be running on return from cancel_delayed_work(), unless * it returns 1 and the work doesn't re-arm itself. Run flush_workqueue() or - * cancel_work_sync() to wait on it. + * flush_work() or cancel_work_sync() to wait on it. */ static inline int cancel_delayed_work(struct delayed_work *work) { -- cgit v1.2.3 From 19a75d83ffeab004cfcfac64024ad3997bac7220 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 9 May 2007 02:33:56 -0700 Subject: kblockd: use flush_work Switch the kblockd flushing from a global flush to a more specific flush_work(). (akpm: bypassed maintainers, sorry. 
There are other patches which depend on this) Cc: "Maciej W. Rozycki" Cc: David Howells Cc: Jens Axboe Cc: Nick Piggin Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a686eabe22d6..db5b00a792f5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -854,7 +854,7 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct work_struct *work); -void kblockd_flush(void); +void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) -- cgit v1.2.3 From 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 9 May 2007 02:34:01 -0700 Subject: relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Signed-off-by: Tom Zanussi Cc: Satyam Sharma Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/relay.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index 759a0f97bec2..6cd8c4425fc7 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -38,7 +39,7 @@ struct rchan_buf size_t subbufs_consumed; /* count of sub-buffers consumed */ struct rchan *chan; /* associated channel */ wait_queue_head_t read_wait; /* reader wait queue */ - struct delayed_work wake_readers; /* reader wake-up work struct */ + struct timer_list timer; /* reader wake-up timer */ struct dentry *dentry; /* channel file dentry */ struct kref kref; /* channel buffer refcount */ struct page **page_array; /* array of current buffer pages */ -- cgit v1.2.3 From 6f7cc11aa6c7d5002e16096c7590944daece70ed Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 9 May 2007 02:34:02 -0700 Subject: Extend notifier_call_chain to count nr_calls made Since 2.6.18-something, the community has been bugged by the problem to provide a clean and a stable mechanism to postpone a cpu-hotplug event as lock_cpu_hotplug was badly broken. This is another proposal towards solving that problem. This one is along the lines of the solution provided in kernel/workqueue.c Instead of having a global mechanism like lock_cpu_hotplug, we allow the subsytems to define their own per-subsystem hot cpu mutexes. These would be taken(released) where ever we are currently calling lock_cpu_hotplug(unlock_cpu_hotplug). Also, in the per-subsystem hotcpu callback function,we take this mutex before we handle any pre-cpu-hotplug events and release it once we finish handling the post-cpu-hotplug events. A standard means for doing this has been provided in [PATCH 2/4] and demonstrated in [PATCH 3/4]. The ordering of these per-subsystem mutexes might still prove to be a problem, but hopefully lockdep should help us get out of that muddle. The patch set to be applied against linux-2.6.19-rc5 is as follows: [PATCH 1/4] : Extend notifier_call_chain with an option to specify the number of notifications to be sent and also count the number of notifications actually sent. 
[PATCH 2/4] : Define events CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE and send out notifications for these in _cpu_up and _cpu_down. This would help us standardise the acquire and release of the subsystem locks in the hotcpu callback functions of these subsystems. [PATCH 3/4] : Eliminate lock_cpu_hotplug from kernel/sched.c. [PATCH 4/4] : In workqueue_cpu_callback function, acquire(release) the workqueue_mutex while handling CPU_LOCK_ACQUIRE(CPU_LOCK_RELEASE). If the per-subsystem-locking approach survives the test of time, we can expect a slow phasing out of lock_cpu_hotplug, which has not yet been eliminated in these patches :) This patch: Provide notifier_call_chain with an option to call only a specified number of notifiers and also record the number of call to notifiers made. The need for this enhancement was identified in the post entitled "Slab - Eliminate lock_cpu_hotplug from slab" (http://lkml.org/lkml/2006/10/28/92) by Ravikiran G Thirumalai and Andrew Morton. This patch adds two additional parameters to notifier_call_chain API namely - int nr_to_calls : Number of notifier_functions to be called. The don't care value is -1. - unsigned int *nr_calls : Records the total number of notifier_funtions called by notifier_call_chain. The don't care value is NULL. [michal.k.k.piotrowski@gmail.com: build fix] Credit: Andrew Morton Signed-off-by: Gautham R Shenoy Signed-off-by: Michal Piotrowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/notifier.h | 52 ++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 10a43ed0527e..e34221bf8946 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -112,32 +112,40 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #ifdef __KERNEL__ -extern int atomic_notifier_chain_register(struct atomic_notifier_head *, - struct notifier_block *); -extern int blocking_notifier_chain_register(struct blocking_notifier_head *, - struct notifier_block *); -extern int raw_notifier_chain_register(struct raw_notifier_head *, - struct notifier_block *); -extern int srcu_notifier_chain_register(struct srcu_notifier_head *, - struct notifier_block *); - -extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *, - struct notifier_block *); -extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *, - struct notifier_block *); -extern int raw_notifier_chain_unregister(struct raw_notifier_head *, - struct notifier_block *); -extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *, - struct notifier_block *); - -extern int atomic_notifier_call_chain(struct atomic_notifier_head *, +extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, + struct notifier_block *nb); +extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *nb); +extern int raw_notifier_chain_register(struct raw_notifier_head *nh, + struct notifier_block *nb); +extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh, + struct notifier_block *nb); + +extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + struct notifier_block *nb); +extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, + struct notifier_block *nb); +extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh, + struct notifier_block *nb); +extern int 
srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, + struct notifier_block *nb); + +extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v); -extern int blocking_notifier_call_chain(struct blocking_notifier_head *, +extern int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, int nr_to_call, int *nr_calls); +extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v); -extern int raw_notifier_call_chain(struct raw_notifier_head *, +extern int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v, int nr_to_call, int *nr_calls); +extern int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v); -extern int srcu_notifier_call_chain(struct srcu_notifier_head *, +extern int __raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v, int nr_to_call, int *nr_calls); +extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v); +extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v, int nr_to_call, int *nr_calls); #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ -- cgit v1.2.3 From baaca49f415b25fdbe2a8f3c22b39929e450fbfd Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 9 May 2007 02:34:03 -0700 Subject: Define and use new events,CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE This is an attempt to provide an alternate mechanism for postponing a hotplug event instead of using a global mechanism like lock_cpu_hotplug. The proposal is to add two new events namely CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE. The notification for these two events would be sent out before and after a cpu_hotplug event respectively. During the CPU_LOCK_ACQUIRE event, a cpu-hotplug-aware subsystem is supposed to acquire any per-subsystem hotcpu mutex ( Eg. workqueue_mutex in kernel/workqueue.c ). During the CPU_LOCK_RELEASE release event the cpu-hotplug-aware subsystem is supposed to release the per-subsystem hotcpu mutex. The reasons for defining new events as opposed to reusing the existing events like CPU_UP_PREPARE/CPU_UP_FAILED/CPU_ONLINE for locking/unlocking of per-subsystem hotcpu mutexes are as follow: - CPU_LOCK_ACQUIRE: All hotcpu mutexes are taken before subsystems start handling pre-hotplug events like CPU_UP_PREPARE/CPU_DOWN_PREPARE etc, thus ensuring a clean handling of these events. - CPU_LOCK_RELEASE: The hotcpu mutexes will be released only after all subsystems have handled post-hotplug events like CPU_DOWN_FAILED, CPU_DEAD,CPU_ONLINE etc thereby ensuring that there are no subsequent clashes amongst the interdependent subsystems after a cpu hotplugs. This patch also uses __raw_notifier_call chain in _cpu_up to take care of the dependency between the two consequetive calls to raw_notifier_call_chain. 
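Sketched in code, a subsystem following this scheme handles the new events in its hotcpu callback roughly like this (the names are invented; the workqueue_mutex handling mentioned above is the in-tree model):

#include <linux/mutex.h>
#include <linux/notifier.h>

static DEFINE_MUTEX(mysubsys_hotcpu_mutex);

static int mysubsys_cpu_callback(struct notifier_block *nfb,
				 unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_LOCK_ACQUIRE:
		mutex_lock(&mysubsys_hotcpu_mutex);
		break;
	case CPU_UP_PREPARE:
	case CPU_DOWN_PREPARE:
		/* pre-hotplug handling runs with the mutex already held */
		break;
	case CPU_LOCK_RELEASE:
		mutex_unlock(&mysubsys_hotcpu_mutex);
		break;
	}
	return NOTIFY_OK;
}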
[akpm@linux-foundation.org: fix a bug] Signed-off-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/notifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index e34221bf8946..1903e5490c04 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -194,6 +194,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ +#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ +#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ -- cgit v1.2.3 From 7097a87afe937a5879528d52880c2d95f089e96c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:10 -0700 Subject: workqueue: kill run_scheduled_work() Because it has no callers. Actually, I think the whole idea of run_scheduled_work() was not right, not good to mix "unqueue this work and execute its ->func()" in one function. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 26a70992dec8..2a58f16e1961 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -182,7 +182,6 @@ extern void flush_work(struct workqueue_struct *wq, struct work_struct *work); extern void flush_work_keventd(struct work_struct *work); extern int FASTCALL(schedule_work(struct work_struct *work)); -extern int FASTCALL(run_scheduled_work(struct work_struct *work)); extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay)); extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay); -- cgit v1.2.3 From 1634c48f8b85dcb05101f1eb2eab9af40b5976da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:18 -0700 Subject: make cancel_rearming_delayed_work() work on any workqueue, not just keventd_wq cancel_rearming_delayed_workqueue(wq, dwork) doesn't need the first parameter. We don't hang on un-queued dwork any longer, and work->data doesn't change its type. This means we can always figure out "wq" from dwork when it is needed. Remove this parameter, and rename the function to cancel_rearming_delayed_work(). Re-create an inline "obsolete" cancel_rearming_delayed_workqueue(wq) which just calls cancel_rearming_delayed_work(). 
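For callers the conversion is mechanical; an (invented) driver that used to pass its workqueue simply drops that argument:

	/* before */
	cancel_rearming_delayed_workqueue(mydev->wq, &mydev->poll_work);

	/* after */
	cancel_rearming_delayed_work(&mydev->poll_work);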
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2a58f16e1961..27110c04f21e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -191,9 +191,6 @@ extern int current_is_keventd(void); extern int keventd_up(void); extern void init_workqueues(void); -void cancel_rearming_delayed_work(struct delayed_work *work); -void cancel_rearming_delayed_workqueue(struct workqueue_struct *, - struct delayed_work *); int execute_in_process_context(work_func_t fn, struct execute_work *); /* @@ -212,4 +209,14 @@ static inline int cancel_delayed_work(struct delayed_work *work) return ret; } +extern void cancel_rearming_delayed_work(struct delayed_work *work); + +/* Obsolete. use cancel_rearming_delayed_work() */ +static inline +void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, + struct delayed_work *work) +{ + cancel_rearming_delayed_work(work); +} + #endif -- cgit v1.2.3 From 23b2e5991afde5af91a1a661d7f47ee56120759e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:19 -0700 Subject: workqueue: kill NOAUTOREL works We don't have any users, and it is not so trivial to use NOAUTOREL works correctly. It is better to simplify API. Delete NOAUTOREL support and rename work_release to work_clear_pending to avoid a confusion. Signed-off-by: Oleg Nesterov Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 64 +++++++---------------------------------------- 1 file changed, 9 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 27110c04f21e..e1581dce5890 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -24,15 +24,13 @@ typedef void (*work_func_t)(struct work_struct *work); struct work_struct { atomic_long_t data; #define WORK_STRUCT_PENDING 0 /* T if work item pending execution */ -#define WORK_STRUCT_NOAUTOREL 1 /* F if work item automatically released on exec */ #define WORK_STRUCT_FLAG_MASK (3UL) #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) struct list_head entry; work_func_t func; }; -#define WORK_DATA_INIT(autorelease) \ - ATOMIC_LONG_INIT((autorelease) << WORK_STRUCT_NOAUTOREL) +#define WORK_DATA_INIT() ATOMIC_LONG_INIT(0) struct delayed_work { struct work_struct work; @@ -44,14 +42,8 @@ struct execute_work { }; #define __WORK_INITIALIZER(n, f) { \ - .data = WORK_DATA_INIT(0), \ - .entry = { &(n).entry, &(n).entry }, \ - .func = (f), \ - } - -#define __WORK_INITIALIZER_NAR(n, f) { \ - .data = WORK_DATA_INIT(1), \ - .entry = { &(n).entry, &(n).entry }, \ + .data = WORK_DATA_INIT(), \ + .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ } @@ -60,23 +52,12 @@ struct execute_work { .timer = TIMER_INITIALIZER(NULL, 0, 0), \ } -#define __DELAYED_WORK_INITIALIZER_NAR(n, f) { \ - .work = __WORK_INITIALIZER_NAR((n).work, (f)), \ - .timer = TIMER_INITIALIZER(NULL, 0, 0), \ - } - #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) -#define DECLARE_WORK_NAR(n, f) \ - struct work_struct n = __WORK_INITIALIZER_NAR(n, f) - #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) -#define DECLARE_DELAYED_WORK_NAR(n, f) \ - struct dwork_struct n = __DELAYED_WORK_INITIALIZER_NAR(n, f) - /* * 
initialize a work item's function pointer */ @@ -95,16 +76,9 @@ struct execute_work { * assignment of the work data initializer allows the compiler * to generate better code. */ -#define INIT_WORK(_work, _func) \ - do { \ - (_work)->data = (atomic_long_t) WORK_DATA_INIT(0); \ - INIT_LIST_HEAD(&(_work)->entry); \ - PREPARE_WORK((_work), (_func)); \ - } while (0) - -#define INIT_WORK_NAR(_work, _func) \ +#define INIT_WORK(_work, _func) \ do { \ - (_work)->data = (atomic_long_t) WORK_DATA_INIT(1); \ + (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ PREPARE_WORK((_work), (_func)); \ } while (0) @@ -115,12 +89,6 @@ struct execute_work { init_timer(&(_work)->timer); \ } while (0) -#define INIT_DELAYED_WORK_NAR(_work, _func) \ - do { \ - INIT_WORK_NAR(&(_work)->work, (_func)); \ - init_timer(&(_work)->timer); \ - } while (0) - #define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ @@ -143,24 +111,10 @@ struct execute_work { work_pending(&(w)->work) /** - * work_release - Release a work item under execution - * @work: The work item to release - * - * This is used to release a work item that has been initialised with automatic - * release mode disabled (WORK_STRUCT_NOAUTOREL is set). This gives the work - * function the opportunity to grab auxiliary data from the container of the - * work_struct before clearing the pending bit as the work_struct may be - * subject to deallocation the moment the pending bit is cleared. - * - * In such a case, this should be called in the work function after it has - * fetched any data it may require from the containter of the work_struct. - * After this function has been called, the work_struct may be scheduled for - * further execution or it may be deallocated unless other precautions are - * taken. - * - * This should also be used to release a delayed work item. + * work_clear_pending - for internal use only, mark a work item as not pending + * @work: The work item in question */ -#define work_release(work) \ +#define work_clear_pending(work) \ clear_bit(WORK_STRUCT_PENDING, work_data_bits(work)) @@ -205,7 +159,7 @@ static inline int cancel_delayed_work(struct delayed_work *work) ret = del_timer(&work->timer); if (ret) - work_release(&work->work); + work_clear_pending(&work->work); return ret; } -- cgit v1.2.3 From 28e53bddf814485699a4142bc056fd37d4e11dd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:22 -0700 Subject: unify flush_work/flush_work_keventd and rename it to cancel_work_sync flush_work(wq, work) doesn't need the first parameter, we can use cwq->wq (this was possible from the very beginnig, I missed this). So we can unify flush_work_keventd and flush_work. Also, rename flush_work() to cancel_work_sync() and fix all callers. Perhaps this is not the best name, but "flush_work" is really bad. (akpm: this is why the earlier patches bypassed maintainers) Signed-off-by: Oleg Nesterov Cc: Jeff Garzik Cc: "David S. 
Miller" Cc: Jens Axboe Cc: Tejun Heo Cc: Auke Kok , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index e1581dce5890..d555f31c0746 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -128,30 +128,33 @@ extern struct workqueue_struct *__create_workqueue(const char *name, extern void destroy_workqueue(struct workqueue_struct *wq); extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work)); -extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay)); +extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *work, unsigned long delay)); extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *work, unsigned long delay); + struct delayed_work *work, unsigned long delay); + extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq)); -extern void flush_work(struct workqueue_struct *wq, struct work_struct *work); -extern void flush_work_keventd(struct work_struct *work); +extern void flush_scheduled_work(void); extern int FASTCALL(schedule_work(struct work_struct *work)); -extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay)); - -extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay); +extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, + unsigned long delay)); +extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, + unsigned long delay); extern int schedule_on_each_cpu(work_func_t func); -extern void flush_scheduled_work(void); extern int current_is_keventd(void); extern int keventd_up(void); extern void init_workqueues(void); int execute_in_process_context(work_func_t fn, struct execute_work *); +extern void cancel_work_sync(struct work_struct *work); + /* * Kill off a pending schedule_delayed_work(). Note that the work callback * function may still be running on return from cancel_delayed_work(), unless * it returns 1 and the work doesn't re-arm itself. Run flush_workqueue() or - * flush_work() or cancel_work_sync() to wait on it. + * cancel_work_sync() to wait on it. */ static inline int cancel_delayed_work(struct delayed_work *work) { -- cgit v1.2.3 From 73c279927f89561ecb45b2dfdf9314bafcfd9f67 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 May 2007 02:34:32 -0700 Subject: kthread: don't depend on work queues Currently there is a circular reference between work queue initialization and kthread initialization. This prevents the kthread infrastructure from initializing until after work queues have been initialized. We want the properties of tasks created with kthread_create to be as close as possible to the init_task and to not be contaminated by user processes. The later we start our kthreadd that creates these tasks the harder it is to avoid contamination from user processes and the more of a mess we have to clean up because the defaults have changed on us. So this patch modifies the kthread support to not use work queues but to instead use a simple list of structures, and to have kthreadd start from init_task immediately after our kernel thread that execs /sbin/init. 
By being a true child of init_task we only have to change those process settings that we want to have different from init_task, such as our process name, the cpus that are allowed, blocking all signals and setting SIGCHLD to SIG_IGN so that all of our children are reaped automatically. By being a true child of init_task we also naturally get our ppid set to 0 and do not wind up as a child of PID == 1. Ensuring that tasks generated by kthread_create will not slow down the functioning of the wait family of functions. [akpm@linux-foundation.org: use interruptible sleeps] Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 1c65e7a9f186..00dd957e245b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -30,4 +30,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu); int kthread_stop(struct task_struct *k); int kthread_should_stop(void); +int kthreadd(void *unused); +extern struct task_struct *kthreadd_task; + #endif /* _LINUX_KTHREAD_H */ -- cgit v1.2.3 From 10ab825bdef8df510f99c703a5a2d9b13a4e31a5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:37 -0700 Subject: change kernel threads to ignore signals instead of blocking them Currently kernel threads use sigprocmask(SIG_BLOCK) to protect against signals. This doesn't prevent the signal delivery, this only blocks signal_wake_up(). Every "killall -33 kthreadd" means a "struct siginfo" leak. Change kthreadd_setup() to set all handlers to SIG_IGN instead of blocking them (make a new helper ignore_signals() for that). If the kernel thread needs some signal, it should use allow_signal() anyway, and in that case it should not use CLONE_SIGHAND. Note that we can't change daemonize() (should die!) in the same way, because it can be used along with CLONE_SIGHAND. This means that allow_signal() still should unblock the signal to work correctly with daemonize()ed threads. However, disallow_signal() doesn't block the signal any longer but ignores it. NOTE: with or without this patch the kernel threads are not protected from handle_stop_signal(), this seems harmless, but not good. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d95c480f58d..28000b1658f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1317,6 +1317,7 @@ extern int in_egroup_p(gid_t); extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); +extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); -- cgit v1.2.3 From 8842c9655b2b7f0e8e6c50a773b649e5d8a57678 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 9 May 2007 02:34:46 -0700 Subject: remove nfs4_acl_add_ace() nfs4_acl_add_ace() can now be removed. Signed-off-by: Adrian Bunk Acked-by: Neil Brown Acked-by: J. 
Bruce Fields Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nfs4_acl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h index 409b6e02f337..c9c05a78e9bb 100644 --- a/include/linux/nfs4_acl.h +++ b/include/linux/nfs4_acl.h @@ -44,7 +44,6 @@ #define NFS4_ACL_MAX 170 struct nfs4_acl *nfs4_acl_new(int); -void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); int nfs4_acl_get_whotype(char *, u32); int nfs4_acl_write_who(int who, char *p); int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, -- cgit v1.2.3 From 7ac1bea5507218da03f6005d228789da5a831c3f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 9 May 2007 02:34:48 -0700 Subject: knfsd: rename sk_defer_lock to sk_lock Now that sk_defer_lock protects two different things, make the name more generic. Also don't bother with disabling _bh as the lock is only ever taken from process context. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/svcsock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 7909687557bf..e21dd93ac4b7 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -37,7 +37,8 @@ struct svc_sock { atomic_t sk_reserved; /* space on outq that is reserved */ - spinlock_t sk_defer_lock; /* protects sk_deferred */ + spinlock_t sk_lock; /* protects sk_deferred and + * sk_info_authunix */ struct list_head sk_deferred; /* deferred requests that need to * be revisted */ struct mutex sk_mutex; /* to serialize sending data */ -- cgit v1.2.3 From cd123012d99fde4759500fee611e724e4f3016e3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 9 May 2007 02:34:50 -0700 Subject: RPC: add wrapper for svc_reserve to account for checksum When the kernel calls svc_reserve to downsize the expected size of an RPC reply, it fails to account for the possibility of a checksum at the end of the packet. If a client mounts a NFSv2/3 with sec=krb5i/p, and does I/O then you'll generally see messages similar to this in the server's ring buffer: RPC request reserved 164 but used 208 While I was never able to verify it, I suspect that this problem is also the root cause of some oopses I've seen under these conditions: https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=227726 This is probably also a problem for other sec= types and for NFSv4. The large reserved size for NFSv4 compound packets seems to generally paper over the problem, however. This patch adds a wrapper for svc_reserve that accounts for the possibility of a checksum. It also fixes up the appropriate callers of svc_reserve to call the wrapper. For now, it just uses a hardcoded value that I determined via testing. That value may need to be revised upward as things change, or we may want to eventually add a new auth_op that attempts to calculate this somehow. Unfortunately, there doesn't seem to be a good way to reliably determine the expected checksum length prior to actually calculating it, particularly with schemes like spkm3. Signed-off-by: Jeff Layton Acked-by: Neil Brown Cc: Trond Myklebust Acked-by: J. 
Bruce Fields Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/svc.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 35fa4d5aadd0..4a7ae8ab6eb8 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -396,4 +396,23 @@ char * svc_print_addr(struct svc_rqst *, char *, size_t); #define RPC_MAX_ADDRBUFLEN (63U) +/* + * When we want to reduce the size of the reserved space in the response + * buffer, we need to take into account the size of any checksum data that + * may be at the end of the packet. This is difficult to determine exactly + * for all cases without actually generating the checksum, so we just use a + * static value. + */ +static inline void +svc_reserve_auth(struct svc_rqst *rqstp, int space) +{ + int added_space = 0; + + switch(rqstp->rq_authop->flavour) { + case RPC_AUTH_GSS: + added_space = RPC_MAX_AUTH_SIZE; + } + return svc_reserve(rqstp, space + added_space); +} + #endif /* SUNRPC_SVC_H */ -- cgit v1.2.3 From b8522ead3534c6cd06752b47a3bc380956191a2a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 9 May 2007 02:34:58 -0700 Subject: aio is unlikely Stick an unlikely() around is_aio(): I assert that most IO is synchronous. Cc: Suparna Bhattacharya Cc: Ingo Molnar Cc: Benjamin LaHaise Cc: Zach Brown Cc: Ulrich Drepper Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/aio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/aio.h b/include/linux/aio.h index a30ef13c9e62..43dc2ebfaa0e 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -226,7 +226,8 @@ int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, __put_ioctx(kioctx); \ } while (0) -#define in_aio() !is_sync_wait(current->io_wait) +#define in_aio() (unlikely(!is_sync_wait(current->io_wait))) + /* may be used for debugging */ #define warn_if_async() \ do { \ -- cgit v1.2.3 From f34c506b0385b43abd25c490335036ecbb173aed Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 9 May 2007 02:34:59 -0700 Subject: declare struct ktime Some smarty went and inflicted ktime_t as a typedef upon us, so we cannot forward declare it. Create a new `union ktime', map ktime_t onto that. Now we need to kill off this ktime_t thing. Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ktime.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 81bb9c7a4eb3..c762954bda14 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -43,7 +43,7 @@ * plain scalar nanosecond based representation can be selected by the * config switch CONFIG_KTIME_SCALAR. */ -typedef union { +union ktime { s64 tv64; #if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR) struct { @@ -54,7 +54,9 @@ typedef union { # endif } tv; #endif -} ktime_t; +}; + +typedef union ktime ktime_t; /* Kill this */ #define KTIME_MAX ((s64)~((u64)1 << 63)) #if (BITS_PER_LONG == 64) -- cgit v1.2.3 From c19384b5b296905d4988c7c684ff540a0f9d65be Mon Sep 17 00:00:00 2001 From: Pierre Peiffer Date: Wed, 9 May 2007 02:35:02 -0700 Subject: Make futex_wait() use an hrtimer for timeout This patch modifies futex_wait() to use an hrtimer + schedule() in place of schedule_timeout(). 
schedule_timeout() is tick based, therefore the timeout granularity is the tick (1 ms, 4 ms or 10 ms depending on HZ). By using a high resolution timer for timeout wakeup, we can attain a much finer timeout granularity (in the microsecond range). This parallels what is already done for futex_lock_pi(). The timeout passed to the syscall is no longer converted to jiffies and is therefore passed to do_futex() and futex_wait() as an absolute ktime_t therefore keeping nanosecond resolution. Also this removes the need to pass the nanoseconds timeout part to futex_lock_pi() in val2. In futex_wait(), if there is no timeout then a regular schedule() is performed. Otherwise, an hrtimer is fired before schedule() is called. [akpm@linux-foundation.org: fix `make headers_check'] Signed-off-by: Sebastien Dugue Signed-off-by: Pierre Peiffer Cc: Ingo Molnar Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/futex.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 820125c628c1..34e54f2b8997 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -3,6 +3,8 @@ #include +union ktime; + /* Second argument to futex syscall */ @@ -94,7 +96,7 @@ struct robust_list_head { #define ROBUST_LIST_LIMIT 2048 #ifdef __KERNEL__ -long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, +long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout, u32 __user *uaddr2, u32 val2, u32 val3); extern int -- cgit v1.2.3 From d0aa7a70bf03b9de9e995ab272293be1f7937822 Mon Sep 17 00:00:00 2001 From: Pierre Peiffer Date: Wed, 9 May 2007 02:35:02 -0700 Subject: futex_requeue_pi optimization This patch provides the futex_requeue_pi functionality, which allows some threads waiting on a normal futex to be requeued on the wait-queue of a PI-futex. This provides an optimization, already used for (normal) futexes, to be used with the PI-futexes. This optimization is currently used by the glibc in pthread_broadcast, when using "normal" mutexes. With futex_requeue_pi, it can be used with PRIO_INHERIT mutexes too. Signed-off-by: Pierre Peiffer Cc: Ingo Molnar Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/futex.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 34e54f2b8997..1bd8dfcb037b 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -17,6 +17,7 @@ union ktime; #define FUTEX_LOCK_PI 6 #define FUTEX_UNLOCK_PI 7 #define FUTEX_TRYLOCK_PI 8 +#define FUTEX_CMP_REQUEUE_PI 9 /* * Support for robust futexes: the kernel cleans up held futexes at @@ -84,10 +85,15 @@ struct robust_list_head { */ #define FUTEX_OWNER_DIED 0x40000000 +/* + * Some processes have been requeued on this PI-futex + */ +#define FUTEX_WAITER_REQUEUED 0x20000000 + /* * The rest of the robust-futex field is for the TID: */ -#define FUTEX_TID_MASK 0x3fffffff +#define FUTEX_TID_MASK 0x0fffffff /* * This limit protects against a deliberately circular list. @@ -111,6 +117,7 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); * We set bit 0 to indicate if it's an inode-based key. 
*/ union futex_key { + u32 __user *uaddr; struct { unsigned long pgoff; struct inode *inode; -- cgit v1.2.3 From 34f01cc1f512fa783302982776895c73714ebbc2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 May 2007 02:35:04 -0700 Subject: FUTEX: new PRIVATE futexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Analysis of current linux futex code : -------------------------------------- A central hash table futex_queues[] holds all contexts (futex_q) of waiting threads. Each futex_wait()/futex_wait() has to obtain a spinlock on a hash slot to perform lookups or insert/deletion of a futex_q. When a futex_wait() is done, calling thread has to : 1) - Obtain a read lock on mmap_sem to be able to validate the user pointer (calling find_vma()). This validation tells us if the futex uses an inode based store (mapped file), or mm based store (anonymous mem) 2) - compute a hash key 3) - Atomic increment of reference counter on an inode or a mm_struct 4) - lock part of futex_queues[] hash table 5) - perform the test on value of futex. (rollback is value != expected_value, returns EWOULDBLOCK) (various loops if test triggers mm faults) 6) queue the context into hash table, release the lock got in 4) 7) - release the read_lock on mmap_sem 8) Eventually unqueue the context (but rarely, as this part  may be done by the futex_wake()) Futexes were designed to improve scalability but current implementation has various problems : - Central hashtable : This means scalability problems if many processes/threads want to use futexes at the same time. This means NUMA unbalance because this hashtable is located on one node. - Using mmap_sem on every futex() syscall : Even if mmap_sem is a rw_semaphore, up_read()/down_read() are doing atomic ops on mmap_sem, dirtying cache line : - lot of cache line ping pongs on SMP configurations. mmap_sem is also extensively used by mm code (page faults, mmap()/munmap()) Highly threaded processes might suffer from mmap_sem contention. mmap_sem is also used by oprofile code. Enabling oprofile hurts threaded programs because of contention on the mmap_sem cache line. - Using an atomic_inc()/atomic_dec() on inode ref counter or mm ref counter: It's also a cache line ping pong on SMP. It also increases mmap_sem hold time because of cache misses. Most of these scalability problems come from the fact that futexes are in one global namespace. As we use a central hash table, we must make sure they are all using the same reference (given by the mm subsystem). We chose to force all futexes be 'shared'. This has a cost. But fact is POSIX defined PRIVATE and SHARED, allowing clear separation, and optimal performance if carefuly implemented. Time has come for linux to have better threading performance. The goal is to permit new futex commands to avoid : - Taking the mmap_sem semaphore, conflicting with other subsystems. - Modifying a ref_count on mm or an inode, still conflicting with mm or fs. This is possible because, for one process using PTHREAD_PROCESS_PRIVATE futexes, we only need to distinguish futexes by their virtual address, no matter the underlying mm storage is. If glibc wants to exploit this new infrastructure, it should use new _PRIVATE futex subcommands for PTHREAD_PROCESS_PRIVATE futexes. And be prepared to fallback on old subcommands for old kernels. Using one global variable with the FUTEX_PRIVATE_FLAG or 0 value should be OK. PTHREAD_PROCESS_SHARED futexes should still use the old subcommands. 
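In userspace terms, the fallback strategy sketched above could look roughly like this (illustrative only, not glibc source; probing the kernel for private-opcode support is left out):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* FUTEX_PRIVATE_FLAG on new kernels, cleared to 0 once an old kernel
 * rejects the private opcodes */
static int futex_private_flag = FUTEX_PRIVATE_FLAG;

static long futex_wait_private(int *uaddr, int val)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT | futex_private_flag,
		       val, NULL, NULL, 0);
}

static long futex_wake_private(int *uaddr, int nr_wake)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE | futex_private_flag,
		       nr_wake, NULL, NULL, 0);
}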
Compatibility with old applications is preserved, they still hit the scalability problems, but new applications can fly :) Note : the same SHARED futex (mapped on a file) can be used by old binaries *and* new binaries, because both binaries will use the old subcommands. Note : Vast majority of futexes should be using PROCESS_PRIVATE semantic, as this is the default semantic. Almost all applications should benefit of this changes (new kernel and updated libc) Some bench results on a Pentium M 1.6 GHz (SMP kernel on a UP machine) /* calling futex_wait(addr, value) with value != *addr */ 433 cycles per futex(FUTEX_WAIT) call (mixing 2 futexes) 424 cycles per futex(FUTEX_WAIT) call (using one futex) 334 cycles per futex(FUTEX_WAIT_PRIVATE) call (mixing 2 futexes) 334 cycles per futex(FUTEX_WAIT_PRIVATE) call (using one futex) For reference : 187 cycles per getppid() call 188 cycles per umask() call 181 cycles per ni_syscall() call Signed-off-by: Eric Dumazet Pierre Peiffer Cc: "Ulrich Drepper" Cc: "Nick Piggin" Cc: "Ingo Molnar" Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/futex.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 1bd8dfcb037b..899fc7f20edd 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -19,6 +19,18 @@ union ktime; #define FUTEX_TRYLOCK_PI 8 #define FUTEX_CMP_REQUEUE_PI 9 +#define FUTEX_PRIVATE_FLAG 128 +#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG + +#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) +#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG) +#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG) +#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -114,8 +126,18 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); * Don't rearrange members without looking at hash_futex(). * * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. - * We set bit 0 to indicate if it's an inode-based key. 
- */ + * We use the two low order bits of offset to tell what is the kind of key : + * 00 : Private process futex (PTHREAD_PROCESS_PRIVATE) + * (no reference on an inode or mm) + * 01 : Shared futex (PTHREAD_PROCESS_SHARED) + * mapped on a file (reference on the underlying inode) + * 10 : Shared futex (PTHREAD_PROCESS_SHARED) + * (but private mapping on an mm, and reference taken on it) +*/ + +#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */ +#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */ + union futex_key { u32 __user *uaddr; struct { @@ -134,7 +156,8 @@ union futex_key { int offset; } both; }; -int get_futex_key(u32 __user *uaddr, union futex_key *key); +int get_futex_key(u32 __user *uaddr, struct rw_semaphore *shared, + union futex_key *key); void get_futex_key_refs(union futex_key *key); void drop_futex_key_refs(union futex_key *key); -- cgit v1.2.3 From 01f2705daf5a36208e69d7cf95db9c330f843af6 Mon Sep 17 00:00:00 2001 From: Nate Diller Date: Wed, 9 May 2007 02:35:07 -0700 Subject: fs: convert core functions to zero_user_page It's very common for file systems to need to zero part or all of a page, the simplist way is just to use kmap_atomic() and memset(). There's actually a library function in include/linux/highmem.h that does exactly that, but it's confusingly named memclear_highpage_flush(), which is descriptive of *how* it does the work rather than what the *purpose* is. So this patchset renames the function to zero_user_page(), and calls it from the various places that currently open code it. This first patch introduces the new function call, and converts all the core kernel callsites, both the open-coded ones and the old memclear_highpage_flush() ones. Following this patch is a series of conversions for each file system individually, per AKPM, and finally a patch deprecating the old call. The diffstat below shows the entire patchset. [akpm@linux-foundation.org: fix a few things] Signed-off-by: Nate Diller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index a515eb0afdfb..b5f2ab42d984 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -94,17 +94,27 @@ static inline void clear_highpage(struct page *page) /* * Same but also flushes aliased cache contents to RAM. 
+ * + * This must be a macro because KM_USER0 and friends aren't defined if + * !CONFIG_HIGHMEM */ -static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size) +#define zero_user_page(page, offset, size, km_type) \ + do { \ + void *kaddr; \ + \ + BUG_ON((offset) + (size) > PAGE_SIZE); \ + \ + kaddr = kmap_atomic(page, km_type); \ + memset((char *)kaddr + (offset), 0, (size)); \ + flush_dcache_page(page); \ + kunmap_atomic(kaddr, (km_type)); \ + } while (0) + + +static inline void memclear_highpage_flush(struct page *page, + unsigned int offset, unsigned int size) { - void *kaddr; - - BUG_ON(offset + size > PAGE_SIZE); - - kaddr = kmap_atomic(page, KM_USER0); - memset((char *)kaddr + offset, 0, size); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_page(page, offset, size, KM_USER0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE -- cgit v1.2.3 From f37bc2712b54ec641e0c0c8634f1a4b61d9956c0 Mon Sep 17 00:00:00 2001 From: Nate Diller Date: Wed, 9 May 2007 02:35:09 -0700 Subject: fs: deprecate memclear_highpage_flush Now that all the in-tree users are converted over to zero_user_page(), deprecate the old memclear_highpage_flush() call. Signed-off-by: Nate Diller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b5f2ab42d984..98e2cce996a4 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -110,8 +110,7 @@ static inline void clear_highpage(struct page *page) kunmap_atomic(kaddr, (km_type)); \ } while (0) - -static inline void memclear_highpage_flush(struct page *page, +static inline void __deprecated memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size) { zero_user_page(page, offset, size, KM_USER0); -- cgit v1.2.3 From 8bb7844286fb8c9fce6f65d8288aeb09d03a5e0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 May 2007 02:35:10 -0700 Subject: Add suspend-related notifications for CPU hotplug Since nonboot CPUs are now disabled after tasks and devices have been frozen and the CPU hotplug infrastructure is used for this purpose, we need special CPU hotplug notifications that will help the CPU-hotplug-aware subsystems distinguish normal CPU hotplug events from CPU hotplug events related to a system-wide suspend or resume operation in progress. This patch introduces such notifications and causes them to be used during suspend and resume transitions. It also changes all of the CPU-hotplug-aware subsystems to take these notifications into consideration (for now they are handled in the same way as the corresponding "normal" ones). [oleg@tv-sign.ru: cleanups] Signed-off-by: Rafael J. 
Wysocki Cc: Gautham R Shenoy Cc: Pavel Machek Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/notifier.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 1903e5490c04..9431101bf876 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -197,5 +197,17 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ #define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ +/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend + * operation in progress + */ +#define CPU_TASKS_FROZEN 0x0010 + +#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN) +#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN) +#define CPU_UP_CANCELED_FROZEN (CPU_UP_CANCELED | CPU_TASKS_FROZEN) +#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) +#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) +#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) + #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ -- cgit v1.2.3 From d1187ed21026fd512b87851d0ca26d9ae16f9059 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 9 May 2007 02:35:12 -0700 Subject: vmstat: use our own timer events vmstat is currently using the cache reaper to periodically bring the statistics up to date. The cache reaper only exists in SLUB as a way to provide compatibility with SLAB. This patch removes the vmstat calls from the slab allocators and provides its own handling. The advantage is also that we can use a different frequency for the updates. Refreshing vm stats is a pretty fast job, so we can run this every second and stagger it by only one tick. This will lead to some overlap in large systems. For example, a system running at 250 HZ with 1024 processors will have 4 vm updates occurring at once. However, the vm stats update only accesses per node information. It is only necessary to stagger the vm statistics updates per processor in each node. Vm counter updates occurring on distant nodes will not cause cacheline contention. We could implement an alternate approach that runs the first processor on each node at the second and then each of the other processors on a node on a subsequent tick. That may be useful to keep a large amount of the second free of timer activity. Maybe the timer folks will have some feedback on this one?
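The per-cpu staggering described above lives in mm/vmstat.c, which is outside this include/linux-only diff; a minimal sketch of the idea (vmstat_work and vmstat_update() are assumed names, not taken from this log) could look like:

#include <linux/workqueue.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct delayed_work, vmstat_work);

/* Assumed worker: folds the per-cpu counter diffs into the zone counters. */
static void vmstat_update(struct work_struct *w);

/* Arm one deferred work item per CPU, offset by one tick per CPU number,
 * so the once-a-second updates do not all fire on the same jiffy. */
static void start_cpu_timer(int cpu)
{
	struct delayed_work *work = &per_cpu(vmstat_work, cpu);

	INIT_DELAYED_WORK(work, vmstat_update);
	schedule_delayed_work_on(cpu, work, HZ + cpu);
}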
[jirislaby@gmail.com: add missing break] Cc: Arjan van de Ven Signed-off-by: Christoph Lameter Signed-off-by: Jiri Slaby Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index acb1f105870c..d9325cf8a134 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -212,8 +212,6 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); -void refresh_vm_stats(void); - #else /* CONFIG_SMP */ /* @@ -260,7 +258,6 @@ static inline void __dec_zone_page_state(struct page *page, #define mod_zone_page_state __mod_zone_page_state static inline void refresh_cpu_vm_stats(int cpu) { } -static inline void refresh_vm_stats(void) { } #endif #endif /* _LINUX_VMSTAT_H */ -- cgit v1.2.3 From 4037d452202e34214e8a939fa5621b2b3bbb45b7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 9 May 2007 02:35:14 -0700 Subject: Move remote node draining out of slab allocators Currently the slab allocators contain callbacks into the page allocator to perform the draining of pagesets on remote nodes. This requires SLUB to have a whole subsystem in order to be compatible with SLAB. Moving node draining out of the slab allocators avoids a section of code in SLUB. Move the node draining so that it is done when the vm statistics are updated. At that point we are already touching all the cachelines with the pagesets of a processor. Add an expire counter there. If we have to update per zone or global vm statistics then assume that the pageset will require subsequent draining. The expire counter will be decremented on each vm stats update pass until it reaches zero. Then we will drain one batch from the pageset. The draining will cause vm counter updates which will then cause another expiration until the pcp is empty. So we will drain a batch every 3 seconds. Note that remote node draining is a somewhat esoteric feature that is required on large NUMA systems because otherwise significant portions of system memory can become trapped in pcp queues. The number of pcps is determined by the number of processors and nodes in a system. A system with 4 processors and 2 nodes has 8 pcps, which is okay. But a system with 1024 processors and 512 nodes has 512k pcps with a high potential for a large amount of memory being caught in them. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 +----- include/linux/mmzone.h | 3 +++ 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 97a36c3d96e2..0d2ef0b082a6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page)); #define free_page(addr) free_pages((addr),0) void page_alloc_init(void); -#ifdef CONFIG_NUMA -void drain_node_pages(int node); -#else -static inline void drain_node_pages(int node) { }; -#endif +void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); #endif /* __LINUX_GFP_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f1544e83042..d09b1345a3a1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,6 +83,9 @@ struct per_cpu_pages { struct per_cpu_pageset { struct per_cpu_pages pcp[2]; /* 0: hot.
1: cold */ +#ifdef CONFIG_NUMA + s8 expire; +#endif #ifdef CONFIG_SMP s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; -- cgit v1.2.3 From b52f52a093bb1e841e014c2087b5bee7162da413 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 May 2007 02:35:15 -0700 Subject: clocksource: fix resume logic We need to make sure that the clocksources are resumed when timekeeping is resumed. The current resume logic does not guarantee this. Add a resume function pointer to the clocksource struct, so clocksource drivers which need to reinitialize the clocksource can provide a resume function. Add a resume function, which calls the clocksource resume functions where available and resets the watchdog, so a stable TSC can be used across suspend/resume. Signed-off-by: Thomas Gleixner Cc: john stultz Cc: Andi Kleen Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/clocksource.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 2665ca04cf8f..bf297b03a4e4 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -49,6 +49,7 @@ struct clocksource; * @shift: cycle to nanosecond divisor (power of two) * @flags: flags describing special properties * @vread: vsyscall based read + * @resume: resume function for the clocksource, if necessary * @cycle_interval: Used internally by timekeeping core, please ignore. * @xtime_interval: Used internally by timekeeping core, please ignore. */ @@ -65,6 +66,7 @@ struct clocksource { u32 shift; unsigned long flags; cycle_t (*vread)(void); + void (*resume)(void); /* timekeeping specific data, ignore */ cycle_t cycle_interval; @@ -209,6 +211,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c, extern int clocksource_register(struct clocksource*); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); +extern void clocksource_resume(void); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); -- cgit v1.2.3 From e61a1c1c4f240cec61300c8f27518c3e47570fd4 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 9 May 2007 02:35:15 -0700 Subject: Allow arch to initialize arch field of the module structure This will later allow an arch to add module specific information via linker generated tables instead of poking directly into the module object structure. Signed-off-by: Roman Zippel Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/module.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 6d3dc9c4ff96..792d483c9af7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -356,6 +356,9 @@ struct module keeping pointers to this stuff */ char *args; }; +#ifndef MODULE_ARCH_INIT +#define MODULE_ARCH_INIT {} +#endif /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail.
But presently too much code -- cgit v1.2.3 From f7e4217b007d1f73e7e3cf10ba4fea4a608c603f Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 9 May 2007 02:35:17 -0700 Subject: rename thread_info to stack This finally renames the thread_info field in task structure to stack, so that the assumptions about this field are gone and archs have more freedom about placing the thread_info structure. Nonbroken archs which have a proper thread pointer can do the access to both current thread and task structure via a single pointer. It'll allow for a few more cleanups of the fork code, from which e.g. ia64 could benefit. Signed-off-by: Roman Zippel [akpm@linux-foundation.org: build fix] Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Haavard Skinnemoen Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: "Luck, Tony" Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: Greg Ungerer Cc: Ralf Baechle Cc: Ralf Baechle Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Andi Kleen Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 2 +- include/linux/sched.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 795102309bf1..45170b2fa253 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -95,7 +95,7 @@ extern struct group_info init_groups; #define INIT_TASK(tsk) \ { \ .state = 0, \ - .thread_info = &init_thread_info, \ + .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 28000b1658f9..17b72d88c4cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -817,7 +817,7 @@ struct prio_array; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ - struct thread_info *thread_info; + void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ unsigned int ptrace; @@ -1513,8 +1513,8 @@ static inline void unlock_task_sighand(struct task_struct *tsk, #ifndef __HAVE_THREAD_FUNCTIONS -#define task_thread_info(task) (task)->thread_info -#define task_stack_page(task) ((void*)((task)->thread_info)) +#define task_thread_info(task) ((struct thread_info *)(task)->stack) +#define task_stack_page(task) ((task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { @@ -1524,7 +1524,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct static inline unsigned long *end_of_stack(struct task_struct *p) { - return (unsigned long *)(p->thread_info + 1); + return (unsigned long *)(task_thread_info(p) + 1); } #endif -- cgit v1.2.3 From 0d7ebbbc6eaa5539f78ab20ed6ff1725a4e332ef Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 9 May 2007 02:35:27 -0700 Subject: compiler: introduce __used and __maybe_unused __used is defined to be __attribute__((unused)) for all pre-3.3 gcc compilers to suppress warnings for unused functions because perhaps they are referenced only in inline assembly. It is defined to be __attribute__((used)) for gcc 3.3 and later so that the code is still emitted for such functions. 
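As a usage illustration (hypothetical code, not part of this patch): a static function that is referenced only from inline assembly has no C-level callers, so the compiler would normally drop it; marking it __used keeps it in the object file:

/* No C caller; only referenced by name from an asm() statement elsewhere. */
static void __used asm_only_helper(void)
{
	/* ... */
}

Existing code that still says __attribute_used__ keeps building, since that macro is now defined in terms of __used.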
__maybe_unused is defined to be __attribute__((unused)) for both function and variable use if it could possibly be unreferenced due to the evaluation of preprocessor macros. Function prototypes shall be marked with __maybe_unused if the actual definition of the function is dependant on preprocessor macros. No update to compiler-intel.h is necessary because ICC supports both __attribute__((used)) and __attribute__((unused)) as specified by the gcc manual. __attribute_used__ is deprecated and will be removed once all current code is converted to using __used. Cc: Rusty Russell Cc: Adrian Bunk Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 1 + include/linux/compiler-gcc3.h | 6 ++++-- include/linux/compiler-gcc4.h | 3 ++- include/linux/compiler.h | 21 ++++++++++++++++++--- 4 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index a9f794716a81..03ec2311fb29 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -40,3 +40,4 @@ #define noinline __attribute__((noinline)) #define __attribute_pure__ __attribute__((pure)) #define __attribute_const__ __attribute__((__const__)) +#define __maybe_unused __attribute__((unused)) diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h index ecd621fd27d2..a9e2863c2dbf 100644 --- a/include/linux/compiler-gcc3.h +++ b/include/linux/compiler-gcc3.h @@ -4,9 +4,11 @@ #include #if __GNUC_MINOR__ >= 3 -# define __attribute_used__ __attribute__((__used__)) +# define __used __attribute__((__used__)) +# define __attribute_used__ __used /* deprecated */ #else -# define __attribute_used__ __attribute__((__unused__)) +# define __used __attribute__((__unused__)) +# define __attribute_used__ __used /* deprecated */ #endif #if __GNUC_MINOR__ >= 4 diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index fd0cc7c4a636..a03e9398a6c2 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -12,7 +12,8 @@ # define __inline __inline __attribute__((always_inline)) #endif -#define __attribute_used__ __attribute__((__used__)) +#define __used __attribute__((__used__)) +#define __attribute_used__ __used /* deprecated */ #define __must_check __attribute__((warn_unused_result)) #define __compiler_offsetof(a,b) __builtin_offsetof(a,b) #define __always_inline inline __attribute__((always_inline)) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 3b6949b41745..498c35920762 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -108,15 +108,30 @@ extern void __chk_io_ptr(const void __iomem *); * Allow us to avoid 'defined but not used' warnings on functions and data, * as well as force them to be emitted to the assembly file. * - * As of gcc 3.3, static functions that are not marked with attribute((used)) - * may be elided from the assembly file. As of gcc 3.3, static data not so + * As of gcc 3.4, static functions that are not marked with attribute((used)) + * may be elided from the assembly file. As of gcc 3.4, static data not so * marked will not be elided, but this may change in a future gcc version. * + * NOTE: Because distributions shipped with a backported unit-at-a-time + * compiler in gcc 3.3, we must define __used to be __attribute__((used)) + * for gcc >=3.3 instead of 3.4. 
+ * * In prior versions of gcc, such functions and data would be emitted, but * would be warned about except with attribute((unused)). + * + * Mark functions that are referenced only in inline assembly as __used so + * the code is emitted even though it appears to be unreferenced. */ #ifndef __attribute_used__ -# define __attribute_used__ /* unimplemented */ +# define __attribute_used__ /* deprecated */ +#endif + +#ifndef __used +# define __used /* unimplemented */ +#endif + +#ifndef __maybe_unused +# define __maybe_unused /* unimplemented */ #endif /* -- cgit v1.2.3 From 5a87ede94595f58934000e26e8b13398e63868b5 Mon Sep 17 00:00:00 2001 From: "Antonino A. Daplas" Date: Wed, 9 May 2007 02:35:32 -0700 Subject: svgalib: move fb_get_caps to svgalib Move fb_get_caps() method to svgalib.c as svga_get_caps() so it can be used by s3fb, arkfb and vt8623fb. Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/svga.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/svga.h b/include/linux/svga.h index e1cc552e04fe..13ad0b82ac28 100644 --- a/include/linux/svga.h +++ b/include/linux/svga.h @@ -113,6 +113,8 @@ void svga_tilefill(struct fb_info *info, struct fb_tilerect *rect); void svga_tileblit(struct fb_info *info, struct fb_tileblit *blit); void svga_tilecursor(struct fb_info *info, struct fb_tilecursor *cursor); int svga_get_tilemax(struct fb_info *info); +void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps, + struct fb_var_screeninfo *var); int svga_compute_pll(const struct svga_pll *pll, u32 f_wanted, u16 *m, u16 *n, u16 *r, int node); int svga_check_timings(const struct svga_timing_regs *tm, struct fb_var_screeninfo *var, int node); -- cgit v1.2.3 From 880169dd2edc4297b7811a0542be9766ca6945bc Mon Sep 17 00:00:00 2001 From: Haavard Skinnemoen Date: Wed, 9 May 2007 02:35:33 -0700 Subject: fbdev: add support for AVR32 Provide framebuffer page protection flags and definitions of fb_readl/fb_writel for AVR32. Signed-off-by: Haavard Skinnemoen Cc: "Antonino A. Daplas" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index dff7a728948c..c654d0e9ce33 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -868,7 +868,7 @@ struct fb_info { #define fb_writeq sbus_writeq #define fb_memset sbus_memset_io -#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) +#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) || defined(__avr32__) #define fb_readb __raw_readb #define fb_readw __raw_readw -- cgit v1.2.3 From 5b479c91da90eef605f851508744bfe8269591a0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 9 May 2007 02:35:39 -0700 Subject: md: improve partition detection in md array md currently uses ->media_changed to make sure rescan_partitions is call on md array after they are assembled. However that doesn't happen until the array is opened, which is later than some people would like. So use blkdev_ioctl to do the rescan immediately that the array has been assembled. This means we can remove all the ->change infrastructure as it was only used to trigger a partition rescan. 
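The md.c side of the change is not visible in this include/linux-only diff, but it boils down to something like the sketch below (a hedged reconstruction; the 2.6.21-era blkdev_ioctl() taking an inode is assumed). The revert at the end of this log explains why this path turned out to oops:

/* Sketch: rescan partitions as soon as the array has been assembled,
 * instead of waiting for the first open of the md device. */
struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
if (bdev) {
	blkdev_ioctl(bdev->bd_inode, NULL, BLKRRPART, 0);
	bdput(bdev);
}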
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index de72c49747c8..a121f36f4437 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -201,7 +201,6 @@ struct mddev_s struct mutex reconfig_mutex; atomic_t active; - int changed; /* true if we might need to reread partition info */ int degraded; /* whether md should consider * adding a spare */ -- cgit v1.2.3 From 18137207236285989dfc0ee7f929b954199228f3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:07 +0200 Subject: ide: fix UDMA/MWDMA/SWDMA masks (v3) * use 0x00 instead of 0x80 to disable ->{ultra,mwdma,swdma}_mask * add udma_mask field to ide_pci_device_t and use it to initialize ->ultra_mask in aec62xx, cmd64x, pdc202xx_{new,old} and piix drivers * fix UDMA masks to match with chipset specific *_ratemask() (alim15x3, hpt366, serverworks and siimage drivers need UDMA mask filtering method - done in the next patch) v2: * piix: fix cable detection for 82801AA_1 and 82372FB_1 [ Noticed by Sergei Shtylyov . ] * cmd64x: use hwif->cds->udma_mask [ Suggested by Sergei Shtylyov . ] * aec62xx: fix newly introduced bug - check DMA status not command register [ Noticed by Sergei Shtylyov . ] v3: * piix: use hwif->cds->udma_mask [ Suggested by Sergei Shtylyov . ] Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 418dfb5adadd..c9375c863584 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1257,6 +1257,7 @@ typedef struct ide_pci_device_s { unsigned int extra; struct ide_pci_device_s *next; u8 flags; + u8 udma_mask; } ide_pci_device_t; extern int ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *); -- cgit v1.2.3 From 2d5eaa6dd744a641e75503232a01f52d0768884c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:08 +0200 Subject: ide: rework the code for selecting the best DMA transfer mode (v3) Depends on the "ide: fix UDMA/MWDMA/SWDMA masks" patch. * add ide_hwif_t.udma_filter hook for filtering UDMA mask (use it in alim15x3, hpt366, siimage and serverworks drivers) * add ide_max_dma_mode() for finding best DMA mode for the device (loosely based on some older libata-core.c code) * convert ide_dma_speed() users to use ide_max_dma_mode() * make ide_rate_filter() take "ide_drive_t *drive" as an argument instead of "u8 mode" and teach it to how to use UDMA mask to do filtering * use ide_rate_filter() in hpt366 driver * remove no longer needed ide_dma_speed() and *_ratemask() * unexport eighty_ninty_three() v2: * rename ->filter_udma_mask to ->udma_filter [ Suggested by Sergei Shtylyov . 
] v3: * updated for scc_pata driver (fixes XFER_UDMA_6 filtering for user-space originated transfer mode change requests when 100MHz clock is used) Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index c9375c863584..23ab4dc05009 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -717,11 +717,8 @@ typedef struct hwif_s { int (*quirkproc)(ide_drive_t *); /* driver soft-power interface */ int (*busproc)(ide_drive_t *, int); -// /* host rate limiter */ -// u8 (*ratemask)(ide_drive_t *); -// /* device rate limiter */ -// u8 (*ratefilter)(ide_drive_t *, u8); #endif + u8 (*udma_filter)(ide_drive_t *); void (*ata_input_data)(ide_drive_t *, void *, u32); void (*ata_output_data)(ide_drive_t *, void *, u32); @@ -1279,6 +1276,7 @@ int ide_in_drive_list(struct hd_driveid *, const struct drive_list_entry *); int __ide_dma_bad_drive(ide_drive_t *); int __ide_dma_good_drive(ide_drive_t *); int ide_use_dma(ide_drive_t *); +u8 ide_max_dma_mode(ide_drive_t *); void ide_dma_off(ide_drive_t *); void ide_dma_verbose(ide_drive_t *); int ide_set_dma(ide_drive_t *); @@ -1305,6 +1303,7 @@ extern int __ide_dma_timeout(ide_drive_t *); #else static inline int ide_use_dma(ide_drive_t *drive) { return 0; } +static inline u8 ide_max_dma_mode(ide_drive_t *drive) { return 0; } static inline void ide_dma_off(ide_drive_t *drive) { ; } static inline void ide_dma_verbose(ide_drive_t *drive) { ; } static inline int ide_set_dma(ide_drive_t *drive) { return 1; } @@ -1349,8 +1348,7 @@ static inline void ide_set_hwifdata (ide_hwif_t * hwif, void *data) } /* ide-lib.c */ -extern u8 ide_dma_speed(ide_drive_t *drive, u8 mode); -extern u8 ide_rate_filter(u8 mode, u8 speed); +u8 ide_rate_filter(ide_drive_t *, u8); extern int ide_dma_enable(ide_drive_t *drive); extern char *ide_xfer_verbose(u8 xfer_rate); extern void ide_toggle_bounce(ide_drive_t *drive, int on); -- cgit v1.2.3 From 29e744d088e3555f4efbdf390f01088dd66993b6 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:09 +0200 Subject: ide: add ide_tune_dma() helper After reworking the code responsible for selecting the best DMA transfer mode it is now possible to add generic ide_tune_dma() helper. Convert some IDE PCI host drivers to use it (the ones left need more work). 
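In a converted host driver the DMA setup path then reduces to roughly this (hypothetical driver function; it assumes ide_tune_dma() returns non-zero once a DMA mode has been found and programmed):

/* Sketch: let the generic helper pick and set the best DMA mode. */
static int foo_config_drive_dma(ide_drive_t *drive)
{
	if (ide_tune_dma(drive))
		return 0;	/* DMA mode set, caller may enable DMA */

	return -1;		/* no usable DMA mode, fall back to PIO */
}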
Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 23ab4dc05009..d03fa2d5d75a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1277,6 +1277,7 @@ int __ide_dma_bad_drive(ide_drive_t *); int __ide_dma_good_drive(ide_drive_t *); int ide_use_dma(ide_drive_t *); u8 ide_max_dma_mode(ide_drive_t *); +int ide_tune_dma(ide_drive_t *); void ide_dma_off(ide_drive_t *); void ide_dma_verbose(ide_drive_t *); int ide_set_dma(ide_drive_t *); @@ -1304,6 +1305,7 @@ extern int __ide_dma_timeout(ide_drive_t *); #else static inline int ide_use_dma(ide_drive_t *drive) { return 0; } static inline u8 ide_max_dma_mode(ide_drive_t *drive) { return 0; } +static inline int ide_tune_dma(ide_drive_t *drive) { return 0; } static inline void ide_dma_off(ide_drive_t *drive) { ; } static inline void ide_dma_verbose(ide_drive_t *drive) { ; } static inline int ide_set_dma(ide_drive_t *drive) { return 1; } -- cgit v1.2.3 From ecfd80e4a514123070b4cfb674b817ba75055df2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:09 +0200 Subject: ide: make /proc/ide/ optional All important information/features should be already available through sysfs and ioctl interfaces. Add CONFIG_IDE_PROC_FS (CONFIG_SCSI_PROC_FS rip-off) config option, disabling it makes IDE driver ~5 kB smaller (on x86-32). While at it add CONFIG_PROC_FS=n versions of proc_ide_{create,destroy}() and remove no longer needed #ifdefs. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index d03fa2d5d75a..697c39dd66a1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -912,15 +912,15 @@ typedef struct { write_proc_t *write_proc; } ide_proc_entry_t; -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IDE_PROC_FS extern struct proc_dir_entry *proc_ide_root; -extern void proc_ide_create(void); -extern void proc_ide_destroy(void); -extern void create_proc_ide_interfaces(void); +void proc_ide_create(void); +void proc_ide_destroy(void); +void create_proc_ide_interfaces(void); void destroy_proc_ide_interface(ide_hwif_t *); -extern void ide_add_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *, void *); -extern void ide_remove_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *); +void ide_add_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *, void *); +void ide_remove_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *); read_proc_t proc_ide_read_capacity; read_proc_t proc_ide_read_geometry; @@ -944,6 +944,8 @@ void ide_pci_create_host_proc(const char *, get_info_t *); return len; \ } #else +static inline void proc_ide_create(void) { ; } +static inline void proc_ide_destroy(void) { ; } static inline void create_proc_ide_interfaces(void) { ; } static inline void destroy_proc_ide_interface(ide_hwif_t *hwif) { ; } #define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0; -- cgit v1.2.3 From 1497943ee692aa7519fa972d0e3a339649bf3a96 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:10 +0200 Subject: ide: split off ioctl handling from IDE settings (v2) * do write permission and min/max checks in ide_procset_t functions * ide-disk.c: drive->id is always available so cleanup "multcount" setting accordingly * ide-disk.c: "address" setting was incorrectly defined as 
type TYPE_INTA, fix it by using type TYPE_BYTE and updating ide_drive_t->adressing field, the bug didn't trigger because this IDE setting uses custom ->set function * ide.c: add set_ksettings() for handling HDIO_SET_KEEPSETTINGS ioctl * ide.c: add set_unmaskirq() for handling HDIO_SET_UNMASKINTR ioctl * handle ioctls directly in generic_ide_ioclt() and idedisk_ioctl() instead of using IDE settings to deal with them * remove no longer needed ide_find_setting_by_ioctl() and {read,write}_ioctl fields from ide_settings_t, also remove now unused TYPE_INTA handling v2: * add missing EXPORT_SYMBOL_GPL(ide_setting_sem) needed now for ide-disk Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 697c39dd66a1..591a0b55e31c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -601,16 +601,11 @@ typedef struct ide_drive_s { unsigned remap_0_to_1 : 1; /* 0=noremap, 1=remap 0->1 (for EZDrive) */ unsigned blocked : 1; /* 1=powermanagment told us not to do anything, so sleep nicely */ unsigned vdma : 1; /* 1=doing PIO over DMA 0=doing normal DMA */ - unsigned addressing; /* : 3; - * 0=28-bit - * 1=48-bit - * 2=48-bit doing 28-bit - * 3=64-bit - */ unsigned scsi : 1; /* 0=default, 1=ide-scsi emulation */ unsigned sleeping : 1; /* 1=sleeping & sleep field valid */ unsigned post_reset : 1; + u8 addressing; /* 0=28-bit, 1=48-bit, 2=48-bit doing 28-bit */ u8 quirk_list; /* considered quirky, set for a specific host */ u8 init_speed; /* transfer rate set at boot */ u8 current_speed; /* current transfer rate set */ @@ -870,9 +865,8 @@ typedef struct hwgroup_s { */ #define TYPE_INT 0 -#define TYPE_INTA 1 -#define TYPE_BYTE 2 -#define TYPE_SHORT 3 +#define TYPE_BYTE 1 +#define TYPE_SHORT 2 #define SETTING_READ (1 << 0) #define SETTING_WRITE (1 << 1) @@ -882,8 +876,6 @@ typedef int (ide_procset_t)(ide_drive_t *, int); typedef struct ide_settings_s { char *name; int rw; - int read_ioctl; - int write_ioctl; int data_type; int min; int max; @@ -896,7 +888,7 @@ typedef struct ide_settings_s { } ide_settings_t; extern struct semaphore ide_setting_sem; -extern int ide_add_setting(ide_drive_t *drive, const char *name, int rw, int read_ioctl, int write_ioctl, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set); +int ide_add_setting(ide_drive_t *, const char *, int, int, int, int, int, int, void *, ide_procset_t *set); extern ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name); extern int ide_read_setting(ide_drive_t *t, ide_settings_t *setting); extern int ide_write_setting(ide_drive_t *drive, ide_settings_t *setting, int val); -- cgit v1.2.3 From 7662d046df09e80680b77b68de896beab45e675e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:10 +0200 Subject: ide: move IDE settings handling to ide-proc.c * move __ide_add_setting() ide_add_setting() __ide_remove_setting() auto_remove_settings() ide_find_setting_by_name() ide_read_setting() ide_write_setting() set_xfer_rate() ide_add_generic_settings() ide_register_subdriver() ide_unregister_subdriver() from ide.c to ide-proc.c * set_{io_32bit,pio_mode,using_dma}() cannot be marked static now, fix it * rename ide_[un]register_subdriver() to ide_proc_[un]register_driver(), update device drivers to use new names * add CONFIG_IDE_PROC_FS=n versions of ide_proc_[un]register_driver() and 
ide_add_generic_settings() * make ide_find_setting_by_name(), ide_{read,write}_setting() and ide_{add,remove}_proc_entries() static * cover IDE settings code in device drivers with CONFIG_IDE_PROC_FS #ifdef, also while at it cover with CONFIG_IDE_PROC_FS #ifdef ide_driver_t.proc * remove bogus comment from ide.h * cover with CONFIG_IDE_PROC_FS #ifdef .proc and .settings in ide_drive_t Besides saner code this patch results in the IDE core smaller by ~2 kB (on x86-32) and IDE disk driver by ~1 kB (ditto) when CONFIG_IDE_PROC_FS=n. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 591a0b55e31c..477b8c6be727 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -559,9 +559,10 @@ typedef struct ide_drive_s { struct ide_drive_s *next; /* circular list of hwgroup drives */ void *driver_data; /* extra driver data */ struct hd_driveid *id; /* drive model identification info */ +#ifdef CONFIG_IDE_PROC_FS struct proc_dir_entry *proc; /* /proc/ide/ directory entry */ struct ide_settings_s *settings;/* /proc/ide/ drive settings */ - +#endif struct hwif_s *hwif; /* actually (ide_hwif_t *) */ unsigned long sleep; /* sleep until this time */ @@ -858,8 +859,15 @@ typedef struct hwgroup_s { unsigned char cmd_buf[4]; } ide_hwgroup_t; -/* structure attached to the request for IDE_TASK_CMDS */ +typedef struct ide_driver_s ide_driver_t; + +extern struct semaphore ide_setting_sem; +int set_io_32bit(ide_drive_t *, int); +int set_pio_mode(ide_drive_t *, int); +int set_using_dma(ide_drive_t *, int); + +#ifdef CONFIG_IDE_PROC_FS /* * configurable drive settings */ @@ -887,12 +895,7 @@ typedef struct ide_settings_s { struct ide_settings_s *next; } ide_settings_t; -extern struct semaphore ide_setting_sem; int ide_add_setting(ide_drive_t *, const char *, int, int, int, int, int, int, void *, ide_procset_t *set); -extern ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name); -extern int ide_read_setting(ide_drive_t *t, ide_settings_t *setting); -extern int ide_write_setting(ide_drive_t *drive, ide_settings_t *setting, int val); -extern void ide_add_generic_settings(ide_drive_t *drive); /* * /proc/ide interface @@ -904,15 +907,17 @@ typedef struct { write_proc_t *write_proc; } ide_proc_entry_t; -#ifdef CONFIG_IDE_PROC_FS extern struct proc_dir_entry *proc_ide_root; void proc_ide_create(void); void proc_ide_destroy(void); void create_proc_ide_interfaces(void); void destroy_proc_ide_interface(ide_hwif_t *); -void ide_add_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *, void *); -void ide_remove_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *); +void ide_proc_register_driver(ide_drive_t *, ide_driver_t *); +void ide_proc_unregister_driver(ide_drive_t *, ide_driver_t *); + +void ide_add_generic_settings(ide_drive_t *); + read_proc_t proc_ide_read_capacity; read_proc_t proc_ide_read_geometry; @@ -940,6 +945,9 @@ static inline void proc_ide_create(void) { ; } static inline void proc_ide_destroy(void) { ; } static inline void create_proc_ide_interfaces(void) { ; } static inline void destroy_proc_ide_interface(ide_hwif_t *hwif) { ; } +static inline void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver) { ; } +static inline void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) { ; } +static inline void ide_add_generic_settings(ide_drive_t *drive) { ; } 
#define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0; #endif @@ -982,7 +990,7 @@ enum { * The gendriver.owner field should be set to the module owner of this driver. * The gendriver.name field should be set to the name of this driver */ -typedef struct ide_driver_s { +struct ide_driver_s { const char *version; u8 media; unsigned supports_dsc_overlap : 1; @@ -990,12 +998,14 @@ typedef struct ide_driver_s { int (*end_request)(ide_drive_t *, int, int); ide_startstop_t (*error)(ide_drive_t *, struct request *rq, u8, u8); ide_startstop_t (*abort)(ide_drive_t *, struct request *rq); - ide_proc_entry_t *proc; struct device_driver gen_driver; int (*probe)(ide_drive_t *); void (*remove)(ide_drive_t *); void (*shutdown)(ide_drive_t *); -} ide_driver_t; +#ifdef CONFIG_IDE_PROC_FS + ide_proc_entry_t *proc; +#endif +}; #define to_ide_driver(drv) container_of(drv, ide_driver_t, gen_driver) @@ -1205,9 +1215,6 @@ extern void default_hwif_iops(ide_hwif_t *); extern void default_hwif_mmiops(ide_hwif_t *); extern void default_hwif_transport(ide_hwif_t *); -void ide_register_subdriver(ide_drive_t *, ide_driver_t *); -void ide_unregister_subdriver(ide_drive_t *, ide_driver_t *); - #define ON_BOARD 1 #define NEVER_BOARD 0 -- cgit v1.2.3 From 7f8f48af0861c38c28d4abd550102643e0ea9e6a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:10 +0200 Subject: ide: cable detection fixes (take 2) Tejun's recent eighty_ninty_three() fix has inspired me to do more thorough review of the cable detection code... * print user-friendly warning about limiting the maximum transfer speed to UDMA33 (and the reason behind it) when 80-wire cable is not detected, also while at it cleanup eighty_ninty_three() a bit * use eighty_ninty_three() in ide_ata66_check(), this actually fixes 3 bugs: - bit 14 (word 93 validity check) == 1 && bit 13 (80-wire cable test) == 1 were used as 80-wire cable present test for CONFIG_IDEDMA_IVB=n case (please see FIXME comment in eighty_ninty_three() for more details) - CONFIG_IDEDMA_IVB=y/n cases were interchanged - check for SATA devices was missing * remove private cable warnings from pdc_202xx{old,new} drivers now that core code provides this functionality (plus, in pdc202xx_new case the test could give false warnings for ATAPI devices because pdc202xx_new driver doesn't even support ATAPI DMA) Cc: Tejun Heo Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 477b8c6be727..ca924b295c2e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -605,6 +605,7 @@ typedef struct ide_drive_s { unsigned scsi : 1; /* 0=default, 1=ide-scsi emulation */ unsigned sleeping : 1; /* 1=sleeping & sleep field valid */ unsigned post_reset : 1; + unsigned udma33_warned : 1; u8 addressing; /* 0=28-bit, 1=48-bit, 2=48-bit doing 28-bit */ u8 quirk_list; /* considered quirky, set for a specific host */ -- cgit v1.2.3 From 869c56ee9de1b72cd3f8ab9cdfbd3601e55c61f2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:10 +0200 Subject: ide: add "initializing" argument to ide_register_hw() Add "initializing" argument to ide_register_hw() and use it instead of ide.c wide variable of the same name. Update all users of ide_register_hw() accordingly. 
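For a caller the update is mechanical; a registration outside the boot-time probe path would now read roughly like this (hypothetical call site):

hw_regs_t hw;
ide_hwif_t *hwif;

/* ...fill in hw as before... */

/* The new second argument is the "initializing" flag; 0 here because this
 * registration does not come from the initial IDE probe. */
ide_register_hw(&hw, 0, &hwif);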
Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index ca924b295c2e..bdb97655ef61 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -223,8 +223,9 @@ typedef struct hw_regs_s { /* * Register new hardware with ide */ -int ide_register_hw(hw_regs_t *hw, struct hwif_s **hwifp); -int ide_register_hw_with_fixup(hw_regs_t *, struct hwif_s **, void (*)(struct hwif_s *)); +int ide_register_hw(hw_regs_t *, int, struct hwif_s **); +int ide_register_hw_with_fixup(hw_regs_t *, int, struct hwif_s **, + void (*)(struct hwif_s *)); /* * Set up hw_regs_t structure before calling ide_register_hw (optional) -- cgit v1.2.3 From 5cbf79cdb37be2aa2a1b4fa94144526b14557060 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:11 +0200 Subject: ide: add ide_proc_register_port() * create_proc_ide_interfaces() tries to add /proc entries for every probed and initialized IDE port, replace it by ide_proc_register_port() which does it only for the given port (also rename destroy_proc_ide_interface() to ide_proc_unregister_port() for consistency) * convert {create,destroy}_proc_ide_interface[s]() users to use new functions * pmac driver depended on proc_ide_create() to add /proc port entries, fix it * au1xxx-ide, swarm and cs5520 drivers depended indirectly on ide-generic driver (CONFIG_IDE_GENERIC=y) to add port /proc entries, fix them * there is now no need to add /proc entries for IDE ports in proc_ide_create() so don't do it * proc_ide_create() needs now to be called before drivers are probed - fix it, while at it make proc_ide_create() create /proc "ide" directory Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index bdb97655ef61..52d482a16dd9 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -909,12 +909,10 @@ typedef struct { write_proc_t *write_proc; } ide_proc_entry_t; -extern struct proc_dir_entry *proc_ide_root; - void proc_ide_create(void); void proc_ide_destroy(void); -void create_proc_ide_interfaces(void); -void destroy_proc_ide_interface(ide_hwif_t *); +void ide_proc_register_port(ide_hwif_t *); +void ide_proc_unregister_port(ide_hwif_t *); void ide_proc_register_driver(ide_drive_t *, ide_driver_t *); void ide_proc_unregister_driver(ide_drive_t *, ide_driver_t *); @@ -945,8 +943,8 @@ void ide_pci_create_host_proc(const char *, get_info_t *); #else static inline void proc_ide_create(void) { ; } static inline void proc_ide_destroy(void) { ; } -static inline void create_proc_ide_interfaces(void) { ; } -static inline void destroy_proc_ide_interface(ide_hwif_t *hwif) { ; } +static inline void ide_proc_register_port(ide_hwif_t *hwif) { ; } +static inline void ide_proc_unregister_port(ide_hwif_t *hwif) { ; } static inline void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver) { ; } static inline void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) { ; } static inline void ide_add_generic_settings(ide_drive_t *drive) { ; } -- cgit v1.2.3 From 6d208b39c45edee5def6c201fcd51561c5a39828 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 10 May 2007 00:01:11 +0200 Subject: ide: legacy PCI bus order probing fixes IDE PCI host drivers should register themselves with IDE core only when IDE driver 
is built-in, otherwise (IDE driver is modular and thus IDE PCI host drivers are also modular) the code has no effect and just complicates the probing. Fix it by adding new config option CONFIG_IDEPCI_PCIBUS (defined only when needed and invisible to the user) and covering by #ifdef/#endif the code in question. It turned out that "ide=reverse" was silently accepted but did nothing in case when IDE driver was modular, this is fixed now. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 52d482a16dd9..df4e6a510310 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1205,9 +1205,14 @@ void ide_init_disk(struct gendisk *, ide_drive_t *); extern int ideprobe_init(void); +#ifdef CONFIG_IDEPCI_PCIBUS_ORDER extern void ide_scan_pcibus(int scan_direction) __init; extern int __ide_pci_register_driver(struct pci_driver *driver, struct module *owner, const char *mod_name); #define ide_pci_register_driver(d) __ide_pci_register_driver(d, THIS_MODULE, KBUILD_MODNAME) +#else +#define ide_pci_register_driver(d) pci_register_driver(d) +#endif + void ide_pci_setup_ports(struct pci_dev *, struct ide_pci_device_s *, int, ata_index_t *); extern void ide_setup_pci_noise (struct pci_dev *dev, struct ide_pci_device_s *d); -- cgit v1.2.3 From 44ce6294d07555c3d313757105fd44b78208407f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 May 2007 18:51:36 -0700 Subject: Revert "md: improve partition detection in md array" This reverts commit 5b479c91da90eef605f851508744bfe8269591a0. Quoth Neil Brown: "It causes an oops when auto-detecting raid arrays, and it doesn't seem easy to fix. The array may not be 'open' when do_md_run is called, so bdev->bd_disk might be NULL, so bd_set_size can oops. This whole approach of opening an md device before it has been assembled just seems to get more and more painful. I think I'm going to have to come up with something clever to provide both backward comparability with usage expectation, and sane integration into the rest of the kernel." Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index a121f36f4437..de72c49747c8 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -201,6 +201,7 @@ struct mddev_s struct mutex reconfig_mutex; atomic_t active; + int changed; /* true if we might need to reread partition info */ int degraded; /* whether md should consider * adding a spare */ -- cgit v1.2.3