From 857a2beb09ab83e9a8185821ae16db7dfbe8b837 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 14 Apr 2013 20:50:08 -0700
Subject: cgroup: implement task_cgroup_path_from_hierarchy()

kdbus folks want a sane way to determine the cgroup path that a given
task belongs to on a given hierarchy, which is a reasonable thing to
expect from cgroup core.

Implement task_cgroup_path_from_hierarchy().

v2: Dropped unnecessary NULL check on the return value of
    task_cgroup_from_root() as suggested by Li Zefan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <greg@kroah.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Kay Sievers <kay@vrfy.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <daniel@zonque.org>
---
 include/linux/cgroup.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5047355b9a0f..383c630f36f9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -542,6 +542,8 @@ int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
+int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
+				    char *buf, size_t buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
-- 
cgit v1.2.3


From 23958e729e7029678e746bf8f4094c8863a79c3d Mon Sep 17 00:00:00 2001
From: Greg KH <gregkh@linuxfoundation.org>
Date: Fri, 3 May 2013 16:26:59 -0700
Subject: cgroup.h: remove some functions that are now gone

cgroup_lock() and cgroup_unlock() are now no longer exported, so fix
cgroup.h to not declare them if CONFIG_CGROUPS is not enabled.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 383c630f36f9..4f6f5138c340 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -840,8 +840,6 @@ static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
-static inline void cgroup_lock(void) {}
-static inline void cgroup_unlock(void) {}
 static inline int cgroupstats_build(struct cgroupstats *stats,
 					struct dentry *dentry)
 {
-- 
cgit v1.2.3


From 9138125beabbb76b4a373d4a619870f6f5d86fc5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 May 2013 13:52:38 -0700
Subject: blk-throttle: implement proper hierarchy support

With the recent updates, blk-throttle is finally ready for proper
hierarchy support.  Dispatching now honors service_queue->parent_sq
and propagates correctly.  The only thing missing is setting
->parent_sq correctly so that throtl_grp hierarchy matches the cgroup
hierarchy.

This patch updates throtl_pd_init() such that service_queues form the
same hierarchy as the cgroup hierarchy if sane_behavior is enabled.
As this concludes proper hierarchy support for blkcg, the shameful
.broken_hierarchy tag is removed from blkio_subsys.

v2: Updated blkio-controller.txt as suggested by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5047355b9a0f..09f1a1408ae0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -272,6 +272,8 @@ enum {
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 *
+	 * - blkcg: blk-throttle becomes properly hierarchical.
+	 *
 	 * The followings are planned changes.
 	 *
 	 * - release_agent will be disallowed once replacement notification
-- 
cgit v1.2.3


From bdc7119f1bdd0632d42f435941dc290216a436e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:55:38 +0900
Subject: cgroup: make cgroup_is_removed() static

cgroup_is_removed() no longer has external users and it shouldn't grow
any - controllers should deal with cgroup_subsys_state on/offline
state instead of cgroup removal state.  Make it static.

While at it, make it return bool.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1df5f699be61..8d9f3c911fca 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -538,7 +538,6 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 
-int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
-- 
cgit v1.2.3


From 53fa5261747a90746531e8a1c81eeb78fedc2f71 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:55:38 +0900
Subject: cgroup: add cgroup->serial_nr and implement cgroup_next_sibling()

Currently, there's no easy way to find out the next sibling cgroup
unless it's known that the current cgroup is accessed from the
parent's children list in a single RCU critical section.  This in turn
forces all iterators to require whole iteration to be enclosed in a
single RCU critical section, which sometimes is too restrictive.  This
patch implements cgroup_next_sibling() which can reliably determine
the next sibling regardless of the state of the current cgroup as long
as it's accessible.

It currently is impossible to determine the next sibling after
dropping RCU read lock because the cgroup being iterated could be
removed anytime and if RCU read lock is dropped, nothing guarantees
its ->sibling.next pointer is accessible.  A removed cgroup would
continue to point to its next sibling for RCU accesses but stop
receiving updates from the sibling.  IOW, the next sibling could be
removed and then complete its grace period while RCU read lock is
dropped, making it unsafe to dereference ->sibling.next after dropping
and re-acquiring RCU read lock.

This can be solved by adding a way to traverse to the next sibling
without dereferencing ->sibling.next.  This patch adds a monotonically
increasing cgroup serial number, cgroup->serial_nr, which guarantees
that all cgroup->children lists are kept in increasing serial_nr
order.  A new function, cgroup_next_sibling(), is implemented, which,
if CGRP_REMOVED is not set on the current cgroup, follows
->sibling.next; otherwise, traverses the parent's ->children list
until it sees a sibling with higher ->serial_nr.

This allows the function to always return the next sibling regardless
of the state of the current cgroup without adding overhead in the fast
path.

Further patches will update the iterators to use cgroup_next_sibling()
so that they allow dropping RCU read lock and blocking while iteration
is in progress which in turn will be used to simplify controllers.

v2: Typo fix as per Serge.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
---
 include/linux/cgroup.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8d9f3c911fca..ee041a01a67e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -188,6 +188,14 @@ struct cgroup {
 	struct cgroup *parent;		/* my parent */
 	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
 
+	/*
+	 * Monotonically increasing unique serial number which defines a
+	 * uniform order among all cgroups.  It's guaranteed that all
+	 * ->children lists are in the ascending order of ->serial_nr.
+	 * It's used to allow interrupting and resuming iterations.
+	 */
+	u64 serial_nr;
+
 	/*
 	 * This is a copy of dentry->d_name, and it's needed because
 	 * we can't use dentry->d_name in cgroup_path().
@@ -675,6 +683,8 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
 	return task_subsys_state(task, subsys_id)->cgroup;
 }
 
+struct cgroup *cgroup_next_sibling(struct cgroup *pos);
+
 /**
  * cgroup_for_each_child - iterate through children of a cgroup
  * @pos: the cgroup * to use as the loop cursor
-- 
cgit v1.2.3


From 75501a6d59e989e5c286716e5b3b66ace4660e83 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:55:38 +0900
Subject: cgroup: update iterators to use cgroup_next_sibling()

This patch converts cgroup_for_each_child(),
cgroup_next_descendant_pre/post() and thus
cgroup_for_each_descendant_pre/post() to use cgroup_next_sibling()
instead of manually dereferencing ->sibling.next.

The only reason the iterators couldn't allow dropping RCU read lock
while iteration is in progress was because they couldn't determine the
next sibling safely once RCU read lock is dropped.  Using
cgroup_next_sibling() removes that problem and enables all iterators
to allow dropping RCU read lock in the middle.  Comments are updated
accordingly.

This makes the iterators easier to use and will simplify controllers.

Note that @cgroup argument is renamed to @cgrp in
cgroup_for_each_child() because it conflicts with "struct cgroup" used
in the new macro body.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
---
 include/linux/cgroup.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ee041a01a67e..d0ad3794b947 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -688,9 +688,9 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos);
 /**
  * cgroup_for_each_child - iterate through children of a cgroup
  * @pos: the cgroup * to use as the loop cursor
- * @cgroup: cgroup whose children to walk
+ * @cgrp: cgroup whose children to walk
  *
- * Walk @cgroup's children.  Must be called under rcu_read_lock().  A child
+ * Walk @cgrp's children.  Must be called under rcu_read_lock().  A child
  * cgroup which hasn't finished ->css_online() or already has finished
  * ->css_offline() may show up during traversal and it's each subsystem's
  * responsibility to verify that each @pos is alive.
@@ -698,9 +698,15 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos);
  * If a subsystem synchronizes against the parent in its ->css_online() and
  * before starting iterating, a cgroup which finished ->css_online() is
  * guaranteed to be visible in the future iterations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
-#define cgroup_for_each_child(pos, cgroup)				\
-	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
+#define cgroup_for_each_child(pos, cgrp)				\
+	for ((pos) = list_first_or_null_rcu(&(cgrp)->children,		\
+					    struct cgroup, sibling);	\
+	     (pos); (pos) = cgroup_next_sibling((pos)))
 
 struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 					  struct cgroup *cgroup);
@@ -759,6 +765,10 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  * Alternatively, a subsystem may choose to use a single global lock to
  * synchronize ->css_online() and ->css_offline() against tree-walking
  * operations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
 #define cgroup_for_each_descendant_pre(pos, cgroup)			\
 	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
-- 
cgit v1.2.3


From 5c5cc62321d9df7a9a608346fc649c4528380c8f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Sun, 9 Jun 2013 17:16:29 +0800
Subject: cpuset: allow to keep tasks in empty cpusets

To achieve this:

- We call update_tasks_cpumask/nodemask() for empty cpusets when
hotplug happens, instead of moving tasks out of them.

- When a cpuset's masks are changed by writing cpuset.cpus/mems,
we also update tasks in child cpusets which are empty.

v3:
- do propagation work in one place for both hotplug and unplug

v2:
- drop rcu_read_lock before calling update_task_nodemask() and
  update_task_cpumask(), instead of using workqueue.
- add documentation in include/linux/cgroup.h

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d0ad3794b947..53e81a61be57 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -277,6 +277,10 @@ enum {
 	 *
 	 * - Remount is disallowed.
 	 *
+	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
+	 *   and take masks of ancestors with non-empty cpus/mems, instead of
+	 *   being moved to an ancestor.
+	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 *
-- 
cgit v1.2.3


From 88fa523bff295f1d60244a54833480b02f775152 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Sun, 9 Jun 2013 17:16:46 +0800
Subject: cpuset: allow to move tasks to empty cpusets

Currently some cpuset behaviors are not friendly when cpuset is co-mounted
with other cgroup controllers.

Now with this patchset if cpuset is mounted with sane_behavior option,
it behaves differently:

- Tasks will be kept in empty cpusets when hotplug happens and take
  masks of ancestors with non-empty cpus/mems, instead of being moved to
  an ancestor.

- A task can be moved into an empty cpuset, and again it takes masks of
  ancestors, so the user can drop a task into a newly created cgroup without
  having to do anything for it.

As tasks can reside in empty cpusets, here are some rules:

- They can be moved to another cpuset, regardless of whether it's empty or not.

- Though it takes masks from ancestors, it takes other configs from the
  empty cpuset.

- If the ancestors' masks are changed, those tasks will also be updated
  to take new masks.

v2: add documentation in include/linux/cgroup.h

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 53e81a61be57..74e8b8e4cd7f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -281,6 +281,9 @@ enum {
 	 *   and take masks of ancestors with non-empty cpus/mems, instead of
 	 *   being moved to an ancestor.
 	 *
+	 * - cpuset: a task can be moved into an empty cpuset, and again it
+	 *   takes masks of ancestors.
+	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 *
-- 
cgit v1.2.3


From 3fc3db9a3ae0ce108badf31a4a00e41b4236f5fc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:48 -0700
Subject: cgroup: remove now unused css_depth()

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d0ad3794b947..5830592258dc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -848,7 +848,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
-unsigned short css_depth(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
-- 
cgit v1.2.3


From 69d0206c793a17431eacee2694ee7a4b25df76b7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:50 -0700
Subject: cgroup: bring some sanity to naming around cg_cgroup_link

cgroups and css_sets are mapped M:N and this M:N mapping is
represented by struct cg_cgroup_link which forms linked lists on both
sides.  The naming around this mapping is already confusing and struct
cg_cgroup_link exacerbates the situation quite a bit.

>From cgroup side, it starts off ->css_sets and runs through
->cgrp_link_list.  From css_set side, it starts off ->cg_links and
runs through ->cg_link_list.  This is rather reversed as
cgrp_link_list is used to iterate css_sets and cg_link_list cgroups.
Also, this is the only place which is still using the confusing "cg"
for css_sets.  This patch cleans it up a bit.

* s/cgroup->css_sets/cgroup->cset_links/
  s/css_set->cg_links/css_set->cgrp_links/
  s/cgroup_iter->cg_link/cgroup_iter->cset_link/

* s/cg_cgroup_link/cgrp_cset_link/

* s/cgrp_cset_link->cg/cgrp_cset_link->cset/
  s/cgrp_cset_link->cgrp_link_list/cgrp_cset_link->cset_link/
  s/cgrp_cset_link->cg_link_list/cgrp_cset_link->cgrp_link/

* s/init_css_set_link/init_cgrp_cset_link/
  s/free_cg_links/free_cgrp_cset_links/
  s/allocate_cg_links/allocate_cgrp_cset_links/

* s/cgl[12]/link[12]/ in compare_css_sets()

* s/saved_link/tmp_link/ s/tmp/tmp_links/ and a couple similar
  adjustments.

* Comment and whiteline adjustments.

After the changes, we have

	list_for_each_entry(link, &cont->cset_links, cset_link) {
		struct css_set *cset = link->cset;

instead of

	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
		struct css_set *cset = link->cg;

This patch is purely cosmetic.

v2: Fix broken sentences in the patch description.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5830592258dc..0e32855edc92 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -215,10 +215,10 @@ struct cgroup {
 	struct cgroupfs_root *root;
 
 	/*
-	 * List of cg_cgroup_links pointing at css_sets with
-	 * tasks in this cgroup. Protected by css_set_lock
+	 * List of cgrp_cset_links pointing at css_sets with tasks in this
+	 * cgroup.  Protected by css_set_lock.
 	 */
-	struct list_head css_sets;
+	struct list_head cset_links;
 
 	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
 	struct list_head cft_q_node;	/* used during cftype add/rm */
@@ -365,11 +365,10 @@ struct css_set {
 	struct list_head tasks;
 
 	/*
-	 * List of cg_cgroup_link objects on link chains from
-	 * cgroups referenced from this css_set. Protected by
-	 * css_set_lock
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
 	 */
-	struct list_head cg_links;
+	struct list_head cgrp_links;
 
 	/*
 	 * Set of subsystem states, one for each subsystem. This array
@@ -792,7 +791,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 
 /* A cgroup_iter should be treated as an opaque object */
 struct cgroup_iter {
-	struct list_head *cg_link;
+	struct list_head *cset_link;
 	struct list_head *task;
 };
 
-- 
cgit v1.2.3


From 5de0107e634ce862f16360139709d9d3a656463e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:52 -0700
Subject: cgroup: clean up css_[try]get() and css_put()

* __css_get() isn't used by anyone.  Fold it into css_get().

* Add proper function comments to all css reference functions.

This patch is purely cosmetic.

v2: Typo fix as per Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0e32855edc92..a494636a34da 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -94,33 +94,31 @@ enum {
 	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 };
 
-/* Caller must verify that the css is not for root cgroup */
-static inline void __css_get(struct cgroup_subsys_state *css, int count)
-{
-	atomic_add(count, &css->refcnt);
-}
-
-/*
- * Call css_get() to hold a reference on the css; it can be used
- * for a reference obtained via:
- * - an existing ref-counted reference to the css
- * - task->cgroups for a locked task
+/**
+ * css_get - obtain a reference on the specified css
+ * @css: target css
+ *
+ * The caller must already have a reference.
  */
-
 static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		__css_get(css, 1);
+		atomic_inc(&css->refcnt);
 }
 
-/*
- * Call css_tryget() to take a reference on a css if your existing
- * (known-valid) reference isn't already ref-counted. Returns false if
- * the css has been destroyed.
- */
-
 extern bool __css_tryget(struct cgroup_subsys_state *css);
+
+/**
+ * css_tryget - try to obtain a reference on the specified css
+ * @css: target css
+ *
+ * Obtain a reference on @css if it's alive.  The caller naturally needs to
+ * ensure that @css is accessible but doesn't have to be holding a
+ * reference on it - IOW, RCU protected access is good enough for this
+ * function.  Returns %true if a reference count was successfully obtained;
+ * %false otherwise.
+ */
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
@@ -128,12 +126,14 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 	return __css_tryget(css);
 }
 
-/*
- * css_put() should be called to release a reference taken by
- * css_get() or css_tryget()
- */
-
 extern void __css_put(struct cgroup_subsys_state *css);
+
+/**
+ * css_put - put a css reference
+ * @css: target css
+ *
+ * Put a reference obtained via css_get() and css_tryget().
+ */
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-- 
cgit v1.2.3


From 54766d4a1d3d6f84ff8fa475cd8f165c0a0000eb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:53 -0700
Subject: cgroup: rename CGRP_REMOVED to CGRP_DEAD

We will add another flag indicating that the cgroup is in the process
of being killed.  REMOVING / REMOVED is more difficult to distinguish
and cgroup_is_removing()/cgroup_is_removed() are a bit awkward.  Also,
later percpu_ref usage will involve "kill"ing the refcnt.

 s/CGRP_REMOVED/CGRP_DEAD/
 s/cgroup_is_removed()/cgroup_is_dead()

This patch is purely cosmetic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a494636a34da..c86a93abe83d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -143,7 +143,7 @@ static inline void css_put(struct cgroup_subsys_state *css)
 /* bits in struct cgroup flags field */
 enum {
 	/* Control Group is dead */
-	CGRP_REMOVED,
+	CGRP_DEAD,
 	/*
 	 * Control Group has previously had a child cgroup or a task,
 	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
-- 
cgit v1.2.3


From 6f3d828f0fb7fdaffc6f32cb8a1cb7fcf8824598 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:55 -0700
Subject: cgroup: remove cgroup->count and use

cgroup->count tracks the number of css_sets associated with the cgroup
and used only to verify that no css_set is associated when the cgroup
is being destroyed.  It's superfluous as the destruction path can
simply check whether cgroup->cset_links is empty instead.

Drop cgroup->count and check ->cset_links directly from
cgroup_destroy_locked().

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c86a93abe83d..81bfd0268e93 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -169,12 +169,6 @@ struct cgroup_name {
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
-	/*
-	 * count users of this cgroup. >0 means busy, but doesn't
-	 * necessarily indicate the number of tasks in the cgroup
-	 */
-	atomic_t count;
-
 	int id;				/* ida allocated in-hierarchy ID */
 
 	/*
-- 
cgit v1.2.3


From ea15f8ccdb430af1e8bc9b4e19a230eb4c356777 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Jun 2013 19:27:42 -0700
Subject: cgroup: split cgroup destruction into two steps

Split cgroup_destroy_locked() into two steps and put the latter half
into cgroup_offline_fn() which is executed from a work item.  The
latter half is responsible for offlining the css's, removing the
cgroup from internal lists, and propagating release notification to
the parent.  The separation is to allow using percpu refcnt for css.

Note that this allows for other cgroup operations to happen between
the first and second halves of destruction, including creating a new
cgroup with the same name.  As the target cgroup is marked DEAD in the
first half and cgroup internals don't care about the names of cgroups,
this should be fine.  A comment explaining this will be added by the
next patch which implements the actual percpu refcnting.

As RCU freeing is guaranteed to happen after the second step of
destruction, we can use the same work item for both.  This patch
renames cgroup->free_work to ->destroy_work and uses it for both
purposes.  INIT_WORK() is now performed right before queueing the work
item.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 81bfd0268e93..e345d8b90046 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -233,7 +233,7 @@ struct cgroup {
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
-	struct work_struct free_work;
+	struct work_struct destroy_work;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
-- 
cgit v1.2.3


From d3daf28da16a30af95bfb303189a634a87606725 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Jun 2013 19:39:16 -0700
Subject: cgroup: use percpu refcnt for cgroup_subsys_states

A css (cgroup_subsys_state) is how each cgroup is represented to a
controller.  As such, it can be used in hot paths across the various
subsystems different controllers are associated with.

One of the common operations is reference counting, which up until now
has been implemented using a global atomic counter and can have
significant adverse impact on scalability.  For example, css refcnt
can be gotten and put multiple times by blkcg for each IO request.
For highops configurations which try to do as much per-cpu as
possible, the global frequent refcnting can be very expensive.

In general, given the various and hugely diverse paths css's end up
being used from, we need to make it cheap and highly scalable.  In its
usage, css refcnting isn't very different from module refcnting.

This patch converts css refcnting to use the recently added
percpu_ref.  css_get/tryget/put() directly maps to the matching
percpu_ref operations and the deactivation logic is no longer
necessary as percpu_ref already has refcnt killing.

The only complication is that as the refcnt is per-cpu,
percpu_ref_kill() in itself doesn't ensure that further tryget
operations will fail, which we need to guarantee before invoking
->css_offline()'s.  This is resolved by collecting kill confirmation
using percpu_ref_kill_and_confirm() and initiating the offline phase
of destruction after all css refcnt's are confirmed to be seen as
killed on all CPUs.  The previous patches already split destruction
into two phases, so percpu_ref_kill_and_confirm() can be hooked up
easily.

This patch removes css_refcnt() which is used for rcu dereference
sanity check in css_id().  While we can add a percpu refcnt API to ask
the same question, css_id() itself is scheduled to be removed fairly
soon, so let's not bother with it.  Just drop the sanity check and use
rcu_dereference_raw() instead.

v2: - init_cgroup_css() was calling percpu_ref_init() without checking
      the return value.  This causes two problems - the obvious lack
      of error handling and percpu_ref_init() being called from
      cgroup_init_subsys() before the allocators are up, which
      triggers warnings but doesn't cause actual problems as the
      refcnt isn't used for roots anyway.  Fix both by moving
      percpu_ref_init() to cgroup_create().

    - The base references were put too early by
      percpu_ref_kill_and_confirm() and cgroup_offline_fn() put the
      refs one extra time.  This wasn't noticeable because css's go
      through another RCU grace period before being freed.  Update
      cgroup_destroy_locked() to grab an extra reference before
      killing the refcnts.  This problem was noticed by Kent.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Kent Overstreet <koverstreet@google.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Alasdair G. Kergon" <agk@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Glauber Costa <glommer@gmail.com>
---
 include/linux/cgroup.h | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e345d8b90046..b7bd4beae294 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
 #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
 	 */
 	struct cgroup *cgroup;
 
-	/*
-	 * State maintained by the cgroup system to allow subsystems
-	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and css_put().
-	 */
-
-	atomic_t refcnt;
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
 
 	unsigned long flags;
 	/* ID for this css, if possible */
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		atomic_inc(&css->refcnt);
+		percpu_ref_get(&css->refcnt);
 }
 
-extern bool __css_tryget(struct cgroup_subsys_state *css);
-
 /**
  * css_tryget - try to obtain a reference on the specified css
  * @css: target css
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
 		return true;
-	return __css_tryget(css);
+	return percpu_ref_tryget(&css->refcnt);
 }
 
-extern void __css_put(struct cgroup_subsys_state *css);
-
 /**
  * css_put - put a css reference
  * @css: target css
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-		__css_put(css);
+		percpu_ref_put(&css->refcnt);
 }
 
 /* bits in struct cgroup flags field */
@@ -231,9 +223,10 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;
 
-	/* For RCU-protected deletion */
+	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
+	atomic_t css_kill_cnt;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
-- 
cgit v1.2.3


From f63674fd0d6afa1ba24309aee1f8c60195d39041 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Jun 2013 19:58:38 -0700
Subject: cgroup: update sane_behavior documentation

f12dc02014 ("cgroup: mark "tasks" cgroup file as insane") and
cc5943a781 ("cgroup: mark "notify_on_release" and "release_agent"
cgroup files insane") forgot to update the changed behavior
documentation in cgroup.h.  Update it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b7bd4beae294..17604767adfd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -264,13 +264,14 @@ enum {
 	 *
 	 * - Remount is disallowed.
 	 *
-	 * - memcg: use_hierarchy is on by default and the cgroup file for
-	 *   the flag is not created.
+	 * - "tasks" is removed.  Everything should be at process
+	 *   granularity.  Use "cgroup.procs" instead.
 	 *
-	 * The followings are planned changes.
+	 * - "release_agent" and "notify_on_release" are removed.
+	 *   Replacement notification mechanism will be implemented.
 	 *
-	 * - release_agent will be disallowed once replacement notification
-	 *   mechanism is implemented.
+	 * - memcg: use_hierarchy is on by default and the cgroup file for
+	 *   the flag is not created.
 	 */
 	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0),
 
-- 
cgit v1.2.3


From 6db8e85c5c1f89cd0183b76dab027c81009f129f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Jun 2013 11:18:22 -0700
Subject: cgroup: disallow rename(2) if sane_behavior

cgroup's rename(2) isn't a proper migration implementation - it can't
move the cgroup to a different parent in the hierarchy.  All it can do
is swap the name string for that cgroup.  This isn't useful and
can mislead users to think that cgroup supports proper cgroup-level
migration.  Disallow rename(2) if sane_behavior.

v2: Fail with -EPERM instead of -EINVAL so that it matches the vfs
    return value when ->rename is not implemented as suggested by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 17604767adfd..f97522790682 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -270,6 +270,8 @@ enum {
 	 * - "release_agent" and "notify_on_release" are removed.
 	 *   Replacement notification mechanism will be implemented.
 	 *
+	 * - rename(2) is disallowed.
+	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 */
-- 
cgit v1.2.3


From e8c82d20a9f729cf4b9f73043f7fd4e0872bebfd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 18 Jun 2013 18:48:37 +0800
Subject: cgroup: convert cgroup_cfts_commit() to use
 cgroup_for_each_descendant_pre()

We used root->allcg_list to iterate cgroup hierarchy because at that time
cgroup_for_each_descendant_pre() hadn't been invented.

tj: In cgroup_cfts_commit(), s/@serial_nr/@update_upto/, move the
    assignment right above releasing cgroup_mutex and explain what's
    going on there.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f97522790682..b28365890646 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -206,9 +206,6 @@ struct cgroup {
 	 */
 	struct list_head cset_links;
 
-	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
-	struct list_head cft_q_node;	/* used during cftype add/rm */
-
 	/*
 	 * Linked list running through all cgroups that can
 	 * potentially be reaped by the release agent. Protected by
@@ -313,9 +310,6 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
-	/* All cgroups on this root, cgroup_mutex protected */
-	struct list_head allcg_list;
-
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
-- 
cgit v1.2.3


From 03c78cbebb323fc97295ff97dc5e009d56371d57 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 14 Jun 2013 11:17:19 +0800
Subject: cgroup: rename cont to cgrp

Cont is short for container.  Control groups were named process containers
at first, but then people found that container already has a meaning in
the Linux kernel.

Clean up the leftover variable name @cont.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b28365890646..6c2ba52fc5d4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,13 +433,13 @@ struct cftype {
 	 * entry. The key/value pairs (and their ordering) should not
 	 * change between reboots.
 	 */
-	int (*read_map)(struct cgroup *cont, struct cftype *cft,
+	int (*read_map)(struct cgroup *cgrp, struct cftype *cft,
 			struct cgroup_map_cb *cb);
 	/*
 	 * read_seq_string() is used for outputting a simple sequence
 	 * using seqfile.
 	 */
-	int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
+	int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft,
 			       struct seq_file *m);
 
 	ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
-- 
cgit v1.2.3


From 02c402d98588bdfd3bebd267db574e13afdef722 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 24 Jun 2013 15:21:47 -0700
Subject: cgroup: convert CFTYPE_* flags to enums

Purely cosmetic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6c2ba52fc5d4..ab27001a2c4a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -385,9 +385,11 @@ struct cgroup_map_cb {
  */
 
 /* cftype->flags */
-#define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
-#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
-#define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
+enum {
+	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cg */
+	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cg */
+	CFTYPE_INSANE		= (1 << 2),	/* don't create if sane_behavior */
+};
 
 #define MAX_CFTYPE_NAME		64
 
-- 
cgit v1.2.3


From a8a648c4acee2095262f7fa65b0d8a68a03c32e4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 24 Jun 2013 15:21:47 -0700
Subject: cgroup: remove cgroup->actual_subsys_mask

cgroup curiously has two subsystem masks, ->subsys_mask and
->actual_subsys_mask.  The latter only exists because the new target
subsys_mask is passed into rebind_subsystems() via @root->subsys_mask.
rebind_subsystems() needs to know what the current mask is to decide
how to reach the target mask so ->actual_subsys_mask is used as the
temp location to remember the current state.

Adding a temporary field to a permanent data structure is rather silly
and can be misleading.  Update rebind_subsystems() to take @added_mask
and @removed_mask instead and remove @root->actual_subsys_mask.

This patch shouldn't introduce any behavior changes.

v2: Comment and description updated as suggested by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ab27001a2c4a..4c1eceb8c439 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -286,18 +286,12 @@ enum {
 struct cgroupfs_root {
 	struct super_block *sb;
 
-	/*
-	 * The bitmask of subsystems intended to be attached to this
-	 * hierarchy
-	 */
+	/* The bitmask of subsystems attached to this hierarchy */
 	unsigned long subsys_mask;
 
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
-	/* The bitmask of subsystems currently attached to this hierarchy */
-	unsigned long actual_subsys_mask;
-
 	/* A list running through the attached subsystems */
 	struct list_head subsys_list;
 
-- 
cgit v1.2.3


From 1672d040709b789671c0502e7aac9d632c2f9175 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jun 2013 18:04:54 -0700
Subject: cgroup: fix cgroupfs_root early destruction path

cgroupfs_root used to have ->actual_subsys_mask in addition to
->subsys_mask.  a8a648c4ac ("cgroup: remove
cgroup->actual_subsys_mask") removed it noting that the subsys_mask is
essentially temporary and doesn't belong in cgroupfs_root; however,
the patch made it impossible to tell whether a cgroupfs_root actually
has the subsystems bound or just has the bits set, leading to the
following BUG when trying to mount with subsystems which are already
mounted elsewhere.

 kernel BUG at kernel/cgroup.c:1038!
 invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
 ...
 CPU: 1 PID: 7973 Comm: mount Tainted: G        W    3.10.0-rc7-next-20130625-sasha-00011-g1c1dc0e #1105
 task: ffff880fc0ae8000 ti: ffff880fc0b9a000 task.ti: ffff880fc0b9a000
 RIP: 0010:[<ffffffff81249b29>]  [<ffffffff81249b29>] rebind_subsystems+0x409/0x5f0
 ...
 Call Trace:
  [<ffffffff8124bd4f>] cgroup_kill_sb+0xff/0x210
  [<ffffffff813d21af>] deactivate_locked_super+0x4f/0x90
  [<ffffffff8124f3b3>] cgroup_mount+0x673/0x6e0
  [<ffffffff81257169>] cpuset_mount+0xd9/0x110
  [<ffffffff813d2580>] mount_fs+0xb0/0x2d0
  [<ffffffff81404afd>] vfs_kern_mount+0xbd/0x180
  [<ffffffff814070b5>] do_new_mount+0x145/0x2c0
  [<ffffffff814085d6>] do_mount+0x356/0x3c0
  [<ffffffff8140873d>] SyS_mount+0xfd/0x140
  [<ffffffff854eb600>] tracesys+0xdd/0xe2

We still want rebind_subsystems() to take added/removed masks, so
let's fix it by marking whether a cgroupfs_root has finished binding
or not.  Also, document what's going on around ->subsys_mask
initialization so that similar mistakes aren't repeated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4c1eceb8c439..8e4fd5e67384 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -276,6 +276,7 @@ enum {
 
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+	CGRP_ROOT_SUBSYS_BOUND	= (1 << 3), /* subsystems finished binding */
 };
 
 /*
-- 
cgit v1.2.3


From 14611e51a57df10240817d8ada510842faf0ec51 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jun 2013 11:48:32 -0700
Subject: cgroup: fix RCU accesses to task->cgroups

task->cgroups is a RCU pointer pointing to struct css_set.  A task
switches to a different css_set on cgroup migration but a css_set
doesn't change once created and its pointers to cgroup_subsys_states
aren't RCU protected.

task_subsys_state[_check]() is the macro to acquire css given a task
and subsys_id pair.  It RCU-dereferences task->cgroups->subsys[] not
task->cgroups, so the RCU pointer task->cgroups ends up being
dereferenced without read_barrier_depends() after it.  It's broken.

Fix it by introducing task_css_set[_check]() which does
RCU-dereference on task->cgroups.  task_subsys_state[_check]() is
reimplemented to directly dereference ->subsys[] of the css_set
returned from task_css_set[_check]().

This removes some of the sparse RCU warnings in cgroup.

v2: Fixed unbalanced parenthesis and there's no need to use
    rcu_dereference_raw() when !CONFIG_PROVE_RCU.  Both spotted by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: stable@vger.kernel.org
---
 include/linux/cgroup.h | 58 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8e4fd5e67384..ad3555bc21f4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -635,22 +635,60 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
 	return cgrp->subsys[subsys_id];
 }
 
-/*
- * function to get the cgroup_subsys_state which allows for extra
- * rcu_dereference_check() conditions, such as locks used during the
- * cgroup_subsys::attach() methods.
+/**
+ * task_css_set_check - obtain a task's css_set with extra access conditions
+ * @task: the task to obtain css_set for
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * A task's css_set is RCU protected, initialized and exited while holding
+ * task_lock(), and can only be modified while holding both cgroup_mutex
+ * and task_lock() while the task is alive.  This macro verifies that the
+ * caller is inside proper critical section and returns @task's css_set.
+ *
+ * The caller can also specify additional allowed conditions via @__c, such
+ * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
-#define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference_check((task)->cgroups->subsys[(subsys_id)],	\
-			      lockdep_is_held(&(task)->alloc_lock) ||	\
-			      lockdep_is_held(&cgroup_mutex) || (__c))
+#define task_css_set_check(task, __c)					\
+	rcu_dereference_check((task)->cgroups,				\
+		lockdep_is_held(&(task)->alloc_lock) ||			\
+		lockdep_is_held(&cgroup_mutex) || (__c))
 #else
-#define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference((task)->cgroups->subsys[(subsys_id)])
+#define task_css_set_check(task, __c)					\
+	rcu_dereference((task)->cgroups)
 #endif
 
+/**
+ * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
+ * synchronization rules are the same as task_css_set_check().
+ */
+#define task_subsys_state_check(task, subsys_id, __c)			\
+	task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
+/**
+ * task_css_set - obtain a task's css_set
+ * @task: the task to obtain css_set for
+ *
+ * See task_css_set_check().
+ */
+static inline struct css_set *task_css_set(struct task_struct *task)
+{
+	return task_css_set_check(task, false);
+}
+
+/**
+ * task_subsys_state - obtain css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * See task_subsys_state_check().
+ */
 static inline struct cgroup_subsys_state *
 task_subsys_state(struct task_struct *task, int subsys_id)
 {
-- 
cgit v1.2.3


From 0ce6cba35777cf96a54ce0d5856dc962566b8717 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2013 19:37:26 -0700
Subject: cgroup: CGRP_ROOT_SUBSYS_BOUND should be ignored when comparing mount
 options

1672d04070 ("cgroup: fix cgroupfs_root early destruction path")
introduced CGRP_ROOT_SUBSYS_BOUND which is used to mark completion of
subsys binding on a new root; however, this broke remounts.
cgroup_remount() doesn't allow changing root options via remount and
CGRP_ROOT_SUBSYS_BOUND, which is set on all fully initialized roots,
makes the function reject all remounts.

Fix it by putting the options part in the lower 16 bits of root->flags
and masking the comparisons.  While at it, make cgroup_remount() emit
an error message explaining why it's rejecting a remount request, so
that it's less of a mystery.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ad3555bc21f4..8db53974f7b5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -276,7 +276,11 @@ enum {
 
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
-	CGRP_ROOT_SUBSYS_BOUND	= (1 << 3), /* subsystems finished binding */
+
+	/* mount options live below bit 16 */
+	CGRP_ROOT_OPTION_MASK	= (1 << 16) - 1,
+
+	CGRP_ROOT_SUBSYS_BOUND	= (1 << 16), /* subsystems finished binding */
 };
 
 /*
-- 
cgit v1.2.3


From 913ffdb54366f94eec65c656cae8c6e00e1ab1b0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 11 Jul 2013 16:34:48 -0700
Subject: cgroup: replace task_cgroup_path_from_hierarchy() with
 task_cgroup_path()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

task_cgroup_path_from_hierarchy() was added for the planned new users
and none of the currently planned users wants to know about multiple
hierarchies.  This patch drops the multiple hierarchy part and makes
it always return the path in the first non-dummy hierarchy.

As unified hierarchy will always have id 1, this is guaranteed to
return the path for the unified hierarchy if mounted; otherwise, it
will return the path from the hierarchy which happens to occupy the
lowest hierarchy id, which will usually be the first hierarchy mounted
after boot.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kaluža <jkaluza@redhat.com>
---
 include/linux/cgroup.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index fd097ecfcd97..21cfaff7e002 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -540,8 +540,7 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
-int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
-				    char *buf, size_t buflen);
+int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
-- 
cgit v1.2.3