From 3ed80a62bf959d34ebd4d553b026fbe7e6fbcc54 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 8 Feb 2014 10:36:58 -0500
Subject: cgroup: drop module support

With module supported dropped from net_prio, no controller is using
cgroup module support.  None of actual resource controllers can be
built as a module and we aren't gonna add new controllers which don't
control resources.  This patch drops module support from cgroup.

* cgroup_[un]load_subsys() and cgroup_subsys->module removed.

* As there's no point in distinguishing IS_BUILTIN() and IS_MODULE(),
  cgroup_subsys.h now uses IS_ENABLED() directly.

* enum cgroup_subsys_id now exactly matches the list of enabled
  controllers as ordered in cgroup_subsys.h.

* cgroup_subsys[] is now a contiguously occupied array.  Size
  specification is no longer necessary and dropped.

* for_each_builtin_subsys() is removed and for_each_subsys() is
  updated to not require any locking.

* module ref handling is removed from rebind_subsystems().

* Module related comments dropped.

v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs
    classid handling into core").

v3: Added {} around the if (need_forkexit_callback) block in
    cgroup_post_fork() for readability as suggested by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 29 ++++-------------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5c097596104b..d842a737d448 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -37,28 +37,13 @@ extern void cgroup_post_fork(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
-extern int cgroup_load_subsys(struct cgroup_subsys *ss);
-extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
 
 extern int proc_cgroup_show(struct seq_file *, void *);
 
-/*
- * Define the enumeration of all cgroup subsystems.
- *
- * We define ids for builtin subsystems and then modular ones.
- */
+/* define the enumeration of all cgroup subsystems */
 #define SUBSYS(_x) _x ## _subsys_id,
 enum cgroup_subsys_id {
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
 #include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
-	CGROUP_BUILTIN_SUBSYS_COUNT,
-
-	__CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1,
-
-#define IS_SUBSYS_ENABLED(option) IS_MODULE(option)
-#include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
 	CGROUP_SUBSYS_COUNT,
 };
 #undef SUBSYS
@@ -370,10 +355,9 @@ struct css_set {
 	struct list_head cgrp_links;
 
 	/*
-	 * Set of subsystem states, one for each subsystem. This array
-	 * is immutable after creation apart from the init_css_set
-	 * during subsystem registration (at boot time) and modular subsystem
-	 * loading/unloading.
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
 	 */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
@@ -620,15 +604,10 @@ struct cgroup_subsys {
 	/* base cftypes, automatically [de]registered with subsys itself */
 	struct cftype *base_cftypes;
 	struct cftype_set base_cftset;
-
-	/* should be defined only by modular subsystems */
-	struct module *module;
 };
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
 #include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
 #undef SUBSYS
 
 /**
-- 
cgit v1.2.3


From 073219e995b4a3f8cf1ce8228b7ef440b6994ac0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 8 Feb 2014 10:36:58 -0500
Subject: cgroup: clean up cgroup_subsys names and initialization

cgroup_subsys is a bit messier than it needs to be.

* The name of a subsys can be different from its internal identifier
  defined in cgroup_subsys.h.  Most subsystems use the matching name
  but three - cpu, memory and perf_event - use different ones.

* cgroup_subsys_id enums are postfixed with _subsys_id and each
  cgroup_subsys is postfixed with _subsys.  cgroup.h is widely
  included throughout various subsystems, it doesn't and shouldn't
  have claim on such generic names which don't have any qualifier
  indicating that they belong to cgroup.

* cgroup_subsys->subsys_id should always equal the matching
  cgroup_subsys_id enum; however, we require each controller to
  initialize it and then BUG if they don't match, which is a bit
  silly.

This patch cleans up cgroup_subsys names and initialization by doing
the followings.

* cgroup_subsys_id enums are now postfixed with _cgrp_id, and each
  cgroup_subsys with _cgrp_subsys.

* With the above, renaming subsys identifiers to match the userland
  visible names doesn't cause any naming conflicts.  All non-matching
  identifiers are renamed to match the official names.

  cpu_cgroup -> cpu
  mem_cgroup -> memory
  perf -> perf_event

* controllers no longer need to initialize ->subsys_id and ->name.
  They're generated in cgroup core and set automatically during boot.

* Redundant cgroup_subsys declarations removed.

* While updating BUG_ON()s in cgroup_init_early(), convert them to
  WARN()s.  BUGging that early during boot is stupid - the kernel
  can't print anything, even through serial console and the trap
  handler doesn't even link stack frame properly for back-tracing.

This patch doesn't introduce any behavior changes.

v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs
    classid handling into core").

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: "David S. Miller" <davem@davemloft.net>
Acked-by: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Aristeu Rozanski <aris@redhat.com>
Acked-by: Ingo Molnar <mingo@redhat.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Thomas Graf <tgraf@suug.ch>
---
 include/linux/cgroup.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d842a737d448..cd6611e622fd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -41,7 +41,7 @@ extern int cgroupstats_build(struct cgroupstats *stats,
 extern int proc_cgroup_show(struct seq_file *, void *);
 
 /* define the enumeration of all cgroup subsystems */
-#define SUBSYS(_x) _x ## _subsys_id,
+#define SUBSYS(_x) _x ## _cgrp_id,
 enum cgroup_subsys_id {
 #include <linux/cgroup_subsys.h>
 	CGROUP_SUBSYS_COUNT,
@@ -573,7 +573,6 @@ struct cgroup_subsys {
 		     struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
-	int subsys_id;
 	int disabled;
 	int early_init;
 
@@ -592,6 +591,8 @@ struct cgroup_subsys {
 	bool broken_hierarchy;
 	bool warned_broken_hierarchy;
 
+	/* the following two fields are initialized automtically during boot */
+	int subsys_id;
 #define MAX_CGROUP_TYPE_NAMELEN 32
 	const char *name;
 
@@ -606,7 +607,7 @@ struct cgroup_subsys {
 	struct cftype_set base_cftset;
 };
 
-#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
+#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
 #undef SUBSYS
 
-- 
cgit v1.2.3


From aec25020f5d4b69aea5317551d1cb7043f6b04fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 8 Feb 2014 10:36:58 -0500
Subject: cgroup: rename cgroup_subsys->subsys_id to ->id

It's no longer referenced outside cgroup core, so renaming is easy.
Let's rename it for consistency & brevity.

This patch is pure rename.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index cd6611e622fd..198c7fcd727e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -548,7 +548,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
 	     (task) = cgroup_taskset_next((tset)))			\
 		if (!(skip_css) ||					\
 		    cgroup_taskset_cur_css((tset),			\
-			(skip_css)->ss->subsys_id) != (skip_css))
+			(skip_css)->ss->id) != (skip_css))
 
 /*
  * Control Group subsystem type.
@@ -592,7 +592,7 @@ struct cgroup_subsys {
 	bool warned_broken_hierarchy;
 
 	/* the following two fields are initialized automtically during boot */
-	int subsys_id;
+	int id;
 #define MAX_CGROUP_TYPE_NAMELEN 32
 	const char *name;
 
-- 
cgit v1.2.3


From 5a17f543ed6808e9085063277fe46795dea484bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Feb 2014 11:52:47 -0500
Subject: cgroup: improve css_from_dir() into css_tryget_from_dir()

css_from_dir() returns the matching css (cgroup_subsys_state) given a
dentry and subsystem.  The function doesn't pin the css before
returning and requires the caller to be holding RCU read lock or
cgroup_mutex and handling pinning on the caller side.

Given that users of the function are likely to want to pin the
returned css (both existing users do) and that getting and putting
css's are very cheap, there's no reason for the interface to be tricky
like this.

Rename css_from_dir() to css_tryget_from_dir() and make it try to pin
the found css and return it only if pinning succeeded.  The callers
are updated so that they no longer do RCU locking and pinning around
the function and just use the returned css.

This will also ease converting cgroup to kernfs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c86ba7ff7a7e..1ba4fc08f776 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -825,8 +825,8 @@ int css_scan_tasks(struct cgroup_subsys_state *css,
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
 
-struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
-					 struct cgroup_subsys *ss);
+struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
+						struct cgroup_subsys *ss);
 
 #else /* !CONFIG_CGROUPS */
 
-- 
cgit v1.2.3


From de00ffa56ea3132c6013fc8f07133b8a1014cf53 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Feb 2014 11:52:48 -0500
Subject: cgroup: make cgroup_subsys->base_cftypes use cgroup_add_cftypes()

Currently, cgroup_subsys->base_cftypes registration is different from
dynamic cftypes registartion.  Instead of going through
cgroup_add_cftypes(), cgroup_init_subsys() invokes
cgroup_init_cftsets() which makes use of cgroup_subsys->base_cftset
which doesn't involve dynamic allocation.

While avoiding dynamic allocation is somewhat nice, having two
separate paths for cftypes registration is nasty, especially as we're
planning to add more operations during cftypes registration.

This patch drops cgroup_init_cftsets() and cgroup_subsys->base_cftset
and registers base_cftypes using cgroup_add_cftypes().  This is done
as a separate step in cgroup_init() instead of a part of
cgroup_init_subsys().  This is because cgroup_init_subsys() can be
called very early during boot when kmalloc() isn't available yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1ba4fc08f776..c4350a82320b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -604,9 +604,8 @@ struct cgroup_subsys {
 	/* list of cftype_sets */
 	struct list_head cftsets;
 
-	/* base cftypes, automatically [de]registered with subsys itself */
+	/* base cftypes, automatically registered with subsys itself */
 	struct cftype *base_cftypes;
-	struct cftype_set base_cftset;
 };
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
-- 
cgit v1.2.3


From 5f46990787e2721b4db190ddc8af6fdbe8f010d7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Feb 2014 11:52:48 -0500
Subject: cgroup: update the meaning of cftype->max_write_len

cftype->max_write_len is used to extend the maximum size of writes.
It's interpreted in such a way that the actual maximum size is one
less than the specified value.  The default size is defined by
CGROUP_LOCAL_BUFFER_SIZE.  Its interpretation is quite confusing - its
value is decremented by 1 and then compared for equality with max
size, which means that the actual default size is
CGROUP_LOCAL_BUFFER_SIZE - 2, which is 62 chars.

There's no point in having a limit that low.  Update its definition so
that it means the actual string length sans termination and anything
below PAGE_SIZE-1 is treated as PAGE_SIZE-1.

.max_write_len for "release_agent" is updated to PATH_MAX-1 and
cgroup_release_agent_write() is updated so that the redundant strlen()
check is removed and it uses strlcpy() instead of strcpy().
.max_write_len initializations in blk-throttle.c and cfq-iosched.c are
no longer necessary and removed.  The one in cpuset is kept unchanged
as it's an approximated value to begin with.

This will also make transition to kernfs smoother.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c4350a82320b..63feaed83bc6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -400,8 +400,9 @@ struct cftype {
 	umode_t mode;
 
 	/*
-	 * If non-zero, defines the maximum length of string that can
-	 * be passed to write_string; defaults to 64
+	 * The maximum length of string, excluding trailing nul, that can
+	 * be passed to write_string.  If < PAGE_SIZE-1, PAGE_SIZE-1 is
+	 * assumed.
 	 */
 	size_t max_write_len;
 
-- 
cgit v1.2.3


From b1664924062393bb048203bd4622e0b1c9e1d328 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Feb 2014 11:52:49 -0500
Subject: cgroup: introduce cgroup_ino()

mm/memory-failure.c::hwpoison_filter_task() has been reaching into
cgroup to extract the associated ino to be used as a filtering
criterion.  This is an implementation detail which shouldn't be
depended upon from outside cgroup proper and is about to change with
the scheduled kernfs conversion.

This patch introduces a proper interface to determine the associated
ino, cgroup_ino(), and updates hwpoison_filter_task() to use it
instead of reaching directly into cgroup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/cgroup.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 63feaed83bc6..06f577764414 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -507,6 +507,15 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
 	return rcu_dereference(cgrp->name)->name;
 }
 
+/* returns ino associated with a cgroup, 0 indicates unmounted root */
+static inline ino_t cgroup_ino(struct cgroup *cgrp)
+{
+	if (cgrp->dentry)
+		return cgrp->dentry->d_inode->i_ino;
+	else
+		return 0;
+}
+
 static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
 {
 	struct cgroup_open_file *of = seq->private;
-- 
cgit v1.2.3


From 59f5296b51b86718dd6eecf0a268b2f1a1ec0a2d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Feb 2014 11:52:49 -0500
Subject: cgroup: misc preps for kernfs conversion

* Un-inline seq_css().  After kernfs conversion, the function will
  need to dereference internal data structures.

* Add cgroup_get/put_root() and replace direct super_block->s_active
  manipulatinos with them.  These will be converted to kernfs_root
  refcnting.

* Add cgroup_get/put() and replace dget/put() on cgrp->dentry with
  them.  These will be converted to kernfs refcnting.

* Update current_css_set_cg_links_read() to use cgroup_name() instead
  of reaching into the dentry name.  The end result is the same.

These changes don't make functional differences but will make
transition to kernfs easier.

v2: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to
    cgroup_idr with cgroup_mutex").

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 06f577764414..523277871913 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -516,18 +516,14 @@ static inline ino_t cgroup_ino(struct cgroup *cgrp)
 		return 0;
 }
 
-static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
-{
-	struct cgroup_open_file *of = seq->private;
-	return of->cfe->css;
-}
-
 static inline struct cftype *seq_cft(struct seq_file *seq)
 {
 	struct cgroup_open_file *of = seq->private;
 	return of->cfe->type;
 }
 
+struct cgroup_subsys_state *seq_css(struct seq_file *seq);
+
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cftype *cfts);
 
-- 
cgit v1.2.3


From 2bd59d48ebfb3df41ee56938946ca0dd30887312 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Feb 2014 11:52:49 -0500
Subject: cgroup: convert to kernfs

cgroup filesystem code was derived from the original sysfs
implementation which was heavily intertwined with vfs objects and
locking with the goal of re-using the existing vfs infrastructure.
That experiment turned out rather disastrous and sysfs switched, a
long time ago, to distributed filesystem model where a separate
representation is maintained which is queried by vfs.  Unfortunately,
cgroup stuck with the failed experiment all these years and
accumulated even more problems over time.

Locking and object lifetime management being entangled with vfs is
probably the most egregious.  vfs is never designed to be misused like
this and cgroup ends up jumping through various convoluted dancing to
make things work.  Even then, operations across multiple cgroups can't
be done safely as it'll deadlock with rename locking.

Recently, kernfs is separated out from sysfs so that it can be used by
users other than sysfs.  This patch converts cgroup to use kernfs,
which will bring the following benefits.

* Separation from vfs internals.  Locking and object lifetime
  management is contained in cgroup proper making things a lot
  simpler.  This removes significant amount of locking convolutions,
  hairy object lifetime rules and the restriction on multi-cgroup
  operations.

* Can drop a lot of code to implement filesystem interface as most are
  provided by kernfs.

* Proper "severing" semantics, which allows controllers to not worry
  about lingering file accesses after offline.

While the preceding patches did as much as possible to make the
transition less painful, large part of the conversion has to be one
discrete step making this patch rather large.  The rest of the commit
message lists notable changes in different areas.

Overall
-------

* vfs constructs replaced with kernfs ones.  cgroup->dentry w/ ->kn,
  cgroupfs_root->sb w/ ->kf_root.

* All dentry accessors are removed.  Helpers to map from kernfs
  constructs are added.

* All vfs plumbing around dentry, inode and bdi removed.

* cgroup_mount() now directly looks for matching root and then
  proceeds to create a new one if not found.

Synchronization and object lifetime
-----------------------------------

* vfs inode locking removed.  Among other things, this removes the
  need for the convolution in cgroup_cfts_commit().  Future patches
  will further simplify it.

* vfs refcnting replaced with cgroup internal ones.  cgroup->refcnt,
  cgroupfs_root->refcnt added.  cgroup_put_root() now directly puts
  root->refcnt and when it reaches zero proceeds to destroy it thus
  merging cgroup_put_root() and the former cgroup_kill_sb().
  Simliarly, cgroup_put() now directly schedules cgroup_free_rcu()
  when refcnt reaches zero.

* Unlike before, kernfs objects don't hold onto cgroup objects.  When
  cgroup destroys a kernfs node, all existing operations are drained
  and the association is broken immediately.  The same for
  cgroupfs_roots and mounts.

* All operations which come through kernfs guarantee that the
  associated cgroup is and stays valid for the duration of operation;
  however, there are two paths which need to find out the associated
  cgroup from dentry without going through kernfs -
  css_tryget_from_dir() and cgroupstats_build().  For these two,
  kernfs_node->priv is RCU managed so that they can dereference it
  under RCU read lock.

File and directory handling
---------------------------

* File and directory operations converted to kernfs_ops and
  kernfs_syscall_ops.

* xattrs is implicitly supported by kernfs.  No need to worry about it
  from cgroup.  This means that "xattr" mount option is no longer
  necessary.  A future patch will add a deprecated warning message
  when sane_behavior.

* When cftype->max_write_len > PAGE_SIZE, it's necessary to make a
  private copy of one of the kernfs_ops to set its atomic_write_len.
  cftype->kf_ops is added and cgroup_init/exit_cftypes() are updated
  to handle it.

* cftype->lockdep_key added so that kernfs lockdep annotation can be
  per cftype.

* Inidividual file entries and open states are now managed by kernfs.
  No need to worry about them from cgroup.  cfent, cgroup_open_file
  and their friends are removed.

* kernfs_nodes are created deactivated and kernfs_activate()
  invocations added to places where creation of new nodes are
  committed.

* cgroup_rmdir() uses kernfs_[un]break_active_protection() for
  self-removal.

v2: - Li pointed out in an earlier patch that specifying "name="
      during mount without subsystem specification should succeed if
      there's an existing hierarchy with a matching name although it
      should fail with -EINVAL if a new hierarchy should be created.
      Prior to the conversion, this used by handled by deferring
      failure from NULL return from cgroup_root_from_opts(), which was
      necessary because root was being created before checking for
      existing ones.  Note that cgroup_root_from_opts() returned an
      ERR_PTR() value for error conditions which require immediate
      mount failure.

      As we now have separate search and creation steps, deferring
      failure from cgroup_root_from_opts() is no longer necessary.
      cgroup_root_from_opts() is updated to always return ERR_PTR()
      value on failure.

    - The logic to match existing roots is updated so that a mount
      attempt with a matching name but different subsys_mask are
      rejected.  This was handled by a separate matching loop under
      the comment "Check for name clashes with existing mounts" but
      got lost during conversion.  Merge the check into the main
      search loop.

    - Add __rcu __force casting in RCU_INIT_POINTER() in
      cgroup_destroy_locked() to avoid the sparse address space
      warning reported by kbuild test bot.  Maybe we want an explicit
      interface to use kn->priv as RCU protected pointer?

v3: Make CONFIG_CGROUPS select CONFIG_KERNFS.

v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to
    cgroup_idr with cgroup_mutex").

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: kbuild test robot fengguang.wu@intel.com>
---
 include/linux/cgroup.h | 52 +++++++++++++++++++-------------------------------
 1 file changed, 20 insertions(+), 32 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 523277871913..0e45a932b823 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -18,10 +18,10 @@
 #include <linux/rwsem.h>
 #include <linux/idr.h>
 #include <linux/workqueue.h>
-#include <linux/xattr.h>
 #include <linux/fs.h>
 #include <linux/percpu-refcount.h>
 #include <linux/seq_file.h>
+#include <linux/kernfs.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -159,16 +159,17 @@ struct cgroup {
 	/* the number of attached css's */
 	int nr_css;
 
+	atomic_t refcnt;
+
 	/*
 	 * We link our 'sibling' struct into our parent's 'children'.
 	 * Our children link their 'sibling' into our 'children'.
 	 */
 	struct list_head sibling;	/* my parent's children */
 	struct list_head children;	/* my children */
-	struct list_head files;		/* my files */
 
 	struct cgroup *parent;		/* my parent */
-	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
+	struct kernfs_node *kn;		/* cgroup kernfs entry */
 
 	/*
 	 * Monotonically increasing unique serial number which defines a
@@ -222,9 +223,6 @@ struct cgroup {
 	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
-
-	/* directory xattrs */
-	struct simple_xattrs xattrs;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -291,15 +289,17 @@ enum {
 
 /*
  * A cgroupfs_root represents the root of a cgroup hierarchy, and may be
- * associated with a superblock to form an active hierarchy.  This is
+ * associated with a kernfs_root to form an active hierarchy.  This is
  * internal to cgroup core.  Don't access directly from controllers.
  */
 struct cgroupfs_root {
-	struct super_block *sb;
+	struct kernfs_root *kf_root;
 
 	/* The bitmask of subsystems attached to this hierarchy */
 	unsigned long subsys_mask;
 
+	atomic_t refcnt;
+
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
@@ -415,6 +415,9 @@ struct cftype {
 	 */
 	struct cgroup_subsys *ss;
 
+	/* kernfs_ops to use, initialized automatically during registration */
+	struct kernfs_ops *kf_ops;
+
 	/*
 	 * read_u64() is a shortcut for the common case of returning a
 	 * single integer. Use it in place of read()
@@ -460,6 +463,10 @@ struct cftype {
 	 * kick type for multiplexing.
 	 */
 	int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key	lockdep_key;
+#endif
 };
 
 /*
@@ -472,26 +479,6 @@ struct cftype_set {
 	struct cftype			*cfts;
 };
 
-/*
- * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.  Don't
- * access directly.
- */
-struct cfent {
-	struct list_head		node;
-	struct dentry			*dentry;
-	struct cftype			*type;
-	struct cgroup_subsys_state	*css;
-
-	/* file xattrs */
-	struct simple_xattrs		xattrs;
-};
-
-/* seq_file->private points to the following, only ->priv is public */
-struct cgroup_open_file {
-	struct cfent			*cfe;
-	void				*priv;
-};
-
 /*
  * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details.  This
  * function can be called as long as @cgrp is accessible.
@@ -510,16 +497,17 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
 /* returns ino associated with a cgroup, 0 indicates unmounted root */
 static inline ino_t cgroup_ino(struct cgroup *cgrp)
 {
-	if (cgrp->dentry)
-		return cgrp->dentry->d_inode->i_ino;
+	if (cgrp->kn)
+		return cgrp->kn->ino;
 	else
 		return 0;
 }
 
 static inline struct cftype *seq_cft(struct seq_file *seq)
 {
-	struct cgroup_open_file *of = seq->private;
-	return of->cfe->type;
+	struct kernfs_open_file *of = seq->private;
+
+	return of->kn->priv;
 }
 
 struct cgroup_subsys_state *seq_css(struct seq_file *seq);
-- 
cgit v1.2.3


From 86bf4b68759141459864ebd36ac3038a9cda895b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Feb 2014 09:29:48 -0500
Subject: cgroup: warn if "xattr" is specified with "sane_behavior"

Mount option "xattr" is no longer necessary as it's enabled by default
on kernfs.  Warn if "xattr" is specified with "sane_behavior" so that
the option can be removed in the future.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0e45a932b823..305e94ee17f5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -264,6 +264,8 @@ enum {
 	 * - "release_agent" and "notify_on_release" are removed.
 	 *   Replacement notification mechanism will be implemented.
 	 *
+	 * - "xattr" mount option is deprecated.  kernfs always enables it.
+	 *
 	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
 	 *   and take masks of ancestors with non-empty cpus/mems, instead of
 	 *   being moved to an ancestor.
-- 
cgit v1.2.3


From 0adb070426dde2fd0b84e7f4f5cefcd8f0b24410 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Feb 2014 09:29:48 -0500
Subject: cgroup: remove cftype_set

cftype_set was added primarily to allow registering the same cftype
array more than once for different subsystems.  Nobody uses or needs
such thing and it's already broken because each cftype has ->ss
pointer which is initialized during registration.

Let's add list_head ->node to cftype and use the first cftype entry in
the array to link them instead of allocating separate cftype_set.
While at it, trigger WARN if cft seems previously initialized during
registration.

This simplifies cftype handling a bit.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 305e94ee17f5..b42251a23129 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -412,12 +412,11 @@ struct cftype {
 	unsigned int flags;
 
 	/*
-	 * The subsys this file belongs to.  Initialized automatically
-	 * during registration.  NULL for cgroup core files.
+	 * Fields used for internal bookkeeping.  Initialized automatically
+	 * during registration.
 	 */
-	struct cgroup_subsys *ss;
-
-	/* kernfs_ops to use, initialized automatically during registration */
+	struct cgroup_subsys *ss;	/* NULL for cgroup core files */
+	struct list_head node;		/* anchored at ss->cfts */
 	struct kernfs_ops *kf_ops;
 
 	/*
@@ -471,16 +470,6 @@ struct cftype {
 #endif
 };
 
-/*
- * cftype_sets describe cftypes belonging to a subsystem and are chained at
- * cgroup_subsys->cftsets.  Each cftset points to an array of cftypes
- * terminated by zero length name.
- */
-struct cftype_set {
-	struct list_head		node;	/* chained at subsys->cftsets */
-	struct cftype			*cfts;
-};
-
 /*
  * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details.  This
  * function can be called as long as @cgrp is accessible.
@@ -597,8 +586,11 @@ struct cgroup_subsys {
 	/* link to parent, protected by cgroup_lock() */
 	struct cgroupfs_root *root;
 
-	/* list of cftype_sets */
-	struct list_head cftsets;
+	/*
+	 * List of cftypes.  Each entry is the first entry of an array
+	 * terminated by zero length name.
+	 */
+	struct list_head cfts;
 
 	/* base cftypes, automatically registered with subsys itself */
 	struct cftype *base_cftypes;
-- 
cgit v1.2.3


From e61734c55c24cdf11b07e52a74aec4dc4a7f4bd0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Feb 2014 09:29:50 -0500
Subject: cgroup: remove cgroup->name

cgroup->name handling became quite complicated over time involving
dedicated struct cgroup_name for RCU protection.  Now that cgroup is
on kernfs, we can drop all of it and simply use kernfs_name/path() and
friends.  Replace cgroup->name and all related code with kernfs
name/path constructs.

* Reimplement cgroup_name() and cgroup_path() as thin wrappers on top
  of kernfs counterparts, which involves semantic changes.
  pr_cont_cgroup_name() and pr_cont_cgroup_path() added.

* cgroup->name handling dropped from cgroup_rename().

* All users of cgroup_name/path() updated to the new semantics.  Users
  which were formatting the string just to printk them are converted
  to use pr_cont_cgroup_name/path() instead, which simplifies things
  quite a bit.  As cgroup_name() no longer requires RCU read lock
  around it, RCU lockings which were protecting only cgroup_name() are
  removed.

v2: Comment above oom_info_lock updated as suggested by Michal.

v3: dummy_top doesn't have a kn associated and
    pr_cont_cgroup_name/path() ended up calling the matching kernfs
    functions with NULL kn leading to oops.  Test for NULL kn and
    print "/" if so.  This issue was reported by Fengguang Wu.

v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to
    cgroup_idr with cgroup_mutex").

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/cgroup.h | 63 ++++++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b42251a23129..4d6ff7d40cf6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,11 +138,6 @@ enum {
 	CGRP_SANE_BEHAVIOR,
 };
 
-struct cgroup_name {
-	struct rcu_head rcu_head;
-	char name[];
-};
-
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
@@ -179,19 +174,6 @@ struct cgroup {
 	 */
 	u64 serial_nr;
 
-	/*
-	 * This is a copy of dentry->d_name, and it's needed because
-	 * we can't use dentry->d_name in cgroup_path().
-	 *
-	 * You must acquire rcu_read_lock() to access cgrp->name, and
-	 * the only place that can change it is rename(), which is
-	 * protected by parent dir's i_mutex.
-	 *
-	 * Normally you should use cgroup_name() wrapper rather than
-	 * access it directly.
-	 */
-	struct cgroup_name __rcu *name;
-
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
 
@@ -479,12 +461,6 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
 	return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
 }
 
-/* Caller should hold rcu_read_lock() */
-static inline const char *cgroup_name(const struct cgroup *cgrp)
-{
-	return rcu_dereference(cgrp->name)->name;
-}
-
 /* returns ino associated with a cgroup, 0 indicates unmounted root */
 static inline ino_t cgroup_ino(struct cgroup *cgrp)
 {
@@ -503,14 +479,47 @@ static inline struct cftype *seq_cft(struct seq_file *seq)
 
 struct cgroup_subsys_state *seq_css(struct seq_file *seq);
 
+/*
+ * Name / path handling functions.  All are thin wrappers around the kernfs
+ * counterparts and can be called under any context.
+ */
+
+static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
+{
+	return kernfs_name(cgrp->kn, buf, buflen);
+}
+
+static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
+					      size_t buflen)
+{
+	return kernfs_path(cgrp->kn, buf, buflen);
+}
+
+static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
+{
+	/* dummy_top doesn't have a kn associated */
+	if (cgrp->kn)
+		pr_cont_kernfs_name(cgrp->kn);
+	else
+		pr_cont("/");
+}
+
+static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
+{
+	/* dummy_top doesn't have a kn associated */
+	if (cgrp->kn)
+		pr_cont_kernfs_path(cgrp->kn);
+	else
+		pr_cont("/");
+}
+
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cftype *cfts);
 
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-
 int cgroup_task_count(const struct cgroup *cgrp);
 
 /*
-- 
cgit v1.2.3


From 3c9c825b8b50de7dbb015e6bfc04bb2da79364d9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Feb 2014 09:29:50 -0500
Subject: cgroup: rename cgroupfs_root->number_of_cgroups to ->nr_cgrps and
 make it atomic_t

root->number_of_cgroups is currently an integer protected with
cgroup_mutex.  Except for sanity checks and proc reporting, the only
place it's used is to check whether the root has any child during
remount; however, this is a bit flawed as the counter is not
decremented when the cgroup is unlinked but when it's released,
meaning that there could be an extended period where all cgroups are
removed but remount is still not allowed because some internal objects
are lingering.  While not perfect either, it'd be better to use
emptiness test on root->top_cgroup.children.

This patch updates cgroup_remount() to test top_cgroup's children
instead, which makes number_of_cgroups only actual usage statistics
printing in proc implemented in proc_cgroupstats_show().  Let's
shorten its name and make it an atomic_t so that we don't have to
worry about its synchronization.  It's purely auxiliary at this point.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4d6ff7d40cf6..f0e6105bbbc1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -290,8 +290,8 @@ struct cgroupfs_root {
 	/* The root cgroup for this hierarchy */
 	struct cgroup top_cgroup;
 
-	/* Tracks how many cgroups are currently defined in hierarchy.*/
-	int number_of_cgroups;
+	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
+	atomic_t nr_cgrps;
 
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
-- 
cgit v1.2.3


From 776f02fa4e1ad70557c0318c70ce928e0642bee0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Feb 2014 09:29:50 -0500
Subject: cgroup: remove cgroupfs_root->refcnt

Currently, cgroupfs_root and its ->top_cgroup are separated reference
counted and the latter's is ignored.  There's no reason to do this
separately.  This patch removes cgroupfs_root->refcnt and destroys
cgroupfs_root when the top_cgroup is released.

* cgroup_put() updated to ignore cgroup_is_dead() test for top
  cgroups.  cgroup_free_fn() updated to handle root destruction when
  releasing a top cgroup.

* As root destruction is now bounced through cgroup destruction, it is
  asynchronous.  Update cgroup_mount() so that it waits for pending
  release which is currently implemented using msleep().  Converting
  this to proper wait_queue isn't hard but likely unnecessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f0e6105bbbc1..298d616e8f40 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -282,12 +282,10 @@ struct cgroupfs_root {
 	/* The bitmask of subsystems attached to this hierarchy */
 	unsigned long subsys_mask;
 
-	atomic_t refcnt;
-
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
-	/* The root cgroup for this hierarchy */
+	/* The root cgroup.  Root is destroyed on its release. */
 	struct cgroup top_cgroup;
 
 	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
-- 
cgit v1.2.3


From d3ba07c3aa9ae3e03329b0a7f1a067c0647aa2af Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Feb 2014 06:58:38 -0500
Subject: cgroup: disallow xattr, release_agent and name if sane_behavior

Disallow more mount options if sane_behavior.  Note that xattr used to
generate warning.

While at it, simplify option check in cgroup_mount() and update
sane_behavior comment in cgroup.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 298d616e8f40..5f669ca0ee36 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -227,8 +227,8 @@ enum {
 	 *
 	 * The followings are the behaviors currently affected this flag.
 	 *
-	 * - Mount options "noprefix" and "clone_children" are disallowed.
-	 *   Also, cgroupfs file cgroup.clone_children is not created.
+	 * - Mount options "noprefix", "xattr", "clone_children",
+	 *   "release_agent" and "name" are disallowed.
 	 *
 	 * - When mounting an existing superblock, mount options should
 	 *   match.
@@ -246,7 +246,7 @@ enum {
 	 * - "release_agent" and "notify_on_release" are removed.
 	 *   Replacement notification mechanism will be implemented.
 	 *
-	 * - "xattr" mount option is deprecated.  kernfs always enables it.
+	 * - "cgroup.clone_children" is removed.
 	 *
 	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
 	 *   and take masks of ancestors with non-empty cpus/mems, instead of
-- 
cgit v1.2.3


From 35585573055f37837eb752ee22eb5523682ca742 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Feb 2014 06:58:38 -0500
Subject: cgroup: drop CGRP_ROOT_SUBSYS_BOUND

Before kernfs conversion, due to the way super_block lookup works,
cgroup roots were created and made visible before being fully
initialized.  This in turn required a special flag to mark that the
root hasn't been fully initialized so that the destruction path can
tell fully bound ones from half initialized.

That flag is CGRP_ROOT_SUBSYS_BOUND and no longer necessary after the
kernfs conversion as the lookup and creation of new root are atomic
w.r.t. cgroup_mutex.  This patch removes the flag and passes the
requests subsystem mask to cgroup_setup_root() so that it can set the
respective mask bits as subsystems are bound.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5f669ca0ee36..e2ffcdc26cb7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -267,8 +267,6 @@ enum {
 
 	/* mount options live below bit 16 */
 	CGRP_ROOT_OPTION_MASK	= (1 << 16) - 1,
-
-	CGRP_ROOT_SUBSYS_BOUND	= (1 << 16), /* subsystems finished binding */
 };
 
 /*
-- 
cgit v1.2.3


From 07bc356ed2950048d33d667e933e1b913c6e6b6d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Feb 2014 06:58:39 -0500
Subject: cgroup: implement cgroup_has_tasks() and unexport cgroup_task_count()

cgroup_task_count() read-locks css_set_lock and walks all tasks to
count them and then returns the result.  The only thing all the users
want is determining whether the cgroup is empty or not.  This patch
implements cgroup_has_tasks() which tests whether cgroup->cset_links
is empty, replaces all cgroup_task_count() usages and unexports it.

Note that the test isn't synchronized.  This is the same as before.
The test has always been racy.

This will help planned css_set locking update.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/cgroup.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e2ffcdc26cb7..72154fb44fb5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -457,6 +457,12 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
 	return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
 }
 
+/* no synchronization, the result can only be used as a hint */
+static inline bool cgroup_has_tasks(struct cgroup *cgrp)
+{
+	return !list_empty(&cgrp->cset_links);
+}
+
 /* returns ino associated with a cgroup, 0 indicates unmounted root */
 static inline ino_t cgroup_ino(struct cgroup *cgrp)
 {
@@ -516,8 +522,6 @@ int cgroup_rm_cftypes(struct cftype *cfts);
 
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
-int cgroup_task_count(const struct cgroup *cgrp);
-
 /*
  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
  * methods.
-- 
cgit v1.2.3


From 889ed9ceaa97bb02bf5d7349e24639f7fc5f4fa0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Feb 2014 06:58:40 -0500
Subject: cgroup: remove css_scan_tasks()

css_scan_tasks() doesn't have any user left.  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 72154fb44fb5..3bd0a7138371 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,7 +14,6 @@
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/cgroupstats.h>
-#include <linux/prio_heap.h>
 #include <linux/rwsem.h>
 #include <linux/idr.h>
 #include <linux/workqueue.h>
@@ -813,11 +812,6 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 struct task_struct *css_task_iter_next(struct css_task_iter *it);
 void css_task_iter_end(struct css_task_iter *it);
 
-int css_scan_tasks(struct cgroup_subsys_state *css,
-		   bool (*test)(struct task_struct *, void *),
-		   void (*process)(struct task_struct *, void *),
-		   void *data, struct ptr_heap *heap);
-
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
 
-- 
cgit v1.2.3


From 924f0d9a2078f49ff331bb43196ec5afadc16b8f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Feb 2014 06:58:41 -0500
Subject: cgroup: drop @skip_css from cgroup_taskset_for_each()

If !NULL, @skip_css makes cgroup_taskset_for_each() skip the matching
css.  The intention of the interface is to make it easy to skip css's
(cgroup_subsys_states) which already match the migration target;
however, this is entirely unnecessary as migration taskset doesn't
include tasks which are already in the target cgroup.  Drop @skip_css
from cgroup_taskset_for_each().

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Daniel Borkmann <dborkman@redhat.com>
---
 include/linux/cgroup.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3bd0a7138371..581a124c7bc8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -535,15 +535,11 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
 /**
  * cgroup_taskset_for_each - iterate cgroup_taskset
  * @task: the loop cursor
- * @skip_css: skip if task's css matches this, %NULL to iterate through all
  * @tset: taskset to iterate
  */
-#define cgroup_taskset_for_each(task, skip_css, tset)			\
+#define cgroup_taskset_for_each(task, tset)				\
 	for ((task) = cgroup_taskset_first((tset)); (task);		\
-	     (task) = cgroup_taskset_next((tset)))			\
-		if (!(skip_css) ||					\
-		    cgroup_taskset_cur_css((tset),			\
-			(skip_css)->ss->id) != (skip_css))
+	     (task) = cgroup_taskset_next((tset)))
 
 /*
  * Control Group subsystem type.
-- 
cgit v1.2.3


From bc668c7519ff8b4681af80e92f463bec7bf7cf9e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Feb 2014 06:58:43 -0500
Subject: cgroup: remove cgroup_taskset_cur_css() and cgroup_taskset_size()

The two functions don't have any users left.  Remove them along with
cgroup_taskset->cur_cgrp.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 581a124c7bc8..ef0b3af0e61c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -528,9 +528,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 struct cgroup_taskset;
 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
-struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
-						   int subsys_id);
-int cgroup_taskset_size(struct cgroup_taskset *tset);
 
 /**
  * cgroup_taskset_for_each - iterate cgroup_taskset
-- 
cgit v1.2.3


From cc045e3952175e84c38dad22dea14465b9fc8fb5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 14 Feb 2014 16:56:04 +0800
Subject: cgroup: deal with dummp_top in cgroup_name() and cgroup_path()

My kernel fails to boot, because blkcg calls cgroup_path() while
cgroupfs is not mounted.

Fix both cgroup_name() and cgroup_path().

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ef0b3af0e61c..8c283a910b91 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -487,13 +487,21 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq);
 
 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
 {
-	return kernfs_name(cgrp->kn, buf, buflen);
+	/* dummy_top doesn't have a kn associated */
+	if (cgrp->kn)
+		return kernfs_name(cgrp->kn, buf, buflen);
+	else
+		return strlcpy(buf, "/", buflen);
 }
 
 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
 					      size_t buflen)
 {
-	return kernfs_path(cgrp->kn, buf, buflen);
+	/* dummy_top doesn't have a kn associated */
+	if (cgrp->kn)
+		return kernfs_path(cgrp->kn, buf, buflen);
+	strlcpy(buf, "/", buflen);
+	return (buflen <= 2) ? NULL : buf;
 }
 
 static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
-- 
cgit v1.2.3


From c75611282cf1bf717c1866e7a7eb4d0743815187 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Feb 2014 10:04:01 -0500
Subject: cgroup: add css_set->mg_tasks

Currently, while migrating tasks from one cgroup to another,
cgroup_attach_task() builds a flex array of all target tasks;
unfortunately, this has a couple issues.

* Flex array has size limit.  On 64bit, struct task_and_cgroup is
  24bytes making the flex element limit around 87k.  It is a high
  number but not impossible to hit.  This means that the current
  cgroup implementation can't migrate a process with more than 87k
  threads.

* Process migration involves memory allocation whose size is dependent
  on the number of threads the process has.  This means that cgroup
  core can't guarantee success or failure of multi-process migrations
  as memory allocation failure can happen in the middle.  This is in
  part because cgroup can't grab threadgroup locks of multiple
  processes at the same time, so when there are multiple processes to
  migrate, it is imposible to tell how many tasks are to be migrated
  beforehand.

  Note that this already affects cgroup_transfer_tasks().  cgroup
  currently cannot guarantee atomic success or failure of the
  operation.  It may fail in the middle and after such failure cgroup
  doesn't have enough information to roll back properly.  It just
  aborts with some tasks migrated and others not.

To resolve the situation, we're going to use task->cg_list during
migration too.  Instead of building a separate array, target tasks
will be linked into a dedicated migration list_head on the owning
css_set.  Tasks on the migration list are treated the same as tasks on
the usual tasks list; however, being on a separate list allows cgroup
migration code path to keep track of the target tasks by simply
keeping the list of css_sets with tasks being migrated, making
unpredictable dynamic allocation unnecessary.

In prepartion of such migration path update, this patch introduces
css_set->mg_tasks list and updates css_set task iterations so that
they walk both css_set->tasks and ->mg_tasks.  Note that ->mg_tasks
isn't used yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8c283a910b91..528e2aed36c3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -324,10 +324,14 @@ struct css_set {
 	struct hlist_node hlist;
 
 	/*
-	 * List running through all tasks using this cgroup
-	 * group. Protected by css_set_lock
+	 * Lists running through all tasks using this cgroup group.
+	 * mg_tasks lists tasks which belong to this cset but are in the
+	 * process of being migrated out or in.  Protected by
+	 * css_set_rwsem, but, during migration, once tasks are moved to
+	 * mg_tasks, it can be read safely while holding cgroup_mutex.
 	 */
 	struct list_head tasks;
+	struct list_head mg_tasks;
 
 	/*
 	 * List of cgrp_cset_links pointing at cgroups referenced from this
-- 
cgit v1.2.3


From b3dc094e93905ae9c1bc0815402ad8e5b203d068 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Feb 2014 10:04:01 -0500
Subject: cgroup: use css_set->mg_tasks to track target tasks during migration

Currently, while migrating tasks from one cgroup to another,
cgroup_attach_task() builds a flex array of all target tasks;
unfortunately, this has a couple issues.

* Flex array has size limit.  On 64bit, struct task_and_cgroup is
  24bytes making the flex element limit around 87k.  It is a high
  number but not impossible to hit.  This means that the current
  cgroup implementation can't migrate a process with more than 87k
  threads.

* Process migration involves memory allocation whose size is dependent
  on the number of threads the process has.  This means that cgroup
  core can't guarantee success or failure of multi-process migrations
  as memory allocation failure can happen in the middle.  This is in
  part because cgroup can't grab threadgroup locks of multiple
  processes at the same time, so when there are multiple processes to
  migrate, it is imposible to tell how many tasks are to be migrated
  beforehand.

  Note that this already affects cgroup_transfer_tasks().  cgroup
  currently cannot guarantee atomic success or failure of the
  operation.  It may fail in the middle and after such failure cgroup
  doesn't have enough information to roll back properly.  It just
  aborts with some tasks migrated and others not.

To resolve the situation, this patch updates the migration path to use
task->cg_list to track target tasks.  The previous patch already added
css_set->mg_tasks and updated iterations in non-migration paths to
include them during task migration.  This patch updates migration path
to actually make use of it.

Instead of putting onto a flex_array, each target task is moved from
its css_set->tasks list to css_set->mg_tasks and the migration path
keeps trace of all the source css_sets and the associated cgroups.
Once all source css_sets are determined, the destination css_set for
each is determined, linked to the matching source css_set and put on a
separate list.

To iterate the target tasks, migration path just needs to iterat
through either the source or target css_sets, depending on whether
migration has been committed or not, and the tasks on their ->mg_tasks
lists.  cgroup_taskset is updated to contain the list_heads for source
and target css_sets and the iteration cursor.  cgroup_taskset_*() are
accordingly updated to walk through css_sets and their ->mg_tasks.

This resolves the above listed issues with moderate additional
complexity.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 528e2aed36c3..3a1cb265afd6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -346,6 +346,22 @@ struct css_set {
 	 */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
+	/*
+	 * List of csets participating in the on-going migration either as
+	 * source or destination.  Protected by cgroup_mutex.
+	 */
+	struct list_head mg_node;
+
+	/*
+	 * If this cset is acting as the source of migration the following
+	 * two fields are set.  mg_src_cgrp is the source cgroup of the
+	 * on-going migration and mg_dst_cset is the destination cset the
+	 * target tasks on this cset should be migrated to.  Protected by
+	 * cgroup_mutex.
+	 */
+	struct cgroup *mg_src_cgrp;
+	struct css_set *mg_dst_cset;
+
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
 };
-- 
cgit v1.2.3


From 1958d2d53dadbb1c9aaf0b37741f13a60098b243 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Feb 2014 10:04:03 -0500
Subject: cgroup: split process / task migration into four steps

Currently, process / task migration is a single operation which may
fail depending on memory pressure or the involved controllers'
->can_attach() callbacks.  One problem with this approach is migration
of multiple targets.  It's impossible to tell whether a given target
will be successfully migrated beforehand and cgroup core can't keep
track of enough states to roll back after intermediate failure.

This is already an issue with cgroup_transfer_tasks().  Also, we're
gonna need multiple target migration for unified hierarchy.

This patch splits migration into four stages -
cgroup_migrate_add_src(), cgroup_migrate_prepare_dst(),
cgroup_migrate() and cgroup_migrate_finish(), where
cgroup_migrate_prepare_dst() performs all the operations which may
fail due to allocation failure without actually migrating the target.

The four separate stages mean that, disregarding ->can_attach()
failures, the success or failure of multi target migration can be
determined before performing any actual migration.  If preparations of
all targets succeed, the whole thing will succeed.  If not, the whole
operation can fail without any side-effect.

Since the previous patch to use css_set->mg_tasks to keep track of
migration targets, the only thing which may need memory allocation
during migration is the target css_sets.  cgroup_migrate_prepare()
pins all source and target css_sets and link them up.  Note that this
can be performed without holding threadgroup_lock even if the target
is a process.  As long as cgroup_mutex is held, no new css_set can be
put into play.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3a1cb265afd6..4829a577c1b9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -350,6 +350,7 @@ struct css_set {
 	 * List of csets participating in the on-going migration either as
 	 * source or destination.  Protected by cgroup_mutex.
 	 */
+	struct list_head mg_preload_node;
 	struct list_head mg_node;
 
 	/*
-- 
cgit v1.2.3


From 0e1d768f1b1873272ec4e8dc1482bb5281855017 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Feb 2014 10:04:03 -0500
Subject: cgroup: drop task_lock() protection around task->cgroups

For optimization, task_lock() is additionally used to protect
task->cgroups.  The optimization is pretty dubious as either
css_set_rwsem is grabbed anyway or PF_EXITING already protects
task->cgroups.  It adds only overhead and confusion at this point.
Let's drop task_[un]lock() and update comments accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4829a577c1b9..acbb9a4cb6e9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -658,10 +658,12 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
+extern struct rw_semaphore css_set_rwsem;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
-		lockdep_is_held(&(task)->alloc_lock) ||			\
-		lockdep_is_held(&cgroup_mutex) || (__c))
+		lockdep_is_held(&cgroup_mutex) ||			\
+		lockdep_is_held(&css_set_rwsem) ||			\
+		((task)->flags & PF_EXITING) || (__c))
 #else
 #define task_css_set_check(task, __c)					\
 	rcu_dereference((task)->cgroups)
-- 
cgit v1.2.3


From fdce6bf8c5b6968eb9b96ecc5fe400514a604902 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Mar 2014 10:23:54 -0400
Subject: cgroup: remove NULL checks from [pr_cont_]cgroup_{name|path}()

The dummy hierarchy is now a fully functional one and dummy_top has a
kernfs_node associated with it.  Drop the NULL checks in
[pr_cont_]cont_{name|path}() which are no longer necessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index acbb9a4cb6e9..9f4f253f0e47 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -508,39 +508,23 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq);
 
 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
 {
-	/* dummy_top doesn't have a kn associated */
-	if (cgrp->kn)
-		return kernfs_name(cgrp->kn, buf, buflen);
-	else
-		return strlcpy(buf, "/", buflen);
+	return kernfs_name(cgrp->kn, buf, buflen);
 }
 
 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
 					      size_t buflen)
 {
-	/* dummy_top doesn't have a kn associated */
-	if (cgrp->kn)
-		return kernfs_path(cgrp->kn, buf, buflen);
-	strlcpy(buf, "/", buflen);
-	return (buflen <= 2) ? NULL : buf;
+	return kernfs_path(cgrp->kn, buf, buflen);
 }
 
 static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
 {
-	/* dummy_top doesn't have a kn associated */
-	if (cgrp->kn)
-		pr_cont_kernfs_name(cgrp->kn);
-	else
-		pr_cont("/");
+	pr_cont_kernfs_name(cgrp->kn);
 }
 
 static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 {
-	/* dummy_top doesn't have a kn associated */
-	if (cgrp->kn)
-		pr_cont_kernfs_path(cgrp->kn);
-	else
-		pr_cont("/");
+	pr_cont_kernfs_path(cgrp->kn);
 }
 
 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-- 
cgit v1.2.3


From 944196278d3dea0cece1636de417b56897d9a108 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Mar 2014 10:23:54 -0400
Subject: cgroup: move ->subsys_mask from cgroupfs_root to cgroup

cgroupfs_root->subsys_mask represents the controllers attached to the
hierarchy.  This patch moves the field to cgroup.  Subsystem
initialization and rebinding updates the top cgroup's subsys_mask.
For !root cgroups, the subsys_mask bits are set from create_css() and
cleared from kill_css(), which effectively means that all cgroups will
have the same subsys_mask as the top cgroup.

While this doesn't make any difference now, this will help
implementation of the default unified hierarchy where !root cgroups
may have subsets of the top_cgroup's subsys_mask.

While at it, __kill_css() is split out of kill_css().  The former
doesn't care about the subsys_mask while the latter becomes noop if
the controller is already killed and clears the matching bit if not
before proceeding to killing the css.  This will be used later by the
default unified hierarchy implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9f4f253f0e47..3752a0182c94 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -173,6 +173,9 @@ struct cgroup {
 	 */
 	u64 serial_nr;
 
+	/* The bitmask of subsystems attached to this cgroup */
+	unsigned long subsys_mask;
+
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
 
@@ -276,9 +279,6 @@ enum {
 struct cgroupfs_root {
 	struct kernfs_root *kf_root;
 
-	/* The bitmask of subsystems attached to this hierarchy */
-	unsigned long subsys_mask;
-
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
-- 
cgit v1.2.3


From 3dd06ffa9df99aa88f4e01eaa0c9d3cb362430b0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Mar 2014 10:23:54 -0400
Subject: cgroup: rename cgroup_dummy_root and related names

The dummy root will be repurposed to serve as the default unified
hierarchy.  Let's rename things in preparation.

* s/cgroup_dummy_root/cgrp_dfl_root/
* s/cgroupfs_root/cgroup_root/ as we don't do fs part directly anymore
* s/cgroup_root->top_cgroup/cgroup_root->cgrp/ for brevity

This is pure rename.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3752a0182c94..77294fc66603 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -24,7 +24,7 @@
 
 #ifdef CONFIG_CGROUPS
 
-struct cgroupfs_root;
+struct cgroup_root;
 struct cgroup_subsys;
 struct inode;
 struct cgroup;
@@ -179,7 +179,7 @@ struct cgroup {
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
 
-	struct cgroupfs_root *root;
+	struct cgroup_root *root;
 
 	/*
 	 * List of cgrp_cset_links pointing at css_sets with tasks in this
@@ -211,7 +211,7 @@ struct cgroup {
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
 
-/* cgroupfs_root->flags */
+/* cgroup_root->flags */
 enum {
 	/*
 	 * Unfortunately, cgroup core and various controllers are riddled
@@ -272,18 +272,18 @@ enum {
 };
 
 /*
- * A cgroupfs_root represents the root of a cgroup hierarchy, and may be
+ * A cgroup_root represents the root of a cgroup hierarchy, and may be
  * associated with a kernfs_root to form an active hierarchy.  This is
  * internal to cgroup core.  Don't access directly from controllers.
  */
-struct cgroupfs_root {
+struct cgroup_root {
 	struct kernfs_root *kf_root;
 
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
 	/* The root cgroup.  Root is destroyed on its release. */
-	struct cgroup top_cgroup;
+	struct cgroup cgrp;
 
 	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
 	atomic_t nr_cgrps;
@@ -598,7 +598,7 @@ struct cgroup_subsys {
 	const char *name;
 
 	/* link to parent, protected by cgroup_lock() */
-	struct cgroupfs_root *root;
+	struct cgroup_root *root;
 
 	/*
 	 * List of cftypes.  Each entry is the first entry of an array
-- 
cgit v1.2.3


From 4d3bb511b5f9980fc3e9ae5939ebc475b231d3fc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Mar 2014 10:23:54 -0400
Subject: cgroup: drop const from @buffer of cftype->write_string()

cftype->write_string() just passes on the writeable buffer from kernfs
and there's no reason to add const restriction on the buffer.  The
only thing const achieves is unnecessarily complicating parsing of the
buffer.  Drop const from @buffer.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Daniel Borkmann <dborkman@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 77294fc66603..79993ac066c5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -454,7 +454,7 @@ struct cftype {
 	 * Returns 0 or -ve error code.
 	 */
 	int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
-			    const char *buffer);
+			    char *buffer);
 	/*
 	 * trigger() callback can be used to get some kick from the
 	 * userspace, when the actual string written is not important
-- 
cgit v1.2.3


From a2dd424750807f83632a2a70293961086fd8f870 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Mar 2014 10:23:55 -0400
Subject: cgroup: make cgrp_dfl_root mountable

cgrp_dfl_root will be used as the default unified hierarchy.  This
patch makes cgrp_dfl_root mountable by making the following changes.

* cgroup_init_early() now initializes cgrp_dfl_root w/
  CGRP_ROOT_SANE_BEHAVIOR.  The default hierarchy is always sane.

* parse_cgroupfs_options() and cgroup_mount() are updated such that
  cgrp_dfl_root is mounted if sane_behavior is specified w/o any
  subsystems.

* rebind_subsystems() now populates the root directory of
  cgrp_dfl_root.  Note that the function still guarantees success of
  rebinding subsystems to cgrp_dfl_root.  If populating fails while
  rebinding to cgrp_dfl_root, it whines but ignores the error.

* For backward compatibility, the default hierarchy shows up in
  /proc/$PID/cgroup only after it's explicitly mounted so that
  userland which doesn't make use of it doesn't see any change.

* "current_css_set_cg_links" file of debug cgroup now treats the
  default hierarchy the same as other hierarchies.  This is visible to
  userland.  Given that it's for debug controller, this should be
  fine.

* While at it, implement cgroup_on_dfl() which tests whether a give
  cgroup is on the default hierarchy or not.

The above changes make cgrp_dfl_root mostly equivalent to other
controllers but the actual unified hierarchy behaviors are not
implemented yet.  Let's plug child cgroup creation in cgrp_dfl_root
from create_cgroup() for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 79993ac066c5..7e9fa505b7bb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -250,6 +250,9 @@ enum {
 	 *
 	 * - "cgroup.clone_children" is removed.
 	 *
+	 * - If mount is requested with sane_behavior but without any
+	 *   subsystem, the default unified hierarchy is mounted.
+	 *
 	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
 	 *   and take masks of ancestors with non-empty cpus/mems, instead of
 	 *   being moved to an ancestor.
@@ -468,6 +471,13 @@ struct cftype {
 #endif
 };
 
+extern struct cgroup_root cgrp_dfl_root;
+
+static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+	return cgrp->root == &cgrp_dfl_root;
+}
+
 /*
  * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details.  This
  * function can be called as long as @cgrp is accessible.
-- 
cgit v1.2.3


From 8cbbf2c972c4444cad36f61cd571714c39b8cf04 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Mar 2014 10:23:55 -0400
Subject: cgroup: implement CFTYPE_ONLY_ON_DFL

This cftype flag makes the file only appear on the default hierarchy.
This will later be used for cgroup.controllers file.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7e9fa505b7bb..43d1ed30bae3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -384,6 +384,7 @@ enum {
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
 	CFTYPE_INSANE		= (1 << 2),	/* don't create if sane_behavior */
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
+	CFTYPE_ONLY_ON_DFL	= (1 << 4),	/* only on default hierarchy */
 };
 
 #define MAX_CFTYPE_NAME		64
-- 
cgit v1.2.3


From 1ec41830e087cda1f62dda4182c2b62811eb0ffc Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 28 Mar 2014 15:22:19 +0800
Subject: cgroup: remove useless argument from cgroup_exit()

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 43d1ed30bae3..c2515851c1aa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -33,7 +33,7 @@ extern int cgroup_init_early(void);
 extern int cgroup_init(void);
 extern void cgroup_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
-extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_exit(struct task_struct *p);
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
 
@@ -843,7 +843,7 @@ static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
-static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
+static inline void cgroup_exit(struct task_struct *p) {}
 
 static inline int cgroupstats_build(struct cgroupstats *stats,
 					struct dentry *dentry)
-- 
cgit v1.2.3


From 5024ae29cd285ce9e736776414da645d3a91828c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 7 May 2014 21:31:17 -0400
Subject: cgroup: introduce task_css_is_root()

Determining the css of a task usually requires RCU read lock as that's
the only thing which keeps the returned css accessible till its
reference is acquired; however, testing whether a task belongs to the
root can be performed without dereferencing the returned css by
comparing the returned pointer against the root one in init_css_set[]
which never changes.

Implement task_css_is_root() which can be invoked in any context.
This will be used by the scheduled cgroup_freezer change.

v2: cgroup no longer supports modular controllers.  No need to export
    init_css_set.  Pointed out by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux/cgroup.h')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c2515851c1aa..d60904b9e505 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -473,6 +473,7 @@ struct cftype {
 };
 
 extern struct cgroup_root cgrp_dfl_root;
+extern struct css_set init_css_set;
 
 static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
@@ -700,6 +701,20 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
 	return task_css_check(task, subsys_id, false);
 }
 
+/**
+ * task_css_is_root - test whether a task belongs to the root css
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Test whether @task belongs to the root css on the specified subsystem.
+ * May be invoked in any context.
+ */
+static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
+{
+	return task_css_check(task, subsys_id, true) ==
+		init_css_set.subsys[subsys_id];
+}
+
 static inline struct cgroup *task_cgroup(struct task_struct *task,
 					 int subsys_id)
 {
-- 
cgit v1.2.3