--- v7
+++ v8
@@ -1,129 +1,458 @@
-There are a number of different reasons which can cause a partition to
-become invalid. A user seeing an invalid partition may not know exactly
-why. To help user to get a better understanding of the underlying reason,
-The cpuset.cpus.partition control file, when read, will now report the
-reason why a partition become invalid. When a partition does become
-invalid, reading the control file will show "root invalid (<reason>)"
-where <reason> is a string that describes why the partition is invalid.
-
-Signed-off-by: Waiman Long <longman@redhat.com>
+In order to plan for proper partitioning of a system, a valid partition
+must be using only CPUs designated in "cpuset.cpus" of the partition
+root. It can be a subset, but it can't use CPUs outside of its designed
+list. If none of the CPUs in the desginated list can be granted to the
+partition, its "cpuset.cpus.effective" becomes empty. This is allowed
+as long as there is no task in the partition.
+
+To ease implementation, there are additional contraints in enabling a
+partition root.
+
+ 1) The "cpuset.cpus" is non-empty and exclusive.
+ 2) The parent cgroup is a valid partition root.
+ 3) The "cpuset.cpus" is a subset of parent's "cpuset.cpus".
+ 4) There is no child cgroups with cpuset enabled.
+
+This allows offlined cpus in parent's "cpuset.cpus" to be granted to
+a child partition which can have empty "cpuset.cpus.effective" when it
+has no task.
+
+The cpuset's subparts_cpus keeps track of CPUs (including offline CPUs)
+that are allocated to child partitions. It does not change during
+hotplug operations.
+
+Once a partition root has been enabled, changes to "cpuset.cpus" is
+generally allowed as long as the cpu list is exclusive, non-empty and
+is a superset of children's cpu lists. These existing rules are enforced
+by validate_change().
+
+A partition will become invalid when one or more of the following
+constraints are violated:
+
+ 1) The parent cgroup is a valid partition root.
+ 2) "cpuset.cpus.effective" is a subset of "cpuset.cpus"
+ 3) "cpuset.cpus.effective" is non-empty when there are tasks
+ in the partition.
+
+Disabling a partition root is always allowed even if there are child
+partitions underneath it. In this case, all the child partitions are
+also disabled (switch to "member"). So care must be taken to double check
+if there are child partitions underneath it before disabling a partition.
+
+This patch makes the necessary change to support the above features
+and constraints.
+
+Signed-off-by: Waiman Long <longman-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
- kernel/cgroup/cpuset.c | 46 +++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 43 insertions(+), 3 deletions(-)
+ kernel/cgroup/cpuset.c | 228 +++++++++++++++++++++--------------------
+ 1 file changed, 117 insertions(+), 111 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
-index ddea05e4d1f0..eb2e81f9326b 100644
+index e3a6609bc919..e2c01345353c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
-@@ -78,6 +78,24 @@ struct fmeter {
- spinlock_t lock; /* guards read or write of above */
- };
-
-+/*
-+ * Invalid partition error code
-+ */
-+enum prs_errcode {
-+ PERR_NONE = 0,
-+ PERR_INVCPUS,
-+ PERR_NOCPUS,
-+ PERR_PARENT,
-+ PERR_HOTPLUG,
-+};
+@@ -1227,29 +1227,27 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ return -EINVAL;
+
+ /*
+- * Enabling/disabling partition root is not allowed if there are
+- * online children.
++ * Enabling partition root is not allowed if there are online children.
+ */
+- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
++ if ((cmd == partcmd_enable) && css_has_online_children(&cpuset->css))
+ return -EBUSY;
+
+ adding = deleting = false;
+ old_prs = new_prs = cpuset->partition_root_state;
+ if (cmd == partcmd_enable) {
+ /*
+- * Enabling partition root is not allowed if not all the CPUs
+- * can be granted from parent's effective_cpus.
++ * Enabling partition root is not allowed if cpus_allowed isn't
++ * a subset of parent's cpus_allowed.
+ */
+- if (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus))
++ if (!cpumask_subset(cpuset->cpus_allowed, parent->cpus_allowed))
+ return -EINVAL;
+
+ /*
+ * A parent can be left with no CPU as long as there is no
+- * task directly associated with the parent partition. For
+- * such a parent, no new task can be moved into it.
++ * task directly associated with the parent partition.
+ */
+ if (partition_is_populated(parent, cpuset) &&
+- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))
++ cpumask_subset(parent->effective_cpus, cpuset->cpus_allowed))
+ return -EINVAL;
+
+ cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+@@ -1261,54 +1259,52 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ /*
+ * partcmd_update with newmask:
+ *
++ * Compute add/delete mask to/from subparts_cpus
++ *
+ * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
+- * addmask = newmask & parent->effective_cpus
++ * addmask = newmask & parent->cpus_allowed
+ * & ~parent->subparts_cpus
+ */
+ cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->subparts_cpus);
+
+- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
++ cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+ /*
+- * Return error if the new effective_cpus could become empty
+- * and there are tasks in the parent.
++ * Make partition invalid if parent's effective_cpus could
++ * become empty and there are tasks in the parent.
+ */
+- if (adding && partition_is_populated(parent, cpuset) &&
+- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+- if (!deleting)
+- return -EINVAL;
+- /*
+- * As some of the CPUs in subparts_cpus might have
+- * been offlined, we need to compute the real delmask
+- * to confirm that.
+- */
+- if (!cpumask_and(tmp->addmask, tmp->delmask,
+- cpu_active_mask))
+- return -EINVAL;
+- cpumask_copy(tmp->addmask, parent->effective_cpus);
+- }
++ part_error = partition_is_populated(parent, cpuset) &&
++ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
++ !cpumask_intersects(tmp->delmask, cpu_active_mask);
+ } else {
+ /*
+ * partcmd_update w/o newmask:
+ *
+ * addmask = cpus_allowed & parent->effective_cpus
+ *
+- * Note that parent's subparts_cpus may have been
+- * pre-shrunk in case there is a change in the cpu list.
+- * So no deletion is needed.
++ * This gets invoked either due to a hotplug event or
++ * from update_cpumasks_hier() where we can't return an
++ * error. This can cause a partition root to become invalid
++ * in the case of a hotplug.
++ *
++ * A partition error happens when:
++ * 1) Cpuset is valid partition, but parent does not distribute
++ * out any CPUs.
++ * 2) Parent has tasks and all its effective CPUs will have
++ * to be distributed out.
+ */
+ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
+ parent->effective_cpus);
+- part_error = cpumask_equal(tmp->addmask, parent->effective_cpus) &&
+- partition_is_populated(parent, cpuset);
++ part_error = (is_partition_root(cpuset) &&
++ !parent->nr_subparts_cpus) ||
++ (cpumask_equal(parent->effective_cpus, tmp->addmask) &&
++ partition_is_populated(parent, cpuset));
+ }
+
+ if (cmd == partcmd_update) {
+- int prev_prs = cpuset->partition_root_state;
+-
+ /*
+ * Check for possible transition between PRS_ENABLED
+ * and PRS_ERROR.
+@@ -1323,13 +1319,9 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ new_prs = PRS_ENABLED;
+ break;
+ }
+- /*
+- * Set part_error if previously in invalid state.
+- */
+- part_error = (prev_prs == PRS_ERROR);
+ }
+
+- if (!part_error && (new_prs == PRS_ERROR))
++ if ((old_prs == PRS_ERROR) && (new_prs == PRS_ERROR))
+ return 0; /* Nothing need to be done */
+
+ if (new_prs == PRS_ERROR) {
+@@ -1382,6 +1374,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @tmp: temp variables for calculating effective_cpus & partition setup
++ * @force: don't skip any descendant cpusets if set
+ *
+ * When configured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
+@@ -1390,7 +1383,8 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ *
+ * Called with cpuset_rwsem held
+ */
+-static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
++static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
++ bool force)
+ {
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+@@ -1400,6 +1394,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
++ bool update_parent = false;
+
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
+
+@@ -1427,9 +1422,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+
+ /*
+ * Skip the whole subtree if the cpumask remains the same
+- * and has no partition root state.
++ * with no partition root state and force flag not set.
+ */
+- if (!cp->partition_root_state &&
++ if (!cp->partition_root_state && !force &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+@@ -1466,8 +1461,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+ break;
+
+ case PRS_ENABLED:
+- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
+- update_tasks_cpumask(parent);
++ update_parent = true;
+ break;
+
+ case PRS_ERROR:
+@@ -1483,40 +1477,41 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+ continue;
+ rcu_read_unlock();
+
++ if (update_parent) {
++ if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
++ update_tasks_cpumask(parent);
++ /*
++ * The cpuset partition_root_state may be changed
++ * to PRS_ERROR. Capture it.
++ */
++ new_prs = cp->partition_root_state;
++ }
+
-+static const char * const perr_strings[] = {
-+ [PERR_INVCPUS] = "Invalid change to cpuset.cpus",
-+ [PERR_PARENT] = "Parent is no longer a partition root",
-+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
-+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
-+};
+ spin_lock_irq(&callback_lock);
+
+- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+- if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
++ if (cp->nr_subparts_cpus && (new_prs <= 0)) {
++ /*
++ * Put all active subparts_cpus back to effective_cpus.
++ */
++ cpumask_or(tmp->new_cpus, tmp->new_cpus,
++ cp->subparts_cpus);
++ cpumask_and(tmp->new_cpus, tmp->new_cpus,
++ cpu_active_mask);
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+- } else if (cp->nr_subparts_cpus) {
++ }
+
- struct cpuset {
- struct cgroup_subsys_state css;
-
-@@ -163,6 +181,9 @@ struct cpuset {
-
- /* Handle for cpuset.cpus.partition */
- struct cgroup_file partition_file;
++ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
++ if (cp->nr_subparts_cpus) {
+ /*
+ * Make sure that effective_cpus & subparts_cpus
+ * are mutually exclusive.
+- *
+- * In the unlikely event that effective_cpus
+- * becomes empty. we clear cp->nr_subparts_cpus and
+- * let its child partition roots to compete for
+- * CPUs again.
+ */
+ cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
+ cp->subparts_cpus);
+- if (cpumask_empty(cp->effective_cpus)) {
+- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+- cpumask_clear(cp->subparts_cpus);
+- cp->nr_subparts_cpus = 0;
+- } else if (!cpumask_subset(cp->subparts_cpus,
+- tmp->new_cpus)) {
+- cpumask_andnot(cp->subparts_cpus,
+- cp->subparts_cpus, tmp->new_cpus);
+- cp->nr_subparts_cpus
+- = cpumask_weight(cp->subparts_cpus);
+- }
+ }
+
+- if (new_prs != old_prs)
+- cp->partition_root_state = new_prs;
+-
++ cp->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
+ notify_partition_change(cp, old_prs, new_prs);
+
+@@ -1570,7 +1565,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ if (!sibling->use_parent_ecpus)
+ continue;
+
+- update_cpumasks_hier(sibling, tmp);
++ update_cpumasks_hier(sibling, tmp, false);
+ }
+ rcu_read_unlock();
+ }
+@@ -1643,13 +1638,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ * Make sure that subparts_cpus is a subset of cpus_allowed.
+ */
+ if (cs->nr_subparts_cpus) {
+- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
+- cs->cpus_allowed);
++ cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
++ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
+ spin_unlock_irq(&callback_lock);
+
+- update_cpumasks_hier(cs, &tmp);
++ update_cpumasks_hier(cs, &tmp, false);
+
+ if (cs->partition_root_state) {
+ struct cpuset *parent = parent_cs(cs);
+@@ -2071,20 +2066,23 @@ static int update_prstate(struct cpuset *cs, int new_prs)
+ }
+ } else {
+ /*
+- * Turning off partition root will clear the
+- * CS_CPU_EXCLUSIVE bit.
++ * Switch back to member is always allowed even if it
++ * disables child partitions.
+ */
+- if (old_prs == PRS_ERROR) {
+- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+- err = 0;
+- goto out;
++ err = 0;
++ update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
++ &tmpmask);
++ /*
++ * If there are child partitions, we have to disable them.
++ */
++ if (unlikely(cs->nr_subparts_cpus)) {
++ spin_lock_irq(&callback_lock);
++ cs->nr_subparts_cpus = 0;
++ cpumask_clear(cs->subparts_cpus);
++ compute_effective_cpumask(cs->effective_cpus, cs, parent);
++ spin_unlock_irq(&callback_lock);
+ }
+
+- err = update_parent_subparts_cpumask(cs, partcmd_disable,
+- NULL, &tmpmask);
+- if (err)
+- goto out;
+-
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+@@ -2105,6 +2103,11 @@ static int update_prstate(struct cpuset *cs, int new_prs)
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
++ /*
++ * Update child cpusets when disabling partition.
++ */
++ if (new_prs == PRS_DISABLED && !list_empty(&cs->css.children))
++ update_cpumasks_hier(cs, &tmpmask, true);
+ notify_partition_change(cs, old_prs, new_prs);
+ }
+
+@@ -3176,12 +3179,32 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
+
+ /*
+ * In the unlikely event that a partition root has empty
+- * effective_cpus with tasks or its parent becomes erroneous, we
+- * have to transition it to the erroneous state.
++ * effective_cpus with tasks, we will have to invalidate child
++ * partitions, if present, by setting nr_subparts_cpus to 0 to
++ * reclaim their cpus.
++ */
++ if (is_partition_root(cs) && cpumask_empty(&new_cpus) &&
++ cs->nr_subparts_cpus && partition_is_populated(cs, NULL)) {
++ spin_lock_irq(&callback_lock);
++ cs->nr_subparts_cpus = 0;
++ cpumask_clear(cs->subparts_cpus);
++ spin_unlock_irq(&callback_lock);
++ compute_effective_cpumask(&new_cpus, cs, parent);
++ }
+
-+ /* Invalid partition error code, not lock protected */
-+ enum prs_errcode prs_err;
- };
-
- /*
-@@ -272,8 +293,13 @@ static inline int is_partition_root(const struct cpuset *cs)
- static inline void notify_partition_change(struct cpuset *cs,
- int old_prs, int new_prs)
- {
-- if (old_prs != new_prs)
-- cgroup_file_notify(&cs->partition_file);
-+ if (old_prs == new_prs)
-+ return;
-+ cgroup_file_notify(&cs->partition_file);
++ /*
++ * Force the partition to become invalid if either one of
++ * the following conditions hold:
++ * 1) empty effective cpus with tasks in partition
++ * 2) parent is invalid or doesn't grant any cpus to child partitions.
+ */
+ if (is_partition_root(cs) &&
+ ((cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) ||
+- (parent->partition_root_state == PRS_ERROR))) {
++ !parent->nr_subparts_cpus)) {
++ int old_prs;
+
-+ /* Reset prs_err if not invalid */
-+ if (new_prs != PRS_ERROR)
-+ WRITE_ONCE(cs->prs_err, PERR_NONE);
- }
-
- static struct cpuset top_cpuset = {
-@@ -1243,6 +1269,8 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
- cpu_active_mask))
- part_error = true;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
-+ if ((READ_ONCE(cpuset->prs_err) == PERR_NONE) && part_error)
-+ WRITE_ONCE(cpuset->prs_err, PERR_INVCPUS);
- }
- } else {
- /*
-@@ -1264,6 +1292,8 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
- part_error = (is_partition_root(cpuset) &&
- !parent->nr_subparts_cpus) ||
- cpumask_equal(tmp->addmask, parent->effective_cpus);
-+ if (is_partition_root(cpuset) && part_error)
-+ WRITE_ONCE(cpuset->prs_err, PERR_NOCPUS);
- }
-
- if (cmd == partcmd_update) {
-@@ -1427,6 +1457,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
- * When parent is invalid, it has to be too.
- */
- new_prs = PRS_ERROR;
-+ WRITE_ONCE(cp->prs_err, PERR_PARENT);
- break;
- }
- }
-@@ -2546,6 +2577,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
- static int sched_partition_show(struct seq_file *seq, void *v)
- {
- struct cpuset *cs = css_cs(seq_css(seq));
-+ const char *err;
-
- switch (cs->partition_root_state) {
- case PRS_ENABLED:
-@@ -2555,7 +2587,11 @@ static int sched_partition_show(struct seq_file *seq, void *v)
- seq_puts(seq, "member\n");
- break;
- case PRS_ERROR:
-- seq_puts(seq, "root invalid\n");
-+ err = perr_strings[READ_ONCE(cs->prs_err)];
-+ if (err)
-+ seq_printf(seq, "root invalid (%s)\n", err);
-+ else
-+ seq_puts(seq, "root invalid\n");
- break;
- }
- return 0;
-@@ -3155,6 +3191,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
++ update_parent_subparts_cpumask(cs, partcmd_disable,
++ NULL, tmp);
+ if (cs->nr_subparts_cpus) {
spin_lock_irq(&callback_lock);
- cs->partition_root_state = PRS_ERROR;
- spin_unlock_irq(&callback_lock);
-+ if (parent->partition_root_state == PRS_ERROR)
-+ WRITE_ONCE(cs->prs_err, PERR_PARENT);
-+ else
-+ WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
- notify_partition_change(cs, old_prs, PRS_ERROR);
+ cs->nr_subparts_cpus = 0;
+@@ -3190,40 +3213,23 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+- /*
+- * Force the partition to become invalid if either one of
+- * the following conditions hold:
+- * 1) empty effective cpus but not valid empty partition.
+- * 2) parent is invalid or doesn't grant any cpus to child
+- * partitions.
+- */
+- if ((parent->partition_root_state == PRS_ERROR) ||
+- (cpumask_empty(&new_cpus) &&
+- partition_is_populated(cs, NULL))) {
+- int old_prs;
+-
+- update_parent_subparts_cpumask(cs, partcmd_disable,
+- NULL, tmp);
+- old_prs = cs->partition_root_state;
+- if (old_prs != PRS_ERROR) {
+- spin_lock_irq(&callback_lock);
+- cs->partition_root_state = PRS_ERROR;
+- spin_unlock_irq(&callback_lock);
+- notify_partition_change(cs, old_prs, PRS_ERROR);
+- }
++ old_prs = cs->partition_root_state;
++ if (old_prs != PRS_ERROR) {
++ spin_lock_irq(&callback_lock);
++ cs->partition_root_state = PRS_ERROR;
++ spin_unlock_irq(&callback_lock);
++ notify_partition_change(cs, old_prs, PRS_ERROR);
}
cpuset_force_rebuild();
+ }
+
+ /*
+ * On the other hand, an erroneous partition root may be transitioned
+- * back to a regular one or a partition root with no CPU allocated
+- * from the parent may change to erroneous.
++ * back to a regular one.
+ */
+- if (is_partition_root(parent) &&
+- ((cs->partition_root_state == PRS_ERROR) ||
+- !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
++ else if (is_partition_root(parent) &&
++ (cs->partition_root_state == PRS_ERROR) &&
++ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+ cpuset_force_rebuild();
+
+ update_tasks:
--
-2.18.1
-
+2.27.0
+