Re: [PATCHv6 1/3] rdmacg: Added rdma cgroup controller

From: Haggai Eran <hidden>
Date: 2016-02-24 16:49:32
Also in: linux-rdma, lkml

Hi,

Overall I the patch looks good to me. I have a few comments below.

On 20/02/2016 13:00, Parav Pandit wrote:

Resource pool is created/destroyed dynamically whenever
charging/uncharging occurs respectively and whenever user
configuration is done. Its a tradeoff of memory vs little more code

Its -> It's

space that creates resource pool whenever necessary,
instead of creating them during cgroup creation and device registration
time.

Signed-off-by: Parav Pandit <redacted>

quoted hunk ↗ jump to hunk

diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 0000000..b370733
--- /dev/null
+++ b/include/linux/cgroup_rdma.h

+struct rdmacg_device {
+	struct rdmacg_pool_info pool_info;
+	struct list_head	rdmacg_list;
+	struct list_head	rpool_head;
+	spinlock_t		rpool_lock;	/* protects rsource pool list */

rsource -> resource

+	char			*name;
+};
+
+/* APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      int resource_index,
+		      int num);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     int resource_index,
+		     int num);
+void rdmacg_query_limit(struct rdmacg_device *device,
+			int *limits, int max_count);

You can drop the max_count parameter, and require the caller to
always provide pool_info->table_len items, couldn't you?

quoted hunk ↗ jump to hunk

+
+#endif	/* CONFIG_CGROUP_RDMA */
+#endif	/* _CGROUP_RDMA_H */

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..d0e597c 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h

@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */

diff --git a/init/Kconfig b/init/Kconfig
index 2232080..1741b65 100644
--- a/init/Kconfig
+++ b/init/Kconfig

@@ -1054,6 +1054,16 @@ config CGROUP_PIDS
 	  since the PIDs limit only affects a process's ability to fork, not to
 	  attach to a cgroup.
 
+config CGROUP_RDMA
+	bool "RDMA controller"
+	help
+	  Provides enforcement of RDMA resources defined by IB stack.
+	  It is fairly easy for consumers to exhaust RDMA resources, which
+	  can result into resource unavailibility to other consumers.

unavailibility -> unavailability

quoted hunk ↗ jump to hunk

+	  RDMA controller is designed to stop this from happening.
+	  Attaching processes with active RDMA resources to the cgroup
+	  hierarchy is allowed even if can cross the hierarchy's limit.
+
 config CGROUP_FREEZER
 	bool "Freezer controller"
 	help

diff --git a/kernel/Makefile b/kernel/Makefile
index baa55e5..501f5df 100644

+/**
+ * uncharge_cg_resource - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy

It only uncharges a single cg, right?

+ * @device: pointer to ib device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ * @num: the number of rdma resource to uncharge
+ *
+ * It also frees the resource pool in the hierarchy for the resource pool
+ * which was created as part of charing operation.

charing -> charging

+ */
+static void uncharge_cg_resource(struct rdma_cgroup *cg,
+				 struct rdmacg_device *device,
+				 int index, int num)
+{
+	struct rdmacg_resource_pool *rpool;
+	struct rdmacg_pool_info *pool_info = &device->pool_info;
+
+	spin_lock(&cg->rpool_list_lock);
+	rpool = find_cg_rpool_locked(cg, device);

Is it possible for rpool to be NULL?

+
+	/*
+	 * A negative count (or overflow) is invalid,
+	 * it indicates a bug in the rdma controller.
+	 */
+	rpool->resources[index].usage -= num;
+
+	WARN_ON_ONCE(rpool->resources[index].usage < 0);
+	rpool->refcnt--;
+	if (rpool->refcnt == 0 && rpool->num_max_cnt == pool_info->table_len) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		list_del(&rpool->cg_list);
+		spin_unlock(&cg->rpool_list_lock);
+
+		free_cg_rpool(rpool);
+		return;
+	}
+	spin_unlock(&cg->rpool_list_lock);
+}

+/**
+ * charge_cg_resource - charge resource for rdma cgroup
+ * @cg: pointer to cg to charge
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cg (resource pool)
+ * @num: the number of rdma resource to charge
+ */
+static int charge_cg_resource(struct rdma_cgroup *cg,
+			      struct rdmacg_device *device,
+			      int index, int num)
+{
+	struct rdmacg_resource_pool *rpool;
+	s64 new;
+	int ret = 0;
+
+retry:
+	spin_lock(&cg->rpool_list_lock);
+	rpool = find_cg_rpool_locked(cg, device);
+	if (!rpool) {
+		spin_unlock(&cg->rpool_list_lock);
+		ret = alloc_cg_rpool(cg, device);
+		if (ret)
+			goto err;
+		else
+			goto retry;

Instead of retrying after allocation of a new rpool, why not just return the
newly allocated rpool (or the existing one) from alloc_cg_rpool?

+	}
+	new = num + rpool->resources[index].usage;
+	if (new > rpool->resources[index].max) {
+		ret = -EAGAIN;
+	} else {
+		rpool->refcnt++;
+		rpool->resources[index].usage = new;
+	}
+	spin_unlock(&cg->rpool_list_lock);
+err:
+	return ret;
+}

+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+				       char *buf, size_t nbytes, loff_t off)
+{
+	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+	const char *dev_name;
+	struct rdmacg_resource_pool *rpool;
+	struct rdmacg_device *device;
+	char *options = strstrip(buf);
+	struct rdmacg_pool_info *pool_info;
+	u64 enables = 0;

This limits the number of resources to 64. Sounds fine to me, but I think 
there should be a check somewhere (maybe in rdmacg_register_device()?) to
make sure someone doesn't pass too many resources.

+	int *new_limits;
+	int i = 0, ret = 0;
+
+	/* extract the device name first */
+	dev_name = strsep(&options, " ");
+	if (!dev_name) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* acquire lock to synchronize with hot plug devices */
+	mutex_lock(&dev_mutex);
+
+	device = rdmacg_get_device_locked(dev_name);
+	if (!device) {
+		ret = -ENODEV;
+		goto parse_err;
+	}
+
+	pool_info = &device->pool_info;
+
+	new_limits = kcalloc(pool_info->table_len, sizeof(int), GFP_KERNEL);
+	if (!new_limits) {
+		ret = -ENOMEM;
+		goto parse_err;
+	}
+
+	ret = rdmacg_parse_limits(options, pool_info, new_limits, &enables);
+	if (ret)
+		goto opt_err;
+
+retry:
+	spin_lock(&cg->rpool_list_lock);
+	rpool = find_cg_rpool_locked(cg, device);
+	if (!rpool) {
+		spin_unlock(&cg->rpool_list_lock);
+		ret = alloc_cg_rpool(cg, device);
+		if (ret)
+			goto opt_err;
+		else
+			goto retry;

You can avoid the retry here too. Perhaps this can go into a function.

+	}
+
+	/* now set the new limits of the rpool */
+	while (enables) {
+		/* if user set the limit, enables bit is set */
+		if (enables & BIT(i)) {
+			enables &= ~BIT(i);
+			set_resource_limit(rpool, i, new_limits[i]);
+		}
+		i++;
+	}
+	if (rpool->refcnt == 0 &&
+	    rpool->num_max_cnt == pool_info->table_len) {
+		/*
+		 * No user of the rpool and all entries are
+		 * set to max, so safe to delete this rpool.
+		 */
+		list_del(&rpool->cg_list);
+		spin_unlock(&cg->rpool_list_lock);
+		free_cg_rpool(rpool);
+	} else {
+		spin_unlock(&cg->rpool_list_lock);
+	}

You should consider putting this piece of code in a function (the
check of the reference counts and release of the rpool).

+
+opt_err:
+	kfree(new_limits);
+parse_err:
+	mutex_unlock(&dev_mutex);
+err:
+	return ret ?: nbytes;
+}
+

+
+static int print_rpool_values(struct seq_file *sf,

This can return void.

+			      struct rdmacg_pool_info *pool_info,
+			      u32 *value_tbl)
+{
+	int i;
+
+	for (i = 0; i < pool_info->table_len; i++) {
+		seq_puts(sf, pool_info->resource_name_table[i]);
+		seq_putc(sf, '=');
+		if (value_tbl[i] == S32_MAX)
+			seq_puts(sf, RDMACG_MAX_STR);
+		else
+			seq_printf(sf, "%d", value_tbl[i]);
+		seq_putc(sf, ' ');
+	}
+	return 0;
+}
+

Thanks,
Haggai

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help