All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
From: Haggai Eran <haggaie@mellanox.com>
To: Parav Pandit <pandit.parav@gmail.com>, <cgroups@vger.kernel.org>,
	<linux-doc@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, <tj@kernel.org>,
	<lizefan@huawei.com>, <hannes@cmpxchg.org>, <dledford@redhat.com>
Cc: <corbet@lwn.net>, <james.l.morris@oracle.com>, <serge@hallyn.com>,
	<ogerlitz@mellanox.com>, <matanb@mellanox.com>,
	<raindel@mellanox.com>, <akpm@linux-foundation.org>,
	<linux-security-module@vger.kernel.org>
Subject: Re: [PATCH 5/7] devcg: device cgroup's extension for RDMA resource.
Date: Tue, 8 Sep 2015 11:22:56 +0300	[thread overview]
Message-ID: <55EE9AE0.5030508@mellanox.com> (raw)
In-Reply-To: <1441658303-18081-6-git-send-email-pandit.parav@gmail.com>

On 07/09/2015 23:38, Parav Pandit wrote:
> +/* RDMA resources from device cgroup perspective */
> +enum devcgroup_rdma_rt {
> +	DEVCG_RDMA_RES_TYPE_UCTX,
> +	DEVCG_RDMA_RES_TYPE_CQ,
> +	DEVCG_RDMA_RES_TYPE_PD,
> +	DEVCG_RDMA_RES_TYPE_AH,
> +	DEVCG_RDMA_RES_TYPE_MR,
> +	DEVCG_RDMA_RES_TYPE_MW,
I didn't see memory windows in dev_cgroup_files in patch 3. Is it used?
> +	DEVCG_RDMA_RES_TYPE_SRQ,
> +	DEVCG_RDMA_RES_TYPE_QP,
> +	DEVCG_RDMA_RES_TYPE_FLOW,
> +	DEVCG_RDMA_RES_TYPE_MAX,
> +};

> +struct devcgroup_rdma_tracker {
> +	int limit;
> +	atomic_t usage;
> +	int failcnt;
> +};
Have you considered using struct res_counter?

> + * RDMA resource limits are hierarchical, so the highest configured limit of
> + * the hierarchy is enforced. Allowing resource limit configuration to default
> + * cgroup allows fair share to kernel space ULPs as well.
In what way is the highest configured limit of the hierarchy enforced? I
would expect all the limits along the hierarchy to be enforced.

> +int devcgroup_rdma_get_max_resource(struct seq_file *sf, void *v)
> +{
> +	struct dev_cgroup *dev_cg = css_to_devcgroup(seq_css(sf));
> +	int type = seq_cft(sf)->private;
> +	u32 usage;
> +
> +	if (dev_cg->rdma.tracker[type].limit ==	DEVCG_RDMA_MAX_RESOURCES) {
> +		seq_printf(sf, "%s\n", DEVCG_RDMA_MAX_RESOURCE_STR);
> +	} else {
> +		usage = dev_cg->rdma.tracker[type].limit;
If this is the resource limit, don't name it 'usage'.

> +		seq_printf(sf, "%u\n", usage);
> +	}
> +	return 0;
> +}

> +int devcgroup_rdma_get_max_resource(struct seq_file *sf, void *v)
> +{
> +	struct dev_cgroup *dev_cg = css_to_devcgroup(seq_css(sf));
> +	int type = seq_cft(sf)->private;
> +	u32 usage;
> +
> +	if (dev_cg->rdma.tracker[type].limit ==	DEVCG_RDMA_MAX_RESOURCES) {
> +		seq_printf(sf, "%s\n", DEVCG_RDMA_MAX_RESOURCE_STR);
I'm not sure hiding the actual number is good, especially in the
show_usage case.

> +	} else {
> +		usage = dev_cg->rdma.tracker[type].limit;
> +		seq_printf(sf, "%u\n", usage);
> +	}
> +	return 0;
> +}

> +void devcgroup_rdma_uncharge_resource(struct ib_ucontext *ucontext,
> +				      enum devcgroup_rdma_rt type, int num)
> +{
> +	struct dev_cgroup *dev_cg, *p;
> +	struct task_struct *ctx_task;
> +
> +	if (!num)
> +		return;
> +
> +	/* get cgroup of ib_ucontext it belong to, to uncharge
> +	 * so that when its called from any worker tasks or any
> +	 * other tasks to which this resource doesn't belong to,
> +	 * it can be uncharged correctly.
> +	 */
> +	if (ucontext)
> +		ctx_task = get_pid_task(ucontext->tgid, PIDTYPE_PID);
> +	else
> +		ctx_task = current;
> +	dev_cg = task_devcgroup(ctx_task);
> +
> +	spin_lock(&ctx_task->rdma_res_counter->lock);
Don't you need an RCU read lock and rcu_dereference() to access
rdma_res_counter?

> +	ctx_task->rdma_res_counter->usage[type] -= num;
> +
> +	for (p = dev_cg; p; p = parent_devcgroup(p))
> +		uncharge_resource(p, type, num);
> +
> +	spin_unlock(&ctx_task->rdma_res_counter->lock);
> +
> +	if (type == DEVCG_RDMA_RES_TYPE_UCTX)
> +		rdma_free_res_counter(ctx_task);
> +}
> +EXPORT_SYMBOL(devcgroup_rdma_uncharge_resource);

> +int devcgroup_rdma_try_charge_resource(enum devcgroup_rdma_rt type, int num)
> +{
> +	struct dev_cgroup *dev_cg = task_devcgroup(current);
> +	struct task_rdma_res_counter *res_cnt = current->rdma_res_counter;
> +	int status;
> +
> +	if (!res_cnt) {
> +		res_cnt = kzalloc(sizeof(*res_cnt), GFP_KERNEL);
> +		if (!res_cnt)
> +			return -ENOMEM;
> +
> +		spin_lock_init(&res_cnt->lock);
> +		rcu_assign_pointer(current->rdma_res_counter, res_cnt);
Don't you need the task lock to update rdma_res_counter here?

> +	}
> +
> +	/* synchronize with migration task by taking lock, to avoid
> +	 * race condition of performing cgroup resource migration
> +	 * in non atomic way with this task, which can leads to leaked
> +	 * resources in older cgroup.
> +	 */
> +	spin_lock(&res_cnt->lock);
> +	status = try_charge_resource(dev_cg, type, num);
> +	if (status)
> +		goto busy;
> +
> +	/* single task updating its rdma resource usage, so atomic is
> +	 * not required.
> +	 */
> +	current->rdma_res_counter->usage[type] += num;
> +
> +busy:
> +	spin_unlock(&res_cnt->lock);
> +	return status;
> +}
> +EXPORT_SYMBOL(devcgroup_rdma_try_charge_resource);

Regards,
Haggai

WARNING: multiple messages have this Message-ID (diff)
From: Haggai Eran <haggaie@mellanox.com>
To: Parav Pandit <pandit.parav@gmail.com>,
	cgroups@vger.kernel.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	tj@kernel.org, lizefan@huawei.com, hannes@cmpxchg.org,
	dledford@redhat.com
Cc: corbet@lwn.net, james.l.morris@oracle.com, serge@hallyn.com,
	ogerlitz@mellanox.com, matanb@mellanox.com, raindel@mellanox.com,
	akpm@linux-foundation.org, linux-security-module@vger.kernel.org
Subject: Re: [PATCH 5/7] devcg: device cgroup's extension for RDMA resource.
Date: Tue, 8 Sep 2015 11:22:56 +0300	[thread overview]
Message-ID: <55EE9AE0.5030508@mellanox.com> (raw)
In-Reply-To: <1441658303-18081-6-git-send-email-pandit.parav@gmail.com>

On 07/09/2015 23:38, Parav Pandit wrote:
> +/* RDMA resources from device cgroup perspective */
> +enum devcgroup_rdma_rt {
> +	DEVCG_RDMA_RES_TYPE_UCTX,
> +	DEVCG_RDMA_RES_TYPE_CQ,
> +	DEVCG_RDMA_RES_TYPE_PD,
> +	DEVCG_RDMA_RES_TYPE_AH,
> +	DEVCG_RDMA_RES_TYPE_MR,
> +	DEVCG_RDMA_RES_TYPE_MW,
I didn't see memory windows in dev_cgroup_files in patch 3. Is it used?
> +	DEVCG_RDMA_RES_TYPE_SRQ,
> +	DEVCG_RDMA_RES_TYPE_QP,
> +	DEVCG_RDMA_RES_TYPE_FLOW,
> +	DEVCG_RDMA_RES_TYPE_MAX,
> +};

> +struct devcgroup_rdma_tracker {
> +	int limit;
> +	atomic_t usage;
> +	int failcnt;
> +};
Have you considered using struct res_counter?

> + * RDMA resource limits are hierarchical, so the highest configured limit of
> + * the hierarchy is enforced. Allowing resource limit configuration to default
> + * cgroup allows fair share to kernel space ULPs as well.
In what way is the highest configured limit of the hierarchy enforced? I
would expect all the limits along the hierarchy to be enforced.

> +int devcgroup_rdma_get_max_resource(struct seq_file *sf, void *v)
> +{
> +	struct dev_cgroup *dev_cg = css_to_devcgroup(seq_css(sf));
> +	int type = seq_cft(sf)->private;
> +	u32 usage;
> +
> +	if (dev_cg->rdma.tracker[type].limit ==	DEVCG_RDMA_MAX_RESOURCES) {
> +		seq_printf(sf, "%s\n", DEVCG_RDMA_MAX_RESOURCE_STR);
> +	} else {
> +		usage = dev_cg->rdma.tracker[type].limit;
If this is the resource limit, don't name it 'usage'.

> +		seq_printf(sf, "%u\n", usage);
> +	}
> +	return 0;
> +}

> +int devcgroup_rdma_get_max_resource(struct seq_file *sf, void *v)
> +{
> +	struct dev_cgroup *dev_cg = css_to_devcgroup(seq_css(sf));
> +	int type = seq_cft(sf)->private;
> +	u32 usage;
> +
> +	if (dev_cg->rdma.tracker[type].limit ==	DEVCG_RDMA_MAX_RESOURCES) {
> +		seq_printf(sf, "%s\n", DEVCG_RDMA_MAX_RESOURCE_STR);
I'm not sure hiding the actual number is good, especially in the
show_usage case.

> +	} else {
> +		usage = dev_cg->rdma.tracker[type].limit;
> +		seq_printf(sf, "%u\n", usage);
> +	}
> +	return 0;
> +}

> +void devcgroup_rdma_uncharge_resource(struct ib_ucontext *ucontext,
> +				      enum devcgroup_rdma_rt type, int num)
> +{
> +	struct dev_cgroup *dev_cg, *p;
> +	struct task_struct *ctx_task;
> +
> +	if (!num)
> +		return;
> +
> +	/* get cgroup of ib_ucontext it belong to, to uncharge
> +	 * so that when its called from any worker tasks or any
> +	 * other tasks to which this resource doesn't belong to,
> +	 * it can be uncharged correctly.
> +	 */
> +	if (ucontext)
> +		ctx_task = get_pid_task(ucontext->tgid, PIDTYPE_PID);
> +	else
> +		ctx_task = current;
> +	dev_cg = task_devcgroup(ctx_task);
> +
> +	spin_lock(&ctx_task->rdma_res_counter->lock);
Don't you need an RCU read lock and rcu_dereference() to access
rdma_res_counter?

> +	ctx_task->rdma_res_counter->usage[type] -= num;
> +
> +	for (p = dev_cg; p; p = parent_devcgroup(p))
> +		uncharge_resource(p, type, num);
> +
> +	spin_unlock(&ctx_task->rdma_res_counter->lock);
> +
> +	if (type == DEVCG_RDMA_RES_TYPE_UCTX)
> +		rdma_free_res_counter(ctx_task);
> +}
> +EXPORT_SYMBOL(devcgroup_rdma_uncharge_resource);

> +int devcgroup_rdma_try_charge_resource(enum devcgroup_rdma_rt type, int num)
> +{
> +	struct dev_cgroup *dev_cg = task_devcgroup(current);
> +	struct task_rdma_res_counter *res_cnt = current->rdma_res_counter;
> +	int status;
> +
> +	if (!res_cnt) {
> +		res_cnt = kzalloc(sizeof(*res_cnt), GFP_KERNEL);
> +		if (!res_cnt)
> +			return -ENOMEM;
> +
> +		spin_lock_init(&res_cnt->lock);
> +		rcu_assign_pointer(current->rdma_res_counter, res_cnt);
Don't you need the task lock to update rdma_res_counter here?

> +	}
> +
> +	/* synchronize with migration task by taking lock, to avoid
> +	 * race condition of performing cgroup resource migration
> +	 * in non atomic way with this task, which can leads to leaked
> +	 * resources in older cgroup.
> +	 */
> +	spin_lock(&res_cnt->lock);
> +	status = try_charge_resource(dev_cg, type, num);
> +	if (status)
> +		goto busy;
> +
> +	/* single task updating its rdma resource usage, so atomic is
> +	 * not required.
> +	 */
> +	current->rdma_res_counter->usage[type] += num;
> +
> +busy:
> +	spin_unlock(&res_cnt->lock);
> +	return status;
> +}
> +EXPORT_SYMBOL(devcgroup_rdma_try_charge_resource);

Regards,
Haggai

  reply	other threads:[~2015-09-08  8:22 UTC|newest]

Thread overview: 95+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-09-07 20:38 [PATCH 0/7] devcg: device cgroup extension for rdma resource Parav Pandit
2015-09-07 20:38 ` Parav Pandit
2015-09-07 20:38 ` [PATCH 1/7] devcg: Added user option to rdma resource tracking Parav Pandit
2015-09-07 20:38   ` Parav Pandit
2015-09-07 20:38 ` [PATCH 2/7] devcg: Added rdma resource tracking module Parav Pandit
2015-09-07 20:38   ` Parav Pandit
2015-09-07 20:38 ` [PATCH 3/7] devcg: Added infrastructure for rdma device cgroup Parav Pandit
2015-09-08  5:31   ` Haggai Eran
2015-09-08  5:31     ` Haggai Eran
2015-09-08  7:02     ` Parav Pandit
2015-09-08  7:02       ` Parav Pandit
2015-09-07 20:38 ` [PATCH 4/7] devcg: Added rdma resource tracker object per task Parav Pandit
2015-09-08  5:48   ` Haggai Eran
2015-09-08  5:48     ` Haggai Eran
2015-09-08  7:04     ` Parav Pandit
2015-09-08  8:24       ` Haggai Eran
2015-09-08  8:24         ` Haggai Eran
2015-09-08  8:26         ` Parav Pandit
2015-09-07 20:38 ` [PATCH 5/7] devcg: device cgroup's extension for RDMA resource Parav Pandit
2015-09-07 20:38   ` Parav Pandit
2015-09-08  8:22   ` Haggai Eran [this message]
2015-09-08  8:22     ` Haggai Eran
2015-09-08 10:18     ` Parav Pandit
2015-09-08 13:50       ` Haggai Eran
2015-09-08 13:50         ` Haggai Eran
2015-09-08 14:13         ` Parav Pandit
2015-09-08  8:36   ` Haggai Eran
2015-09-08  8:36     ` Haggai Eran
2015-09-08 10:50     ` Parav Pandit
2015-09-08 10:50       ` Parav Pandit
2015-09-08 14:10       ` Haggai Eran
2015-09-08 14:10         ` Haggai Eran
2015-09-07 20:38 ` [PATCH 6/7] devcg: Added support to use RDMA device cgroup Parav Pandit
2015-09-08  8:40   ` Haggai Eran
2015-09-08  8:40     ` Haggai Eran
2015-09-08 10:22     ` Parav Pandit
2015-09-08 13:40       ` Haggai Eran
2015-09-08 13:40         ` Haggai Eran
2015-09-07 20:38 ` [PATCH 7/7] devcg: Added Documentation of " Parav Pandit
2015-09-07 20:38   ` Parav Pandit
2015-09-07 20:55 ` [PATCH 0/7] devcg: device cgroup extension for rdma resource Parav Pandit
2015-09-08 12:45 ` Haggai Eran
2015-09-08 12:45   ` Haggai Eran
2015-09-08 15:23 ` Tejun Heo
2015-09-08 15:23   ` Tejun Heo
2015-09-09  3:57   ` Parav Pandit
2015-09-10 16:49     ` Tejun Heo
2015-09-10 17:46       ` Parav Pandit
2015-09-10 17:46         ` Parav Pandit
2015-09-10 20:22         ` Tejun Heo
2015-09-11  3:39           ` Parav Pandit
2015-09-11  4:04             ` Tejun Heo
2015-09-11  4:04               ` Tejun Heo
2015-09-11  4:24               ` Doug Ledford
2015-09-11  4:24                 ` Doug Ledford
2015-09-11 14:52                 ` Tejun Heo
2015-09-11 14:52                   ` Tejun Heo
2015-09-11 16:26                   ` Parav Pandit
2015-09-11 16:34                     ` Tejun Heo
2015-09-11 16:34                       ` Tejun Heo
2015-09-11 16:39                       ` Parav Pandit
2015-09-11 16:39                         ` Parav Pandit
2015-09-11 19:25                         ` Tejun Heo
2015-09-14 10:18                           ` Parav Pandit
2015-09-14 10:18                             ` Parav Pandit
2015-09-11 16:47                   ` Parav Pandit
2015-09-11 16:47                     ` Parav Pandit
2015-09-11 19:05                     ` Tejun Heo
2015-09-11 19:05                       ` Tejun Heo
2015-09-11 19:22                   ` Hefty, Sean
2015-09-11 19:43                     ` Jason Gunthorpe
2015-09-11 19:43                       ` Jason Gunthorpe
2015-09-11 20:06                       ` Hefty, Sean
2015-09-14 11:09                         ` Parav Pandit
2015-09-14 14:04                           ` Parav Pandit
2015-09-14 15:21                             ` Tejun Heo
2015-09-14 15:21                               ` Tejun Heo
2015-09-14 17:28                           ` Jason Gunthorpe
2015-09-14 17:28                             ` Jason Gunthorpe
2015-09-14 18:54                             ` Parav Pandit
2015-09-14 18:54                               ` Parav Pandit
2015-09-14 20:18                               ` Jason Gunthorpe
2015-09-15  3:08                                 ` Parav Pandit
2015-09-15  3:45                                   ` Jason Gunthorpe
2015-09-15  3:45                                     ` Jason Gunthorpe
2015-09-16  4:41                                     ` Parav Pandit
2015-09-16  4:41                                       ` Parav Pandit
2015-09-20 10:35                                     ` Haggai Eran
2015-09-20 10:35                                       ` Haggai Eran
2015-10-28  8:14                                       ` Parav Pandit
2015-10-28  8:14                                         ` Parav Pandit
2015-09-14 10:15                     ` Parav Pandit
2015-09-11  4:43               ` Parav Pandit
2015-09-11 15:03                 ` Tejun Heo
2015-09-10 17:48       ` Hefty, Sean

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=55EE9AE0.5030508@mellanox.com \
    --to=haggaie@mellanox.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=corbet@lwn.net \
    --cc=dledford@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=james.l.morris@oracle.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=lizefan@huawei.com \
    --cc=matanb@mellanox.com \
    --cc=ogerlitz@mellanox.com \
    --cc=pandit.parav@gmail.com \
    --cc=raindel@mellanox.com \
    --cc=serge@hallyn.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.