[PATCH v2 net-next 3/5] bpf: Add new cgroup attach type to enable sock modifications

From: David Ahern <hidden>
Date: 2016-10-27 00:58:52
Subsystem: bpf [core], bpf [general] (safe dynamic programs and tools), bpf [networking] (tcx & tc bpf, sock_addr), bpf [storage & cgroups], networking [general], the rest · Maintainers: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds

Allow BPF_PROG_TYPE_CGROUP programs with cgroup.sock subtype to modify
sk_bound_dev_if for newly created AF_INET or AF_INET6 sockets. The program
can be attached to a cgroup using attach type BPF_CGROUP_INET_SOCK. The
cgroup verifier ops are updated to handle the sock offsets as well as the
existing skb accesses.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

v2
- dropped the bpf_sock_store_u32 helper
- dropped the new prog type BPF_PROG_TYPE_CGROUP_SOCK
- moved valid access and context conversion to use subtype
- dropped CREATE from BPF_CGROUP_INET_SOCK and related function names
- moved running of filter from sk_alloc to inet{6}_create

Signed-off-by: David Ahern <redacted>
---
 include/linux/filter.h   |  2 +-
 include/uapi/linux/bpf.h |  5 ++++
 kernel/bpf/cgroup.c      |  9 ++++++
 kernel/bpf/syscall.c     |  2 ++
 net/core/filter.c        | 77 ++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/af_inet.c       |  4 +++
 net/ipv6/af_inet6.c      |  3 ++
 7 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 88470cdd3ee1..ffde714f3a98 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h

@@ -409,7 +409,7 @@ struct bpf_prog {
 	union bpf_prog_subtype	subtype;	/* For fine-grained verifications */
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
-	unsigned int		(*bpf_func)(const struct sk_buff *skb,
+	unsigned int		(*bpf_func)(const void *ctx,
 					    const struct bpf_insn *filter);
 	/* Instructions for interpreter */
 	union {

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 160c24ffdce2..546e84b1792f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h

@@ -104,6 +104,7 @@ enum bpf_prog_type {
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK,
 	__MAX_BPF_ATTACH_TYPE
 };

@@ -532,6 +533,10 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+struct bpf_sock {
+	__u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index d5746aec8f34..796e39aa28f5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c

@@ -117,6 +117,12 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
 	}
 }
 
+static int __cgroup_bpf_run_filter_sock(struct sock *sk,
+					struct bpf_prog *prog)
+{
+	return prog->bpf_func(sk, prog->insnsi) == 1 ? 0 : -EPERM;
+}
+
 static int __cgroup_bpf_run_filter_skb(struct sk_buff *skb,
 				       struct bpf_prog *prog)
 {

@@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
 		case BPF_CGROUP_INET_EGRESS:
 			ret = __cgroup_bpf_run_filter_skb(skb, prog);
 			break;
+		case BPF_CGROUP_INET_SOCK:
+			ret = __cgroup_bpf_run_filter_sock(sk, prog);
+			break;
 		/* make gcc happy else complains about missing enum value */
 		default:
 			return 0;

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fbf81156e49d..bc3be0b19b57 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c

@@ -843,6 +843,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK:
 		prog = bpf_prog_get_type(attr->attach_bpf_fd,
 					 BPF_PROG_TYPE_CGROUP);
 		if (IS_ERR(prog))

@@ -880,6 +881,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);

diff --git a/net/core/filter.c b/net/core/filter.c
index 4207ab2e56ba..7193eb7fe892 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c

@@ -2634,6 +2634,40 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sock, bound_dev_if):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	if (off < 0 || off + size > sizeof(struct bpf_sock))
+		return false;
+
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static bool cgroup_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   enum bpf_reg_type *reg_type,
+				   union bpf_prog_subtype *prog_subtype)
+{
+	if (prog_subtype->cgroup.sock)
+		return sock_filter_is_valid_access(off, size, type);
+
+	return sk_filter_is_valid_access(off, size, type, reg_type,
+					 prog_subtype);
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {

@@ -2894,6 +2928,45 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+					  int dst_reg, int src_reg,
+					  int ctx_off,
+					  struct bpf_insn *insn_buf,
+					  struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_sock, bound_dev_if):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+					offsetof(struct sock, sk_bound_dev_if));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, sk_bound_dev_if));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static u32 cgroup_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+				     int src_reg, int ctx_off,
+				     struct bpf_insn *insn_buf,
+				     struct bpf_prog *prog)
+{
+	union bpf_prog_subtype *prog_subtype = &prog->subtype;
+
+	if (prog_subtype->cgroup.sock)
+		return sock_filter_convert_ctx_access(type, dst_reg, src_reg,
+						      ctx_off, insn_buf, prog);
+
+	return sk_filter_convert_ctx_access(type, dst_reg, src_reg, ctx_off,
+					    insn_buf, prog);
+}
+
 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					 int src_reg, int ctx_off,
 					 struct bpf_insn *insn_buf,

@@ -2963,8 +3036,8 @@ static const struct bpf_verifier_ops xdp_ops = {
 
 static const struct bpf_verifier_ops cgroup_ops = {
 	.get_func_proto		= cgroup_func_proto,
-	.is_valid_access	= sk_filter_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.is_valid_access	= cgroup_is_valid_access,
+	.convert_ctx_access	= cgroup_convert_ctx_access,
 };
 
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1effc986739e..c0934f7483cb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c

@@ -377,6 +377,10 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 		if (err)
 			sk_common_release(sk);
 	}
+
+	if (!kern)
+		cgroup_bpf_run_filter(sk, NULL, BPF_CGROUP_INET_SOCK);
+
 out:
 	return err;
 out_rcu_unlock:

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 46ad699937fd..c499ae3c472e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c

@@ -257,6 +257,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 			goto out;
 		}
 	}
+
+	if (!kern)
+		cgroup_bpf_run_filter(sk, NULL, BPF_CGROUP_INET_SOCK);
 out:
 	return err;
 out_rcu_unlock:

-- 
2.1.4

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help

Lifecycle