From db5d0b597bc27bbddf40f2f8359a73be4eb77104 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 10 Feb 2017 13:45:05 -0500 Subject: [PATCH 01/27] ibmvnic: Initialize completion variables before starting work Initialize condition variables prior to invoking any work that can mark them complete. This resolves a race in the ibmvnic driver where the driver faults trying to complete an uninitialized condition variable. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c12596676bbb..c7150343342d 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -189,9 +189,10 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter, } ltb->map_id = adapter->map_id; adapter->map_id++; + + init_completion(&adapter->fw_done); send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); - init_completion(&adapter->fw_done); wait_for_completion(&adapter->fw_done); return 0; } @@ -1121,10 +1122,10 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev, crq.request_statistics.ioba = cpu_to_be32(adapter->stats_token); crq.request_statistics.len = cpu_to_be32(sizeof(struct ibmvnic_statistics)); - ibmvnic_send_crq(adapter, &crq); /* Wait for data to be written */ init_completion(&adapter->stats_done); + ibmvnic_send_crq(adapter, &crq); wait_for_completion(&adapter->stats_done); for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) @@ -2799,9 +2800,9 @@ static ssize_t trace_read(struct file *file, char __user *user_buf, size_t len, crq.collect_fw_trace.correlator = adapter->ras_comps[num].correlator; crq.collect_fw_trace.ioba = cpu_to_be32(trace_tok); crq.collect_fw_trace.len = adapter->ras_comps[num].trace_buff_size; - ibmvnic_send_crq(adapter, &crq); init_completion(&adapter->fw_done); + ibmvnic_send_crq(adapter, &crq); 
wait_for_completion(&adapter->fw_done); if (*ppos + len > be32_to_cpu(adapter->ras_comps[num].trace_buff_size)) @@ -3581,9 +3582,9 @@ static int ibmvnic_dump_show(struct seq_file *seq, void *v) memset(&crq, 0, sizeof(crq)); crq.request_dump_size.first = IBMVNIC_CRQ_CMD; crq.request_dump_size.cmd = REQUEST_DUMP_SIZE; - ibmvnic_send_crq(adapter, &crq); init_completion(&adapter->fw_done); + ibmvnic_send_crq(adapter, &crq); wait_for_completion(&adapter->fw_done); seq_write(seq, adapter->dump_data, adapter->dump_data_size); @@ -3629,8 +3630,8 @@ static void handle_crq_init_rsp(struct work_struct *work) } } - send_version_xchg(adapter); reinit_completion(&adapter->init_done); + send_version_xchg(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { dev_err(dev, "Passive init timeout\n"); goto task_failed; @@ -3640,9 +3641,9 @@ static void handle_crq_init_rsp(struct work_struct *work) if (adapter->renegotiate) { adapter->renegotiate = false; release_sub_crqs_no_irqs(adapter); - send_cap_queries(adapter); reinit_completion(&adapter->init_done); + send_cap_queries(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { dev_err(dev, "Passive init timeout\n"); @@ -3772,9 +3773,9 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) adapter->debugfs_dump = ent; } } - ibmvnic_send_crq_init(adapter); init_completion(&adapter->init_done); + ibmvnic_send_crq_init(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) return 0; @@ -3782,9 +3783,9 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) if (adapter->renegotiate) { adapter->renegotiate = false; release_sub_crqs_no_irqs(adapter); - send_cap_queries(adapter); reinit_completion(&adapter->init_done); + send_cap_queries(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) return 0; From e722af6391949e8851310441bb0cec157d25611d Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 10 Feb 
2017 13:29:06 -0500 Subject: [PATCH 02/27] ibmvnic: Call napi_disable instead of napi_enable in failure path The failure path in ibmvnic_open() mistakenly makes a second call to napi_enable instead of calling napi_disable. This can result in a BUG_ON for any queues that were enabled in the previous call to napi_enable. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c7150343342d..752b0822b020 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -506,7 +506,7 @@ rx_pool_alloc_failed: adapter->rx_pool = NULL; rx_pool_arr_alloc_failed: for (i = 0; i < adapter->req_rx_queues; i++) - napi_enable(&adapter->napi[i]); + napi_disable(&adapter->napi[i]); alloc_napi_failed: return -ENOMEM; } From 7f677633379b4abb3281cdbe7e7006f049305c03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 10 Feb 2017 20:28:24 -0800 Subject: [PATCH 03/27] bpf: introduce BPF_F_ALLOW_OVERRIDE flag If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command to the given cgroup the descendent cgroup will be able to override effective bpf program that was inherited from this cgroup. By default it's not passed, therefore override is disallowed. Examples: 1. prog X attached to /A with default prog Y fails to attach to /A/B and /A/B/C Everything under /A runs prog X 2. prog X attached to /A with allow_override. prog Y fails to attach to /A/B with default (non-override) prog M attached to /A/B with allow_override. Everything under /A/B runs prog M only. 3. prog X attached to /A with allow_override. prog Y fails to attach to /A with default. The user has to detach first to switch the mode. In the future this behavior may be extended with a chain of non-overridable programs. 
Also fix the bug where detach from cgroup where nothing is attached was not throwing error. Return ENOENT in such case. Add several testcases and adjust libbpf. Fixes: 3007098494be ("cgroup: add support for eBPF programs") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Tejun Heo Acked-by: Daniel Mack Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 13 +++--- include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/cgroup.c | 59 +++++++++++++++++++++------ kernel/bpf/syscall.c | 20 +++++++--- kernel/cgroup.c | 9 +++-- samples/bpf/test_cgrp2_attach.c | 2 +- samples/bpf/test_cgrp2_attach2.c | 68 ++++++++++++++++++++++++++++++-- samples/bpf/test_cgrp2_sock.c | 2 +- samples/bpf/test_cgrp2_sock2.c | 2 +- tools/lib/bpf/bpf.c | 4 +- tools/lib/bpf/bpf.h | 3 +- 11 files changed, 151 insertions(+), 38 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 92bc89ae7e20..c970a25d2a49 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -21,20 +21,19 @@ struct cgroup_bpf { */ struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; + bool disallow_override[MAX_BPF_ATTACH_TYPE]; }; void cgroup_bpf_put(struct cgroup *cgrp); void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type); +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool overridable); /* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type); +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/bpf.h 
b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..d2b0ac799d03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,12 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command + * to the given target_fd cgroup the descendent cgroup will be able to + * override effective bpf program that was inherited from this cgroup + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -171,6 +177,7 @@ union bpf_attr { __u32 target_fd; /* container object to attach to */ __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; + __u32 attach_flags; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a515f7b007c6..da0f53690295 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -52,6 +52,7 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) e = rcu_dereference_protected(parent->bpf.effective[type], lockdep_is_held(&cgroup_mutex)); rcu_assign_pointer(cgrp->bpf.effective[type], e); + cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; } } @@ -82,30 +83,63 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) * * Must be called with cgroup_mutex held. 
*/ -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type) +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool new_overridable) { - struct bpf_prog *old_prog, *effective; + struct bpf_prog *old_prog, *effective = NULL; struct cgroup_subsys_state *pos; + bool overridable = true; - old_prog = xchg(cgrp->bpf.prog + type, prog); + if (parent) { + overridable = !parent->bpf.disallow_override[type]; + effective = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + } - effective = (!prog && parent) ? - rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)) : - prog; + if (prog && effective && !overridable) + /* if parent has non-overridable prog attached, disallow + * attaching new programs to descendent cgroup + */ + return -EPERM; + + if (prog && effective && overridable != new_overridable) + /* if parent has overridable prog attached, only + * allow overridable programs in descendent cgroup + */ + return -EPERM; + + old_prog = cgrp->bpf.prog[type]; + + if (prog) { + overridable = new_overridable; + effective = prog; + if (old_prog && + cgrp->bpf.disallow_override[type] == new_overridable) + /* disallow attaching non-overridable on top + * of existing overridable in this cgroup + * and vice versa + */ + return -EPERM; + } + + if (!prog && !old_prog) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + + cgrp->bpf.prog[type] = prog; css_for_each_descendant_pre(pos, &cgrp->self) { struct cgroup *desc = container_of(pos, struct cgroup, self); /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) + if (desc->bpf.prog[type] && desc != cgrp) { pos = css_rightmost_descendant(pos); - else + } else { rcu_assign_pointer(desc->bpf.effective[type], effective); + 
desc->bpf.disallow_override[type] = !overridable; + } } if (prog) @@ -115,6 +149,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + return 0; } /** diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 19b6129eab23..bbb016adbaeb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -920,13 +920,14 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_type +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int bpf_prog_attach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; struct bpf_prog *prog; struct cgroup *cgrp; - enum bpf_prog_type ptype; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -934,6 +935,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; + if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + return -EINVAL; + switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: @@ -956,10 +960,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - cgroup_bpf_update(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, + attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + if (ret) + bpf_prog_put(prog); cgroup_put(cgrp); - return 0; + return ret; } #define BPF_PROG_DETACH_LAST_FIELD attach_type @@ -967,6 +974,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { struct cgroup *cgrp; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -982,7 +990,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - cgroup_bpf_update(cgrp, NULL, attr->attach_type); + ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; @@ -990,7 +998,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return -EINVAL; } 
- return 0; + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 688dd02af985..53bbca7c4859 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6498,15 +6498,16 @@ static __init int cgroup_namespaces_init(void) subsys_initcall(cgroup_namespaces_init); #ifdef CONFIG_CGROUP_BPF -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type) +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable) { struct cgroup *parent = cgroup_parent(cgrp); + int ret; mutex_lock(&cgroup_mutex); - __cgroup_bpf_update(cgrp, parent, prog, type); + ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); mutex_unlock(&cgroup_mutex); + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 504058631ffc..4bfcaf93fcf3 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -104,7 +104,7 @@ static int attach_filter(int cg_fd, int type, int verdict) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, type); + ret = bpf_prog_attach(prog_fd, cg_fd, type, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 6e69be37f87f..3049b1f26267 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -79,11 +79,12 @@ int main(int argc, char **argv) if (join_cgroup(FOO)) goto err; - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo"); goto err; } + printf("Attached DROP prog. 
This ping in cgroup /foo should fail...\n"); assert(system(PING_CMD) != 0); /* Create cgroup /foo/bar, get fd, and join it */ @@ -94,24 +95,27 @@ int main(int argc, char **argv) if (join_cgroup(BAR)) goto err; + printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } + printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { log_err("Detaching program from /foo/bar"); goto err; } + printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" + "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -121,8 +125,60 @@ int main(int argc, char **argv) goto err; } + printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" + "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching prog to /foo/bar"); + goto err; + } + + if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching program from /foo/bar"); + goto err; + } + + if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { + errno = 0; + log_err("Unexpected success in double detach from /foo"); + goto err; + } + + if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching non-overridable prog to /foo"); + goto err; + } + + if 
(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching non-overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo"); + goto err; + } + + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching different non-overridable prog to /foo"); + goto err; + } + goto out; err: @@ -132,5 +188,9 @@ out: close(foo); close(bar); cleanup_cgroup_environment(); + if (!rc) + printf("PASS\n"); + else + printf("FAIL\n"); return rc; } diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index 0791b949cbe4..c3cfb23e23b5 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -75,7 +75,7 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE); + ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c index 455ef0d06e93..db036077b644 100644 --- a/samples/bpf/test_cgrp2_sock2.c +++ b/samples/bpf/test_cgrp2_sock2.c @@ -55,7 +55,7 @@ int main(int argc, char **argv) } ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, - BPF_CGROUP_INET_SOCK_CREATE); + BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 3ddb58a36d3c..ae752fa4eaa7 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -168,7 +168,8 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, 
sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; @@ -176,6 +177,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; + attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index a2f9853dd882..4ac6c4b84100 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -41,7 +41,8 @@ int bpf_map_delete_elem(int fd, void *key); int bpf_map_get_next_key(int fd, void *key, void *next_key); int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); -int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type); +int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, + unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); From 8b74d439e1697110c5e5c600643e823eb1dd0762 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 12 Feb 2017 14:03:52 -0800 Subject: [PATCH 04/27] net/llc: avoid BUG_ON() in skb_orphan() It seems nobody used LLC since linux-3.12. Fortunately fuzzers like syzkaller still know how to run this code, otherwise it would be no fun. Setting skb->sk without skb->destructor leads to all kinds of bugs, we now prefer to be very strict about it. Ideally here we would use skb_set_owner() but this helper does not exist yet, only CAN seems to have a private helper for that. Fixes: 376c7311bdb6 ("net: add a temporary sanity check in skb_orphan()") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Signed-off-by: David S. 
Miller --- net/llc/llc_conn.c | 3 +++ net/llc/llc_sap.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 3e821daf9dd4..8bc5a1bd2d45 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) * another trick required to cope with how the PROCOM state * machine works. -acme */ + skb_orphan(skb); + sock_hold(sk); skb->sk = sk; + skb->destructor = sock_efree; } if (!sock_owned_by_user(sk)) llc_conn_rcv(sk, skb); diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index d0e1e804ebd7..5404d0d195cc 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; + skb_orphan(skb); + sock_hold(sk); skb->sk = sk; + skb->destructor = sock_efree; llc_sap_state_process(sap, skb); } From 0c59d28121b96d826c188280f367e754b5d83350 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 13 Feb 2017 14:15:44 -0300 Subject: [PATCH 05/27] MAINTAINERS: Remove old e-mail address The ghostprotocols.net domain is not working, remove it from CREDITS and MAINTAINERS, and change the status to "Odd fixes", and since I haven't been maintaining those, remove my address from there. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- CREDITS | 5 ++--- MAINTAINERS | 15 ++++++--------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/CREDITS b/CREDITS index c58560701d13..c5626bf06264 100644 --- a/CREDITS +++ b/CREDITS @@ -2478,12 +2478,11 @@ S: D-90453 Nuernberg S: Germany N: Arnaldo Carvalho de Melo -E: acme@ghostprotocols.net +E: acme@kernel.org E: arnaldo.melo@gmail.com E: acme@redhat.com -W: http://oops.ghostprotocols.net:81/blog/ P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD 841A B6AB 4681 9224 DF01 -D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks +D: tools/, IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks S: Brazil N: Karsten Merker diff --git a/MAINTAINERS b/MAINTAINERS index 107c10e8f2d2..527d13759ecc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -877,8 +877,8 @@ S: Odd fixes F: drivers/hwmon/applesmc.c APPLETALK NETWORK LAYER -M: Arnaldo Carvalho de Melo -S: Maintained +L: netdev@vger.kernel.org +S: Odd fixes F: drivers/net/appletalk/ F: net/appletalk/ @@ -6727,9 +6727,8 @@ S: Odd Fixes F: drivers/tty/ipwireless/ IPX NETWORK LAYER -M: Arnaldo Carvalho de Melo L: netdev@vger.kernel.org -S: Maintained +S: Odd fixes F: include/net/ipx.h F: include/uapi/linux/ipx.h F: net/ipx/ @@ -7501,8 +7500,8 @@ S: Maintained F: drivers/misc/lkdtm* LLC (802.2) -M: Arnaldo Carvalho de Melo -S: Maintained +L: netdev@vger.kernel.org +S: Odd fixes F: include/linux/llc.h F: include/uapi/linux/llc.h F: include/net/llc* @@ -13373,10 +13372,8 @@ S: Maintained F: drivers/input/misc/wistron_btns.c WL3501 WIRELESS PCMCIA CARD DRIVER -M: Arnaldo Carvalho de Melo L: linux-wireless@vger.kernel.org -W: http://oops.ghostprotocols.net:81/blog -S: Maintained +S: Odd fixes F: drivers/net/wireless/wl3501* WOLFSON MICROELECTRONICS DRIVERS From ebf692f85ff78092cd238166d8d7ec51419f9c02 Mon Sep 17 00:00:00 2001 From: Mart van Santen Date: Fri, 10 Feb 2017 12:02:18 +0000 Subject: [PATCH 06/27] xen-netback: vif counters from int/long to u64 This patch fixes an issue where the type of counters in the queue(s) and 
interface are not in sync (queue counters are int, interface counters are long), causing incorrect reporting of tx/rx values of the vif interface and unclear counter overflows. This patch sets both counters to the u64 type. Signed-off-by: Mart van Santen Reviewed-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 8 ++++---- drivers/net/xen-netback/interface.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 3ce1f7da8647..530586be05b4 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -113,10 +113,10 @@ struct xenvif_stats { * A subset of struct net_device_stats that contains only the * fields that are updated in netback.c for each queue. */ - unsigned int rx_bytes; - unsigned int rx_packets; - unsigned int tx_bytes; - unsigned int tx_packets; + u64 rx_bytes; + u64 rx_packets; + u64 tx_bytes; + u64 tx_packets; /* Additional stats used by xenvif */ unsigned long rx_gso_checksum_fixup; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 579521327b03..50fa1692d985 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -221,10 +221,10 @@ static struct net_device_stats *xenvif_get_stats(struct net_device *dev) { struct xenvif *vif = netdev_priv(dev); struct xenvif_queue *queue = NULL; - unsigned long rx_bytes = 0; - unsigned long rx_packets = 0; - unsigned long tx_bytes = 0; - unsigned long tx_packets = 0; + u64 rx_bytes = 0; + u64 rx_packets = 0; + u64 tx_bytes = 0; + u64 tx_packets = 0; unsigned int index; spin_lock(&vif->lock); From 4872e57c812dd312bf8193b5933fa60585cda42f Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sat, 11 Feb 2017 00:38:57 +0100 Subject: [PATCH 07/27] NET: Fix /proc/net/arp for AX.25 When sending ARP requests over AX.25 links the hwaddress in the neighbour cache is not getting initialized.
For such an incomplete arp entry ax2asc2 will generate an empty string resulting in /proc/net/arp output like the following: $ cat /proc/net/arp IP address HW type Flags HW address Mask Device 192.168.122.1 0x1 0x2 52:54:00:00:5d:5f * ens3 172.20.1.99 0x3 0x0 * bpq0 The missing field will confuse the procfs parsing of arp(8) resulting in incorrect output for the device such as the following: $ arp Address HWtype HWaddress Flags Mask Iface gateway ether 52:54:00:00:5d:5f C ens3 172.20.1.99 (incomplete) ens3 This changes the content of /proc/net/arp to: $ cat /proc/net/arp IP address HW type Flags HW address Mask Device 172.20.1.99 0x3 0x0 * * bpq0 192.168.122.1 0x1 0x2 52:54:00:00:5d:5f * ens3 To do so it changes ax2asc2 to put the string "*" in buf for a NULL address argument. Finally the HW address field is left aligned in a 17 character field (the length of an ethernet HW address in the usual hex notation) for readability. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller --- net/ipv4/arp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 89a8cac4726a..51b27ae09fbd 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1263,7 +1263,7 @@ void __init arp_init(void) /* * ax25 -> ASCII conversion */ -static char *ax2asc2(ax25_address *a, char *buf) +static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; @@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf) *s++ = n + '0'; *s++ = '\0'; - if (*buf == '\0' || *buf == '-') - return "*"; - - return buf; + if (*buf == '\0' || *buf == '-') { + buf[0] = '*'; + buf[1] = '\0'; + } } #endif /* CONFIG_AX25 */ @@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, } #endif sprintf(tbuf, "%pI4", n->primary_key); - seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } From
6a25478077d987edc5e2f880590a2bc5fcab4441 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 11 Feb 2017 19:26:45 +0800 Subject: [PATCH 08/27] gfs2: Use rhashtable walk interface in glock_hash_walk The function glock_hash_walk walks the rhashtable by hand. This is broken because if it catches the hash table in the middle of a rehash, then it will miss entries. This patch replaces the manual walk by using the rhashtable walk interface. Fixes: 88ffbf3e037e ("GFS2: Use resizable hash table for glocks") Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- fs/gfs2/glock.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 94f50cac91c6..70e94170af85 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1420,26 +1420,32 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * + * Note that the function can be called multiple times on the same + * object. So the user must ensure that the function can cope with + * that. 
*/ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhash_head *pos; - const struct bucket_table *tbl; - int i; + struct rhashtable_iter iter; - rcu_read_lock(); - tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { + rhashtable_walk_enter(&gl_hash_table, &iter); + + do { + gl = ERR_PTR(rhashtable_walk_start(&iter)); + if (gl) + continue; + + while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); - } - } - rcu_read_unlock(); - cond_resched(); + + rhashtable_walk_stop(&iter); + } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); + + rhashtable_walk_exit(&iter); } /** From 9dbbfb0ab6680c6a85609041011484e6658e7d3c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 11 Feb 2017 19:26:46 +0800 Subject: [PATCH 09/27] tipc: Fix tipc_sk_reinit race conditions There are two problems with the function tipc_sk_reinit. Firstly it's doing a manual walk over an rhashtable. This is broken as an rhashtable can be resized and if you manually walk over it during a resize then you may miss entries. Secondly it's missing memory barriers as previously the code used spinlocks which provide the barriers implicitly. This patch fixes both problems. Fixes: 07f6c4bc048a ("tipc: convert tipc reference table to...") Signed-off-by: Herbert Xu Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/net.c | 4 ++++ net/tipc/socket.c | 30 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/net/tipc/net.c b/net/tipc/net.c index 28bf4feeb81c..ab8a2d5d1e32 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr) char addr_string[16]; tn->own_addr = addr; + + /* Ensure that the new address is visible before we reinit. 
*/ + smp_mb(); + tipc_named_reinit(net); tipc_sk_reinit(net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 800caaa699a1..370a5912bcb5 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -384,8 +384,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, INIT_LIST_HEAD(&tsk->publications); msg = &tsk->phdr; tn = net_generic(sock_net(sk), tipc_net_id); - tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, - NAMED_H_SIZE, 0); /* Finish initializing socket data structures */ sock->ops = ops; @@ -395,6 +393,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock, pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } + + /* Ensure tsk is visible before we read own_addr. */ + smp_mb(); + + tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, + NAMED_H_SIZE, 0); + msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; @@ -2269,24 +2274,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, void tipc_sk_reinit(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - const struct bucket_table *tbl; - struct rhash_head *pos; + struct rhashtable_iter iter; struct tipc_sock *tsk; struct tipc_msg *msg; - int i; - rcu_read_lock(); - tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(tsk, pos, tbl, i, node) { + rhashtable_walk_enter(&tn->sk_rht, &iter); + + do { + tsk = ERR_PTR(rhashtable_walk_start(&iter)); + if (tsk) + continue; + + while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); msg = &tsk->phdr; msg_set_prevnode(msg, tn->own_addr); msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } - } - rcu_read_unlock(); + + rhashtable_walk_stop(&iter); + } while (tsk == ERR_PTR(-EAGAIN)); } static struct tipc_sock *tipc_sk_lookup(struct net *net, 
u32 portid) From 40137906c5f55c252194ef5834130383e639536f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 11 Feb 2017 19:26:47 +0800 Subject: [PATCH 10/27] rhashtable: Add nested tables This patch adds code that handles GFP_ATOMIC kmalloc failure on insertion. As we cannot use vmalloc, we solve it by making our hash table nested. That is, we allocate single pages at each level and reach our desired table size by nesting them. When a nested table is created, only a single page is allocated at the top-level. Lower levels are allocated on demand during insertion. Therefore for each insertion to succeed, only two (non-consecutive) pages are needed. After a nested table is created, a rehash will be scheduled in order to switch to a vmalloced table as soon as possible. Also, the rehash code will never rehash into a nested table. If we detect a nested table during a rehash, the rehash will be aborted and a new rehash will be scheduled. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 78 ++++++++--- lib/rhashtable.c | 270 ++++++++++++++++++++++++++++++------- 2 files changed, 276 insertions(+), 72 deletions(-) diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 5c132d3188be..f2e12a845910 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -61,6 +61,7 @@ struct rhlist_head { /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets + * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @locks_mask: Mask to apply before accessing locks[] @@ -68,10 +69,12 @@ struct rhlist_head { * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing + * @ntbl: Nested table used when out of memory. 
* @buckets: size * hash buckets */ struct bucket_table { unsigned int size; + unsigned int nest; unsigned int rehash; u32 hash_rnd; unsigned int locks_mask; @@ -81,7 +84,7 @@ struct bucket_table { struct bucket_table __rcu *future_tbl; - struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; + struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /** @@ -374,6 +377,12 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void *arg); void rhashtable_destroy(struct rhashtable *ht); +struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash); +struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash); + #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -389,6 +398,27 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) +static inline struct rhash_head __rcu *const *rht_bucket( + const struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_head __rcu **rht_bucket_var( + struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_head __rcu **rht_bucket_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : + &tbl->buckets[hash]; +} + /** * rht_for_each_continue - continue iterating over hash chain * @pos: the &struct rhash_head to use as a loop cursor. 
@@ -408,7 +438,7 @@ void rhashtable_destroy(struct rhashtable *ht); * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) + rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash) /** * rht_for_each_entry_continue - continue iterating over hash chain @@ -433,7 +463,7 @@ void rhashtable_destroy(struct rhashtable *ht); * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \ + rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash), \ tbl, hash, member) /** @@ -448,13 +478,13 @@ void rhashtable_destroy(struct rhashtable *ht); * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ -#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ - for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ - next = !rht_is_a_nulls(pos) ? \ - rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ - (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next, \ - next = !rht_is_a_nulls(pos) ? \ +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** @@ -485,7 +515,7 @@ void rhashtable_destroy(struct rhashtable *ht); * traversal is guarded by rcu_read_lock(). 
*/ #define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) + rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash) /** * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain @@ -518,8 +548,8 @@ void rhashtable_destroy(struct rhashtable *ht); * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \ tbl, hash, member) /** @@ -565,7 +595,7 @@ static inline struct rhash_head *__rhashtable_lookup( .ht = ht, .key = key, }; - const struct bucket_table *tbl; + struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; @@ -697,8 +727,12 @@ slow_path: } elasticity = ht->elasticity; - pprev = &tbl->buckets[hash]; - rht_for_each(head, tbl, hash) { + pprev = rht_bucket_insert(ht, tbl, hash); + data = ERR_PTR(-ENOMEM); + if (!pprev) + goto out; + + rht_for_each_continue(head, *pprev, tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; @@ -736,7 +770,7 @@ slow_path: if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + head = rht_dereference_bucket(*pprev, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { @@ -746,7 +780,7 @@ slow_path: RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(tbl->buckets[hash], obj); + rcu_assign_pointer(*pprev, obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -955,8 +989,8 @@ static inline int __rhashtable_remove_fast_one( spin_lock_bh(lock); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(he, *pprev, tbl, hash) { struct rhlist_head *list; 
list = container_of(he, struct rhlist_head, rhead); @@ -1107,8 +1141,8 @@ static inline int __rhashtable_replace_fast( spin_lock_bh(lock); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(he, *pprev, tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 32d0ad058380..172454e6b979 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -32,6 +32,11 @@ #define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 32UL +union nested_table { + union nested_table __rcu *table; + struct rhash_head __rcu *bucket; +}; + static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) @@ -76,6 +81,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); + if (tbl->nest) + size = min(size, 1U << tbl->nest); + if (sizeof(spinlock_t) != 0) { tbl->locks = NULL; #ifdef CONFIG_NUMA @@ -99,8 +107,45 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, return 0; } +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_raw(ntbl->table); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + } + + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); +} + static void bucket_table_free(const struct 
bucket_table *tbl) { + if (tbl->nest) + nested_bucket_table_free(tbl); + if (tbl) kvfree(tbl->locks); @@ -112,6 +157,59 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + unsigned int shifted, + unsigned int nhash) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && shifted) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, + (i << shifted) | nhash); + } + + rcu_assign_pointer(*prev, ntbl); + + return ntbl; +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + 0, 0)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -126,10 +224,17 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); + + size = nbuckets; + + if (tbl == NULL && gfp != GFP_KERNEL) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; + } if (tbl == NULL) return NULL; - tbl->size = nbuckets; + tbl->size = size; if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); @@ -164,12 +269,17 @@ static int rhashtable_rehash_one(struct rhashtable *ht, 
unsigned int old_hash) struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; - int err = -ENOENT; + struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash); + int err = -EAGAIN; struct rhash_head *head, *next, *entry; spinlock_t *new_bucket_lock; unsigned int new_hash; + if (new_tbl->nest) + goto out; + + err = -ENOENT; + rht_for_each(entry, old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -202,19 +312,26 @@ out: return err; } -static void rhashtable_rehash_chain(struct rhashtable *ht, +static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); spinlock_t *old_bucket_lock; + int err; old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); - while (!rhashtable_rehash_one(ht, old_hash)) + while (!(err = rhashtable_rehash_one(ht, old_hash))) ; - old_tbl->rehash++; + + if (err == -ENOENT) { + old_tbl->rehash++; + err = 0; + } spin_unlock_bh(old_bucket_lock); + + return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, @@ -246,13 +363,17 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; + int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) - rhashtable_rehash_chain(ht, old_hash); + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); @@ -271,31 +392,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht) return rht_dereference(new_tbl->future_tbl, ht) ? 
-EAGAIN : 0; } -/** - * rhashtable_expand - Expand hash table while allowing concurrent lookups - * @ht: the hash table to expand - * - * A secondary bucket array is allocated and the hash entries are migrated. - * - * This function may only be called in a context where it is safe to call - * synchronize_rcu(), e.g. not within a rcu_read_lock() section. - * - * The caller must ensure that no concurrent resizing occurs by holding - * ht->mutex. - * - * It is valid to have concurrent insertions and deletions protected by per - * bucket locks or concurrent RCU protected lookups and traversals. - */ -static int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); - old_tbl = rhashtable_last_table(ht, old_tbl); - - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -324,12 +430,9 @@ static int rhashtable_expand(struct rhashtable *ht) */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; - int err; - - ASSERT_RHT_MUTEX(ht); if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -342,15 +445,7 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (new_tbl == NULL) - return -ENOMEM; - - err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); - if (err) - bucket_table_free(new_tbl); - - return err; + return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct 
*work) @@ -366,11 +461,14 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - rhashtable_expand(ht); + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - rhashtable_shrink(ht); + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); - err = rhashtable_rehash_table(ht); + if (!err) + err = rhashtable_rehash_table(ht); mutex_unlock(&ht->mutex); @@ -439,8 +537,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, int elasticity; elasticity = ht->elasticity; - pprev = &tbl->buckets[hash]; - rht_for_each(head, tbl, hash) { + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(head, *pprev, tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; @@ -477,6 +575,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, struct rhash_head *obj, void *data) { + struct rhash_head __rcu **pprev; struct bucket_table *new_tbl; struct rhash_head *head; @@ -499,7 +598,11 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + pprev = rht_bucket_insert(ht, tbl, hash); + if (!pprev) + return ERR_PTR(-ENOMEM); + + head = rht_dereference_bucket(*pprev, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { @@ -509,7 +612,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(tbl->buckets[hash], obj); + rcu_assign_pointer(*pprev, obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -975,7 +1078,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { - const struct bucket_table *tbl; + struct bucket_table *tbl; unsigned int i; 
cancel_work_sync(&ht->run_work); @@ -986,7 +1089,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; - for (pos = rht_dereference(tbl->buckets[i], ht), + for (pos = rht_dereference(*rht_bucket(tbl, i), ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); @@ -1007,3 +1110,70 @@ void rhashtable_destroy(struct rhashtable *ht) return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + static struct rhash_head __rcu *rhnull = + (struct rhash_head __rcu *)NULLS_MARKER(0); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return &rhnull; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + unsigned int shifted; + unsigned int nhash; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + hash >>= tbl->nest; + nhash = index; + shifted = tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? 
shifted : 0, nhash); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + nhash |= index << shifted; + shifted += shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? shifted : 0, + nhash); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); From fed06ee89b78d3af32e235e0e89ad0d946fcb95d Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 12 Feb 2017 11:21:31 +0200 Subject: [PATCH 11/27] net/mlx5e: Disable preemption when doing TC statistics upcall When called by HW offloading drivers, the TC action (e.g net/sched/act_mirred.c) code uses this_cpu logic, e.g _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets) per the kernel documentation, preemption should be disabled, add that. Before the fix, when running with CONFIG_PREEMPT set, we get a BUG: using smp_processor_id() in preemptible [00000000] code: tc/3793 assertion from the TC action (mirred) stats_update callback. Fixes: aad7e08d39bd ('net/mlx5e: Hardware offloaded flower filter statistics support') Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index c5282b6aba8b..2ebbe80d8126 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1087,10 +1087,14 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv, mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + preempt_disable(); + tcf_exts_to_list(f->exts, &actions); list_for_each_entry(a, &actions, list) tcf_action_stats_update(a, bytes, packets, lastuse); + preempt_enable(); + return 0; } From ec5e3b0a1d41fbda0cc33a45bc9e54e91d9d12c7 Mon Sep 17 00:00:00 2001 From: "Jonathan T.
Leighton" Date: Sun, 12 Feb 2017 17:26:06 -0500 Subject: [PATCH 12/27] ipv6: Inhibit IPv4-mapped src address on the wire. This patch adds a check for the problematic case of an IPv4-mapped IPv6 source address and a destination address that is neither an IPv4-mapped IPv6 address nor in6addr_any, and returns an appropriate error. The check is done before returning from looking up the route. Signed-off-by: Jonathan T. Leighton Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b6a94ff0bbd0..e164684456df 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1021,6 +1021,9 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, } } #endif + if (ipv6_addr_v4mapped(&fl6->saddr) && + !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) + return -EAFNOSUPPORT; return 0; From 052d2369d1b479cdbbe020fdd6d057d3c342db74 Mon Sep 17 00:00:00 2001 From: "Jonathan T. Leighton" Date: Sun, 12 Feb 2017 17:26:07 -0500 Subject: [PATCH 13/27] ipv6: Handle IPv4-mapped src to in6addr_any dst. This patch adds a check on the type of the source address for the case where the destination address is in6addr_any. If the source is an IPv4-mapped IPv6 source address, the destination is changed to ::ffff:127.0.0.1, and otherwise the destination is changed to ::1. This is done in three locations to handle UDP calls to either connect() or sendmsg() and TCP calls to connect(). Note that udpv6_sendmsg() delays handling an in6addr_any destination until very late, so the patch only needs to handle the case where the source is an IPv4-mapped IPv6 address. Signed-off-by: Jonathan T. Leighton Signed-off-by: David S.
Miller --- net/ipv6/datagram.c | 14 +++++++++----- net/ipv6/tcp_ipv6.c | 11 ++++++++--- net/ipv6/udp.c | 4 ++++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a3eaafd87100..eec27f87efac 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -167,18 +167,22 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, if (np->sndflow) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; - addr_type = ipv6_addr_type(&usin->sin6_addr); - - if (addr_type == IPV6_ADDR_ANY) { + if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ - usin->sin6_addr.s6_addr[15] = 0x01; + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + &usin->sin6_addr); + else + usin->sin6_addr = in6addr_loopback; } + addr_type = ipv6_addr_type(&usin->sin6_addr); + daddr = &usin->sin6_addr; - if (addr_type == IPV6_ADDR_MAPPED) { + if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (__ipv6_only_sock(sk)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index eaad72c3d746..4c60c6f71cd3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -148,8 +148,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * connect() to INADDR_ANY means loopback (BSD'ism). 
*/ - if (ipv6_addr_any(&usin->sin6_addr)) - usin->sin6_addr.s6_addr[15] = 0x1; + if (ipv6_addr_any(&usin->sin6_addr)) { + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + &usin->sin6_addr); + else + usin->sin6_addr = in6addr_loopback; + } addr_type = ipv6_addr_type(&usin->sin6_addr); @@ -188,7 +193,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * TCP over IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED) { + if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8990856f5101..221825a9407a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1033,6 +1033,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; daddr = &sin6->sin6_addr; + if (ipv6_addr_any(daddr) && + ipv6_addr_v4mapped(&np->saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + daddr); break; case AF_INET: goto do_udp_sendmsg; From 01f8902bcf3ff124d0aeb88a774180ebcec20ace Mon Sep 17 00:00:00 2001 From: Rui Sousa Date: Mon, 13 Feb 2017 10:01:25 +0800 Subject: [PATCH 14/27] net: fec: fix multicast filtering hardware setup Fix hardware setup of multicast address hash: - Never clear the hardware hash (to avoid packet loss) - Construct the hash register values in software and then write once to hardware Signed-off-by: Rui Sousa Signed-off-by: Fugang Duan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/fec_main.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 38160c2bebcb..8be7034b2e7b 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2910,6 +2910,7 @@ static void set_multicast_list(struct net_device *ndev) struct netdev_hw_addr *ha; unsigned int i, bit, data, crc, tmp; unsigned char hash; + unsigned int hash_high = 0, hash_low = 0; if (ndev->flags & IFF_PROMISC) { tmp = readl(fep->hwp + FEC_R_CNTRL); @@ -2932,11 +2933,7 @@ static void set_multicast_list(struct net_device *ndev) return; } - /* Clear filter and add the addresses in hash register - */ - writel(0, fep->hwp + FEC_GRP_HASH_TABLE_HIGH); - writel(0, fep->hwp + FEC_GRP_HASH_TABLE_LOW); - + /* Add the addresses in hash register */ netdev_for_each_mc_addr(ha, ndev) { /* calculate crc32 value of mac address */ crc = 0xffffffff; @@ -2954,16 +2951,14 @@ static void set_multicast_list(struct net_device *ndev) */ hash = (crc >> (32 - FEC_HASH_BITS)) & 0x3f; - if (hash > 31) { - tmp = readl(fep->hwp + FEC_GRP_HASH_TABLE_HIGH); - tmp |= 1 << (hash - 32); - writel(tmp, fep->hwp + FEC_GRP_HASH_TABLE_HIGH); - } else { - tmp = readl(fep->hwp + FEC_GRP_HASH_TABLE_LOW); - tmp |= 1 << hash; - writel(tmp, fep->hwp + FEC_GRP_HASH_TABLE_LOW); - } + if (hash > 31) + hash_high |= 1 << (hash - 32); + else + hash_low |= 1 << hash; } + + writel(hash_high, fep->hwp + FEC_GRP_HASH_TABLE_HIGH); + writel(hash_low, fep->hwp + FEC_GRP_HASH_TABLE_LOW); } /* Set a MAC change in hardware. 
*/ From cd27b96bc13841ee7af25837a6ae86fee87273d6 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 13 Feb 2017 11:13:16 -0800 Subject: [PATCH 15/27] kcm: fix a null pointer dereference in kcm_sendmsg() In commit 98e3862ca2b1 ("kcm: fix 0-length case for kcm_sendmsg()") I tried to avoid skb allocation for 0-length case, but missed a check for NULL pointer in the non EOR case. Fixes: 98e3862ca2b1 ("kcm: fix 0-length case for kcm_sendmsg()") Reported-by: Dmitry Vyukov Cc: Tom Herbert Signed-off-by: Cong Wang Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 64f0e8531af0..a646f3481240 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1044,8 +1044,10 @@ wait_for_memory: } else { /* Message not complete, save state */ partial_message: - kcm->seq_skb = head; - kcm_tx_msg(head)->last_skb = skb; + if (head) { + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } } KCM_STATS_ADD(kcm->stats.tx_bytes, copied); From a60ced990e309666915d21445e95347d12406694 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Tue, 14 Feb 2017 14:42:15 +0200 Subject: [PATCH 16/27] net: ethernet: ti: cpsw: fix cpsw assignment in resume There is a copy-paste error, which hides breaking of resume for CPSW driver: there was replaced netdev_priv() to ndev_to_cpsw(ndev) in suspend, but left it unchanged in resume. Fixes: 606f39939595a4d4540406bfc11f265b2036af6d (ti: cpsw: move platform data and slaves info to cpsw_common) Reported-by: Alexey Starikovskiy Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index b203143647e6..65088224c207 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -3160,7 +3160,7 @@ static int cpsw_resume(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); struct net_device *ndev = platform_get_drvdata(pdev); - struct cpsw_common *cpsw = netdev_priv(ndev); + struct cpsw_common *cpsw = ndev_to_cpsw(ndev); /* Select default pin state */ pinctrl_pm_select_default_state(dev); From f39f0d1e1e93145a0e91d9a7a639c42fd037ecc3 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 14 Feb 2017 10:22:59 -0600 Subject: [PATCH 17/27] ibmvnic: Fix initial MTU settings In the current driver, the MTU is set to the maximum value capable for the backing device. This decision turned out to be a mistake as it led to confusion among users. The expected initial MTU value used for other IBM vNIC capable operating systems is 1500, with the maximum value (9000) reserved for when Jumbo frames are enabled. This patch sets the MTU to the default value for a net device. It also corrects a discrepancy between MTU values received from firmware, which includes the ethernet header length, and net device MTU values. Finally, it removes redundant min/max MTU assignments after device initialization. Signed-off-by: Thomas Falcon Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 752b0822b020..5b66b4fd1767 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1497,7 +1497,7 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) adapter->req_rx_queues = adapter->opt_rx_comp_queues; adapter->req_rx_add_queues = adapter->max_rx_add_queues; - adapter->req_mtu = adapter->max_mtu; + adapter->req_mtu = adapter->netdev->mtu + ETH_HLEN; } total_queues = adapter->req_tx_queues + adapter->req_rx_queues; @@ -2627,12 +2627,12 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, break; case MIN_MTU: adapter->min_mtu = be64_to_cpu(crq->query_capability.number); - netdev->min_mtu = adapter->min_mtu; + netdev->min_mtu = adapter->min_mtu - ETH_HLEN; netdev_dbg(netdev, "min_mtu = %lld\n", adapter->min_mtu); break; case MAX_MTU: adapter->max_mtu = be64_to_cpu(crq->query_capability.number); - netdev->max_mtu = adapter->max_mtu; + netdev->max_mtu = adapter->max_mtu - ETH_HLEN; netdev_dbg(netdev, "max_mtu = %lld\n", adapter->max_mtu); break; case MAX_MULTICAST_FILTERS: @@ -3657,9 +3657,7 @@ static void handle_crq_init_rsp(struct work_struct *work) goto task_failed; netdev->real_num_tx_queues = adapter->req_tx_queues; - netdev->mtu = adapter->req_mtu; - netdev->min_mtu = adapter->min_mtu; - netdev->max_mtu = adapter->max_mtu; + netdev->mtu = adapter->req_mtu - ETH_HLEN; if (adapter->failover) { adapter->failover = false; @@ -3799,7 +3797,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } netdev->real_num_tx_queues = adapter->req_tx_queues; - netdev->mtu = adapter->req_mtu; + netdev->mtu = adapter->req_mtu - ETH_HLEN; rc = register_netdev(netdev); if (rc) { From d199fab63c11998a602205f7ee7ff7c05c97164b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Feb 2017 
09:03:51 -0800 Subject: [PATCH 18/27] packet: fix races in fanout_add() Multiple threads can call fanout_add() at the same time. We need to grab fanout_mutex earlier to avoid races that could lead to one thread freeing po->rollover that was set by another thread. Do the same in fanout_release(), for peace of mind, and to help us finding lockdep issues earlier. Fixes: dc99f600698d ("packet: Add fanout support.") Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 57 +++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d56ee46b11fc..0f03f6a53b4d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1619,6 +1619,7 @@ static void fanout_release_data(struct packet_fanout *f) static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { + struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f, *match; u8 type = type_flags & 0xff; @@ -1641,23 +1642,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) return -EINVAL; } - if (!po->running) - return -EINVAL; + mutex_lock(&fanout_mutex); + err = -EINVAL; + if (!po->running) + goto out; + + err = -EALREADY; if (po->fanout) - return -EALREADY; + goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { - po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); - if (!po->rollover) - return -ENOMEM; - atomic_long_set(&po->rollover->num, 0); - atomic_long_set(&po->rollover->num_huge, 0); - atomic_long_set(&po->rollover->num_failed, 0); + err = -ENOMEM; + rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); + if (!rollover) + goto out; + atomic_long_set(&rollover->num, 0); + atomic_long_set(&rollover->num_huge, 0); + atomic_long_set(&rollover->num_failed, 0); + po->rollover = rollover; } - 
mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && @@ -1704,11 +1710,11 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } } out: - mutex_unlock(&fanout_mutex); - if (err) { - kfree(po->rollover); + if (err && rollover) { + kfree(rollover); po->rollover = NULL; } + mutex_unlock(&fanout_mutex); return err; } @@ -1717,23 +1723,22 @@ static void fanout_release(struct sock *sk) struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; - f = po->fanout; - if (!f) - return; - mutex_lock(&fanout_mutex); - po->fanout = NULL; + f = po->fanout; + if (f) { + po->fanout = NULL; - if (atomic_dec_and_test(&f->sk_ref)) { - list_del(&f->list); - dev_remove_pack(&f->prot_hook); - fanout_release_data(f); - kfree(f); + if (atomic_dec_and_test(&f->sk_ref)) { + list_del(&f->list); + dev_remove_pack(&f->prot_hook); + fanout_release_data(f); + kfree(f); + } + + if (po->rollover) + kfree_rcu(po->rollover, rcu); } mutex_unlock(&fanout_mutex); - - if (po->rollover) - kfree_rcu(po->rollover, rcu); } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, From a725eb15db80643a160310ed6bcfd6c5a6c907f2 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 15 Feb 2017 05:23:26 +0300 Subject: [PATCH 19/27] uapi: fix linux/if_pppol2tp.h userspace compilation errors Because of <linux/libc-compat.h> interface limitations, <netinet/in.h> provided by libc cannot be included after <linux/in.h>, therefore any header that includes <netinet/in.h> cannot be included after <linux/in.h>. 
Change uapi/linux/l2tp.h, the last uapi header that includes <netinet/in.h>, to include <linux/in.h> and <linux/in6.h> instead of <netinet/in.h> and use __SOCK_SIZE__ instead of sizeof(struct sockaddr) the same way as uapi/linux/in.h does, to fix linux/if_pppol2tp.h userspace compilation errors like this: In file included from /usr/include/linux/l2tp.h:12:0, from /usr/include/linux/if_pppol2tp.h:21, /usr/include/netinet/in.h:31:8: error: redefinition of 'struct in_addr' Fixes: 47c3e7783be4 ("net: l2tp: deprecate PPPOL2TP_MSG_* in favour of L2TP_MSG_*") Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 85ddb74fcd1c..b23c1914a182 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -9,9 +9,8 @@ #include <linux/types.h> #include <linux/socket.h> -#ifndef __KERNEL__ -#include <netinet/in.h> -#endif +#include <linux/in.h> +#include <linux/in6.h> #define IPPROTO_L2TP 115 @@ -31,7 +30,7 @@ struct sockaddr_l2tpip { __u32 l2tp_conn_id; /* Connection ID of tunnel */ /* Pad to size of `struct sockaddr'. */ - unsigned char __pad[sizeof(struct sockaddr) - + unsigned char __pad[__SOCK_SIZE__ - sizeof(__kernel_sa_family_t) - sizeof(__be16) - sizeof(struct in_addr) - sizeof(__u32)]; From e70ac171658679ecf6bea4bbd9e9325cd6079d2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Feb 2017 17:11:14 -0800 Subject: [PATCH 20/27] tcp: tcp_probe: use spin_lock_bh() tcp_rcv_established() can now run in process context. We need to disable BH while acquiring tcp probe spinlock, or risk a deadlock. Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Ricardo Nabinger Sanchez Signed-off-by: David S. 
Miller --- net/ipv4/tcp_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f6c50af24a64..3d063eb37848 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -117,7 +117,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, (fwmark > 0 && skb->mark == fwmark)) && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { - spin_lock(&tcp_probe.lock); + spin_lock_bh(&tcp_probe.lock); /* If log fills, just silently drop */ if (tcp_probe_avail() > 1) { struct tcp_log *p = tcp_probe.log + tcp_probe.head; @@ -157,7 +157,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } tcp_probe.lastcwnd = tp->snd_cwnd; - spin_unlock(&tcp_probe.lock); + spin_unlock_bh(&tcp_probe.lock); wake_up(&tcp_probe.wait); } From 5463b3d043826ff8ef487edbd1ef1bfffb677437 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 14 Feb 2017 08:22:20 +1100 Subject: [PATCH 21/27] bpf: kernel header files need to be copied into the tools directory Signed-off-by: Stephen Rothwell Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- tools/include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0eb0e87dbe9f..d2b0ac799d03 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -116,6 +116,12 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command + * to the given target_fd cgroup the descendent cgroup will be able to + * override effective bpf program that was inherited from this cgroup + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -171,6 +177,7 @@ union bpf_attr { __u32 target_fd; /* container object to attach to */ __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; + __u32 attach_flags; }; } __attribute__((aligned(8))); From cd224553641848dd17800fe559e4ff5d208553e8 Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Tue, 14 Feb 2017 19:11:44 +0200 Subject: [PATCH 22/27] net: xilinx_emaclite: fix receive buffer overflow xilinx_emaclite looks at the received data to try to determine the Ethernet packet length but does not properly clamp it if proto_type == ETH_P_IP or 1500 < proto_type <= 1518, causing a buffer overflow and a panic via skb_panic() as the length exceeds the allocated skb size. Fix those cases. Also add an additional unconditional check with WARN_ON() at the end. Signed-off-by: Anssi Hannula Fixes: bb81b2ddfa19 ("net: add Xilinx emac lite device driver") Signed-off-by: David S. 
Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 93dc10b10c09..455774e4741a 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -369,7 +369,7 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, * * Return: Total number of bytes received */ -static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) +static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) { void __iomem *addr; u16 length, proto_type; @@ -409,7 +409,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) /* Check if received ethernet frame is a raw ethernet frame * or an IP packet or an ARP packet */ - if (proto_type > (ETH_FRAME_LEN + ETH_FCS_LEN)) { + if (proto_type > ETH_DATA_LEN) { if (proto_type == ETH_P_IP) { length = ((ntohl(__raw_readl(addr + @@ -417,6 +417,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) XEL_RXBUFF_OFFSET)) >> XEL_HEADER_SHIFT) & XEL_RPLR_LENGTH_MASK); + length = min_t(u16, length, ETH_DATA_LEN); length += ETH_HLEN + ETH_FCS_LEN; } else if (proto_type == ETH_P_ARP) @@ -429,6 +430,9 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) /* Use the length in the frame, plus the header and trailer */ length = proto_type + ETH_HLEN + ETH_FCS_LEN; + if (WARN_ON(length > maxlen)) + length = maxlen; + /* Read from the EmacLite device */ xemaclite_aligned_read((u32 __force *) (addr + XEL_RXBUFF_OFFSET), data, length); @@ -603,7 +607,7 @@ static void xemaclite_rx_handler(struct net_device *dev) skb_reserve(skb, 2); - len = xemaclite_recv_data(lp, (u8 *) skb->data); + len = xemaclite_recv_data(lp, (u8 *) skb->data, len); if (!len) { dev->stats.rx_errors++; From acf138f1b00bdd1b7cd9894562ed0c2a1670888e Mon Sep 17 00:00:00 2001 From: 
Anssi Hannula Date: Tue, 14 Feb 2017 19:11:45 +0200 Subject: [PATCH 23/27] net: xilinx_emaclite: fix freezes due to unordered I/O The xilinx_emaclite uses __raw_writel and __raw_readl for register accesses. Those functions do not imply any kind of memory barriers and they may be reordered. The driver does not seem to take that into account, though, and the driver does not satisfy the ordering requirements of the hardware. For clear examples, see xemaclite_mdio_write() and xemaclite_mdio_read() which try to set MDIO address before initiating the transaction. I'm seeing system freezes with the driver with GCC 5.4 and current Linux kernels on Zynq-7000 SoC immediately when trying to use the interface. In commit 123c1407af87 ("net: emaclite: Do not use microblaze and ppc IO functions") the driver was switched from non-generic in_be32/out_be32 (memory barriers, big endian) to __raw_readl/__raw_writel (no memory barriers, native endian), so apparently the device follows system endianness and the driver was originally written with the assumption of memory barriers. Rather than try to hunt for each case of missing barrier, just switch the driver to use iowrite32/ioread32/iowrite32be/ioread32be depending on endianness instead. Tested on little-endian Zynq-7000 ARM SoC FPGA. Signed-off-by: Anssi Hannula Fixes: 123c1407af87 ("net: emaclite: Do not use microblaze and ppc IO functions") Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 116 ++++++++++-------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 455774e4741a..aa02a03a6d8d 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -100,6 +100,14 @@ /* BUFFER_ALIGN(adr) calculates the number of bytes to the next alignment. 
*/ #define BUFFER_ALIGN(adr) ((ALIGNMENT - ((u32) adr)) % ALIGNMENT) +#ifdef __BIG_ENDIAN +#define xemaclite_readl ioread32be +#define xemaclite_writel iowrite32be +#else +#define xemaclite_readl ioread32 +#define xemaclite_writel iowrite32 +#endif + /** * struct net_local - Our private per device data * @ndev: instance of the network device @@ -156,15 +164,15 @@ static void xemaclite_enable_interrupts(struct net_local *drvdata) u32 reg_data; /* Enable the Tx interrupts for the first Buffer */ - reg_data = __raw_readl(drvdata->base_addr + XEL_TSR_OFFSET); - __raw_writel(reg_data | XEL_TSR_XMIT_IE_MASK, - drvdata->base_addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(drvdata->base_addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data | XEL_TSR_XMIT_IE_MASK, + drvdata->base_addr + XEL_TSR_OFFSET); /* Enable the Rx interrupts for the first buffer */ - __raw_writel(XEL_RSR_RECV_IE_MASK, drvdata->base_addr + XEL_RSR_OFFSET); + xemaclite_writel(XEL_RSR_RECV_IE_MASK, drvdata->base_addr + XEL_RSR_OFFSET); /* Enable the Global Interrupt Enable */ - __raw_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); + xemaclite_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); } /** @@ -179,17 +187,17 @@ static void xemaclite_disable_interrupts(struct net_local *drvdata) u32 reg_data; /* Disable the Global Interrupt Enable */ - __raw_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); + xemaclite_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); /* Disable the Tx interrupts for the first buffer */ - reg_data = __raw_readl(drvdata->base_addr + XEL_TSR_OFFSET); - __raw_writel(reg_data & (~XEL_TSR_XMIT_IE_MASK), - drvdata->base_addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(drvdata->base_addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data & (~XEL_TSR_XMIT_IE_MASK), + drvdata->base_addr + XEL_TSR_OFFSET); /* Disable the Rx interrupts for the first buffer */ - reg_data = __raw_readl(drvdata->base_addr + XEL_RSR_OFFSET); - 
__raw_writel(reg_data & (~XEL_RSR_RECV_IE_MASK), - drvdata->base_addr + XEL_RSR_OFFSET); + reg_data = xemaclite_readl(drvdata->base_addr + XEL_RSR_OFFSET); + xemaclite_writel(reg_data & (~XEL_RSR_RECV_IE_MASK), + drvdata->base_addr + XEL_RSR_OFFSET); } /** @@ -321,7 +329,7 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, byte_count = ETH_FRAME_LEN; /* Check if the expected buffer is available */ - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); if ((reg_data & (XEL_TSR_XMIT_BUSY_MASK | XEL_TSR_XMIT_ACTIVE_MASK)) == 0) { @@ -334,7 +342,7 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, addr = (void __iomem __force *)((u32 __force)addr ^ XEL_BUFFER_OFFSET); - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); if ((reg_data & (XEL_TSR_XMIT_BUSY_MASK | XEL_TSR_XMIT_ACTIVE_MASK)) != 0) @@ -345,16 +353,16 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, /* Write the frame to the buffer */ xemaclite_aligned_write(data, (u32 __force *) addr, byte_count); - __raw_writel((byte_count & XEL_TPLR_LENGTH_MASK), - addr + XEL_TPLR_OFFSET); + xemaclite_writel((byte_count & XEL_TPLR_LENGTH_MASK), + addr + XEL_TPLR_OFFSET); /* Update the Tx Status Register to indicate that there is a * frame to send. 
Set the XEL_TSR_XMIT_ACTIVE_MASK flag which * is used by the interrupt handler to check whether a frame * has been transmitted */ - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); reg_data |= (XEL_TSR_XMIT_BUSY_MASK | XEL_TSR_XMIT_ACTIVE_MASK); - __raw_writel(reg_data, addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data, addr + XEL_TSR_OFFSET); return 0; } @@ -379,7 +387,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) addr = (drvdata->base_addr + drvdata->next_rx_buf_to_use); /* Verify which buffer has valid data */ - reg_data = __raw_readl(addr + XEL_RSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_RSR_OFFSET); if ((reg_data & XEL_RSR_RECV_DONE_MASK) == XEL_RSR_RECV_DONE_MASK) { if (drvdata->rx_ping_pong != 0) @@ -396,14 +404,14 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) return 0; /* No data was available */ /* Verify that buffer has valid data */ - reg_data = __raw_readl(addr + XEL_RSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_RSR_OFFSET); if ((reg_data & XEL_RSR_RECV_DONE_MASK) != XEL_RSR_RECV_DONE_MASK) return 0; /* No data was available */ } /* Get the protocol type of the ethernet frame that arrived */ - proto_type = ((ntohl(__raw_readl(addr + XEL_HEADER_OFFSET + + proto_type = ((ntohl(xemaclite_readl(addr + XEL_HEADER_OFFSET + XEL_RXBUFF_OFFSET)) >> XEL_HEADER_SHIFT) & XEL_RPLR_LENGTH_MASK); @@ -412,7 +420,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) if (proto_type > ETH_DATA_LEN) { if (proto_type == ETH_P_IP) { - length = ((ntohl(__raw_readl(addr + + length = ((ntohl(xemaclite_readl(addr + XEL_HEADER_IP_LENGTH_OFFSET + XEL_RXBUFF_OFFSET)) >> XEL_HEADER_SHIFT) & @@ -438,9 +446,9 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) data, length); /* Acknowledge the frame */ - reg_data = __raw_readl(addr + XEL_RSR_OFFSET); + reg_data = 
xemaclite_readl(addr + XEL_RSR_OFFSET); reg_data &= ~XEL_RSR_RECV_DONE_MASK; - __raw_writel(reg_data, addr + XEL_RSR_OFFSET); + xemaclite_writel(reg_data, addr + XEL_RSR_OFFSET); return length; } @@ -467,14 +475,14 @@ static void xemaclite_update_address(struct net_local *drvdata, xemaclite_aligned_write(address_ptr, (u32 __force *) addr, ETH_ALEN); - __raw_writel(ETH_ALEN, addr + XEL_TPLR_OFFSET); + xemaclite_writel(ETH_ALEN, addr + XEL_TPLR_OFFSET); /* Update the MAC address in the EmacLite */ - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); - __raw_writel(reg_data | XEL_TSR_PROG_MAC_ADDR, addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data | XEL_TSR_PROG_MAC_ADDR, addr + XEL_TSR_OFFSET); /* Wait for EmacLite to finish with the MAC address update */ - while ((__raw_readl(addr + XEL_TSR_OFFSET) & + while ((xemaclite_readl(addr + XEL_TSR_OFFSET) & XEL_TSR_PROG_MAC_ADDR) != 0) ; } @@ -644,32 +652,32 @@ static irqreturn_t xemaclite_interrupt(int irq, void *dev_id) u32 tx_status; /* Check if there is Rx Data available */ - if ((__raw_readl(base_addr + XEL_RSR_OFFSET) & + if ((xemaclite_readl(base_addr + XEL_RSR_OFFSET) & XEL_RSR_RECV_DONE_MASK) || - (__raw_readl(base_addr + XEL_BUFFER_OFFSET + XEL_RSR_OFFSET) + (xemaclite_readl(base_addr + XEL_BUFFER_OFFSET + XEL_RSR_OFFSET) & XEL_RSR_RECV_DONE_MASK)) xemaclite_rx_handler(dev); /* Check if the Transmission for the first buffer is completed */ - tx_status = __raw_readl(base_addr + XEL_TSR_OFFSET); + tx_status = xemaclite_readl(base_addr + XEL_TSR_OFFSET); if (((tx_status & XEL_TSR_XMIT_BUSY_MASK) == 0) && (tx_status & XEL_TSR_XMIT_ACTIVE_MASK) != 0) { tx_status &= ~XEL_TSR_XMIT_ACTIVE_MASK; - __raw_writel(tx_status, base_addr + XEL_TSR_OFFSET); + xemaclite_writel(tx_status, base_addr + XEL_TSR_OFFSET); tx_complete = true; } /* Check if the Transmission for the second buffer is completed */ - tx_status = __raw_readl(base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); + 
tx_status = xemaclite_readl(base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); if (((tx_status & XEL_TSR_XMIT_BUSY_MASK) == 0) && (tx_status & XEL_TSR_XMIT_ACTIVE_MASK) != 0) { tx_status &= ~XEL_TSR_XMIT_ACTIVE_MASK; - __raw_writel(tx_status, base_addr + XEL_BUFFER_OFFSET + - XEL_TSR_OFFSET); + xemaclite_writel(tx_status, base_addr + XEL_BUFFER_OFFSET + + XEL_TSR_OFFSET); tx_complete = true; } @@ -702,7 +710,7 @@ static int xemaclite_mdio_wait(struct net_local *lp) /* wait for the MDIO interface to not be busy or timeout after some time. */ - while (__raw_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET) & + while (xemaclite_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET) & XEL_MDIOCTRL_MDIOSTS_MASK) { if (time_before_eq(end, jiffies)) { WARN_ON(1); @@ -738,17 +746,17 @@ static int xemaclite_mdio_read(struct mii_bus *bus, int phy_id, int reg) * MDIO Address register. Set the Status bit in the MDIO Control * register to start a MDIO read transaction. */ - ctrl_reg = __raw_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); - __raw_writel(XEL_MDIOADDR_OP_MASK | - ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), - lp->base_addr + XEL_MDIOADDR_OFFSET); - __raw_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, - lp->base_addr + XEL_MDIOCTRL_OFFSET); + ctrl_reg = xemaclite_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); + xemaclite_writel(XEL_MDIOADDR_OP_MASK | + ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), + lp->base_addr + XEL_MDIOADDR_OFFSET); + xemaclite_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, + lp->base_addr + XEL_MDIOCTRL_OFFSET); if (xemaclite_mdio_wait(lp)) return -ETIMEDOUT; - rc = __raw_readl(lp->base_addr + XEL_MDIORD_OFFSET); + rc = xemaclite_readl(lp->base_addr + XEL_MDIORD_OFFSET); dev_dbg(&lp->ndev->dev, "xemaclite_mdio_read(phy_id=%i, reg=%x) == %x\n", @@ -785,13 +793,13 @@ static int xemaclite_mdio_write(struct mii_bus *bus, int phy_id, int reg, * Data register. Finally, set the Status bit in the MDIO Control * register to start a MDIO write transaction. 
*/ - ctrl_reg = __raw_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); - __raw_writel(~XEL_MDIOADDR_OP_MASK & - ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), - lp->base_addr + XEL_MDIOADDR_OFFSET); - __raw_writel(val, lp->base_addr + XEL_MDIOWR_OFFSET); - __raw_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, - lp->base_addr + XEL_MDIOCTRL_OFFSET); + ctrl_reg = xemaclite_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); + xemaclite_writel(~XEL_MDIOADDR_OP_MASK & + ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), + lp->base_addr + XEL_MDIOADDR_OFFSET); + xemaclite_writel(val, lp->base_addr + XEL_MDIOWR_OFFSET); + xemaclite_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, + lp->base_addr + XEL_MDIOCTRL_OFFSET); return 0; } @@ -838,8 +846,8 @@ static int xemaclite_mdio_setup(struct net_local *lp, struct device *dev) /* Enable the MDIO bus by asserting the enable bit in MDIO Control * register. */ - __raw_writel(XEL_MDIOCTRL_MDIOEN_MASK, - lp->base_addr + XEL_MDIOCTRL_OFFSET); + xemaclite_writel(XEL_MDIOCTRL_MDIOEN_MASK, + lp->base_addr + XEL_MDIOCTRL_OFFSET); bus = mdiobus_alloc(); if (!bus) { @@ -1144,8 +1152,8 @@ static int xemaclite_of_probe(struct platform_device *ofdev) } /* Clear the Tx CSR's in case this is a restart */ - __raw_writel(0, lp->base_addr + XEL_TSR_OFFSET); - __raw_writel(0, lp->base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); + xemaclite_writel(0, lp->base_addr + XEL_TSR_OFFSET); + xemaclite_writel(0, lp->base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); /* Set the MAC address in the EmacLite device */ xemaclite_update_address(lp, ndev->dev_addr); From 7627ae6030f56a9a91a5b3867b21f35d79c16e64 Mon Sep 17 00:00:00 2001 From: Marcus Huewe Date: Wed, 15 Feb 2017 01:00:36 +0100 Subject: [PATCH 24/27] net: neigh: Fix netevent NETEVENT_DELAY_PROBE_TIME_UPDATE notification When setting a neigh related sysctl parameter, we always send a NETEVENT_DELAY_PROBE_TIME_UPDATE netevent. 
For instance, when executing sysctl net.ipv6.neigh.wlp3s0.retrans_time_ms=2000 a NETEVENT_DELAY_PROBE_TIME_UPDATE netevent is generated. This is caused by commit 2a4501ae18b5 ("neigh: Send a notification when DELAY_PROBE_TIME changes"). According to the commit's description, it was intended to generate such an event when setting the "delay_first_probe_time" sysctl parameter. In order to fix this, only generate this event when actually setting the "delay_first_probe_time" sysctl parameter. This fix should not have any unintended side-effects, because all but one registered netevent callbacks check for other netevent event types (the registered callbacks were obtained by grepping for "register_netevent_notifier"). The only callback that uses the NETEVENT_DELAY_PROBE_TIME_UPDATE event is mlxsw_sp_router_netevent_event() (in drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c): in case of this event, it only accesses the DELAY_PROBE_TIME of the passed neigh_parms. Fixes: 2a4501ae18b5 ("neigh: Send a notification when DELAY_PROBE_TIME changes") Signed-off-by: Marcus Huewe Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7bb12e07ffef..e7c12caa20c8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2923,7 +2923,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); - call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + if (index == NEIGH_VAR_DELAY_PROBE_TIME) + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } From 28f4d16570dcf440e54a4d72666d5be452f27d0e Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 15 Feb 2017 10:32:11 -0600 Subject: [PATCH 25/27] ibmvnic: Fix endian error when requesting device capabilities When a vNIC client driver requests a faulty device setting, the server returns an acceptable value for the client to request. This 64 bit value was incorrectly being swapped as a 32 bit value, resulting in loss of data. This patch corrects that by using the 64 bit swap function. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5b66b4fd1767..158b49a0a1d6 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2389,10 +2389,10 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, case PARTIALSUCCESS: dev_info(dev, "req=%lld, rsp=%ld in %s queue, retrying.\n", *req_value, - (long int)be32_to_cpu(crq->request_capability_rsp. + (long int)be64_to_cpu(crq->request_capability_rsp. 
number), name); release_sub_crqs_no_irqs(adapter); - *req_value = be32_to_cpu(crq->request_capability_rsp.number); + *req_value = be64_to_cpu(crq->request_capability_rsp.number); init_sub_crqs(adapter, 1); return; default: From 75224c93fa985f4a6fb983f53208f5c5aa555fbf Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 15 Feb 2017 10:33:33 -0600 Subject: [PATCH 26/27] ibmvnic: Fix endian errors in error reporting output Error reports received from firmware were not being converted from big endian values, leading to bogus error codes reported on little endian systems. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 158b49a0a1d6..a07b8d79174c 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2186,12 +2186,12 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq, if (!found) { dev_err(dev, "Couldn't find error id %x\n", - crq->request_error_rsp.error_id); + be32_to_cpu(crq->request_error_rsp.error_id)); return; } dev_err(dev, "Detailed info for error id %x:", - crq->request_error_rsp.error_id); + be32_to_cpu(crq->request_error_rsp.error_id)); for (i = 0; i < error_buff->len; i++) { pr_cont("%02x", (int)error_buff->buff[i]); @@ -2270,8 +2270,8 @@ static void handle_error_indication(union ibmvnic_crq *crq, dev_err(dev, "Firmware reports %serror id %x, cause %d\n", crq->error_indication. flags & IBMVNIC_FATAL_ERROR ? "FATAL " : "", - crq->error_indication.error_id, - crq->error_indication.error_cause); + be32_to_cpu(crq->error_indication.error_id), + be16_to_cpu(crq->error_indication.error_cause)); error_buff = kmalloc(sizeof(*error_buff), GFP_ATOMIC); if (!error_buff) From bf3f14d6342cfb37eab8f0cddd0e4d4063fd9fc9 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Wed, 15 Feb 2017 22:29:51 -0500 Subject: [PATCH 27/27] rhashtable: Revert nested table changes. This reverts commits: 6a25478077d987edc5e2f880590a2bc5fcab4441 9dbbfb0ab6680c6a85609041011484e6658e7d3c 40137906c5f55c252194ef5834130383e639536f It's too risky to put in this late in the release cycle. We'll put these changes into the next merge window instead. Signed-off-by: David S. Miller --- fs/gfs2/glock.c | 28 ++-- include/linux/rhashtable.h | 78 +++-------- lib/rhashtable.c | 270 +++++++------------------------------ net/tipc/net.c | 4 - net/tipc/socket.c | 30 ++--- 5 files changed, 94 insertions(+), 316 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 70e94170af85..94f50cac91c6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1420,32 +1420,26 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * - * Note that the function can be called multiple times on the same - * object. So the user must ensure that the function can cope with - * that. 
*/ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhashtable_iter iter; + struct rhash_head *pos; + const struct bucket_table *tbl; + int i; - rhashtable_walk_enter(&gl_hash_table, &iter); - - do { - gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (gl) - continue; - - while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) + rcu_read_lock(); + tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); + for (i = 0; i < tbl->size; i++) { + rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); - - rhashtable_walk_stop(&iter); - } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); - - rhashtable_walk_exit(&iter); + } + } + rcu_read_unlock(); + cond_resched(); } /** diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index f2e12a845910..5c132d3188be 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -61,7 +61,6 @@ struct rhlist_head { /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets - * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @locks_mask: Mask to apply before accessing locks[] @@ -69,12 +68,10 @@ struct rhlist_head { * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing - * @ntbl: Nested table used when out of memory. 
* @buckets: size * hash buckets */ struct bucket_table { unsigned int size; - unsigned int nest; unsigned int rehash; u32 hash_rnd; unsigned int locks_mask; @@ -84,7 +81,7 @@ struct bucket_table { struct bucket_table __rcu *future_tbl; - struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; + struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /** @@ -377,12 +374,6 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void *arg); void rhashtable_destroy(struct rhashtable *ht); -struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash); -struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash); - #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -398,27 +389,6 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) -static inline struct rhash_head __rcu *const *rht_bucket( - const struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : - &tbl->buckets[hash]; -} - -static inline struct rhash_head __rcu **rht_bucket_var( - struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : - &tbl->buckets[hash]; -} - -static inline struct rhash_head __rcu **rht_bucket_insert( - struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : - &tbl->buckets[hash]; -} - /** * rht_for_each_continue - continue iterating over hash chain * @pos: the &struct rhash_head to use as a loop cursor. 
@@ -438,7 +408,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash) + rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) /** * rht_for_each_entry_continue - continue iterating over hash chain @@ -463,7 +433,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash), \ + rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \ tbl, hash, member) /** @@ -478,13 +448,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ -#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ - for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \ - next = !rht_is_a_nulls(pos) ? \ - rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ - (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next, \ - next = !rht_is_a_nulls(pos) ? \ +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** @@ -515,7 +485,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * traversal is guarded by rcu_read_lock(). 
*/ #define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash) + rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) /** * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain @@ -548,8 +518,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \ +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ tbl, hash, member) /** @@ -595,7 +565,7 @@ static inline struct rhash_head *__rhashtable_lookup( .ht = ht, .key = key, }; - struct bucket_table *tbl; + const struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; @@ -727,12 +697,8 @@ slow_path: } elasticity = ht->elasticity; - pprev = rht_bucket_insert(ht, tbl, hash); - data = ERR_PTR(-ENOMEM); - if (!pprev) - goto out; - - rht_for_each_continue(head, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; @@ -770,7 +736,7 @@ slow_path: if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; - head = rht_dereference_bucket(*pprev, tbl, hash); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { @@ -780,7 +746,7 @@ slow_path: RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(*pprev, obj); + rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -989,8 +955,8 @@ static inline int __rhashtable_remove_fast_one( spin_lock_bh(lock); - pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(he, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { struct 
rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); @@ -1141,8 +1107,8 @@ static inline int __rhashtable_replace_fast( spin_lock_bh(lock); - pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(he, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 172454e6b979..32d0ad058380 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -32,11 +32,6 @@ #define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 32UL -union nested_table { - union nested_table __rcu *table; - struct rhash_head __rcu *bucket; -}; - static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) @@ -81,9 +76,6 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); - if (tbl->nest) - size = min(size, 1U << tbl->nest); - if (sizeof(spinlock_t) != 0) { tbl->locks = NULL; #ifdef CONFIG_NUMA @@ -107,45 +99,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, return 0; } -static void nested_table_free(union nested_table *ntbl, unsigned int size) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - const unsigned int len = 1 << shift; - unsigned int i; - - ntbl = rcu_dereference_raw(ntbl->table); - if (!ntbl) - return; - - if (size > len) { - size >>= shift; - for (i = 0; i < len; i++) - nested_table_free(ntbl + i, size); - } - - kfree(ntbl); -} - -static void nested_bucket_table_free(const struct bucket_table *tbl) -{ - unsigned int size = tbl->size >> tbl->nest; - unsigned int len = 1 << tbl->nest; - union nested_table *ntbl; - unsigned int i; - - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); - - for (i = 0; i < len; i++) - nested_table_free(ntbl + i, size); - - kfree(ntbl); -} - static void 
bucket_table_free(const struct bucket_table *tbl) { - if (tbl->nest) - nested_bucket_table_free(tbl); - if (tbl) kvfree(tbl->locks); @@ -157,59 +112,6 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } -static union nested_table *nested_table_alloc(struct rhashtable *ht, - union nested_table __rcu **prev, - unsigned int shifted, - unsigned int nhash) -{ - union nested_table *ntbl; - int i; - - ntbl = rcu_dereference(*prev); - if (ntbl) - return ntbl; - - ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); - - if (ntbl && shifted) { - for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) - INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, - (i << shifted) | nhash); - } - - rcu_assign_pointer(*prev, ntbl); - - return ntbl; -} - -static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets, - gfp_t gfp) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - struct bucket_table *tbl; - size_t size; - - if (nbuckets < (1 << (shift + 1))) - return NULL; - - size = sizeof(*tbl) + sizeof(tbl->buckets[0]); - - tbl = kzalloc(size, gfp); - if (!tbl) - return NULL; - - if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, - 0, 0)) { - kfree(tbl); - return NULL; - } - - tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; - - return tbl; -} - static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -224,17 +126,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); - - size = nbuckets; - - if (tbl == NULL && gfp != GFP_KERNEL) { - tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); - nbuckets = 0; - } if (tbl == NULL) return NULL; - tbl->size = size; + tbl->size = nbuckets; if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); @@ -269,17 +164,12 @@ static int 
rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash); - int err = -EAGAIN; + struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; + int err = -ENOENT; struct rhash_head *head, *next, *entry; spinlock_t *new_bucket_lock; unsigned int new_hash; - if (new_tbl->nest) - goto out; - - err = -ENOENT; - rht_for_each(entry, old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -312,26 +202,19 @@ out: return err; } -static int rhashtable_rehash_chain(struct rhashtable *ht, +static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); spinlock_t *old_bucket_lock; - int err; old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); - while (!(err = rhashtable_rehash_one(ht, old_hash))) + while (!rhashtable_rehash_one(ht, old_hash)) ; - - if (err == -ENOENT) { - old_tbl->rehash++; - err = 0; - } + old_tbl->rehash++; spin_unlock_bh(old_bucket_lock); - - return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, @@ -363,17 +246,13 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; - int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { - err = rhashtable_rehash_chain(ht, old_hash); - if (err) - return err; - } + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) + rhashtable_rehash_chain(ht, old_hash); /* Publish the new table pointer. 
*/ rcu_assign_pointer(ht->tbl, new_tbl); @@ -392,16 +271,31 @@ static int rhashtable_rehash_table(struct rhashtable *ht) return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } -static int rhashtable_rehash_alloc(struct rhashtable *ht, - struct bucket_table *old_tbl, - unsigned int size) +/** + * rhashtable_expand - Expand hash table while allowing concurrent lookups + * @ht: the hash table to expand + * + * A secondary bucket array is allocated and the hash entries are migrated. + * + * This function may only be called in a context where it is safe to call + * synchronize_rcu(), e.g. not within a rcu_read_lock() section. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ +static int rhashtable_expand(struct rhashtable *ht) { - struct bucket_table *new_tbl; + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); int err; ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + old_tbl = rhashtable_last_table(ht, old_tbl); + + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -430,9 +324,12 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; + int err; + + ASSERT_RHT_MUTEX(ht); if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -445,7 +342,15 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - return rhashtable_rehash_alloc(ht, old_tbl, size); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return 
-ENOMEM; + + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; } static void rht_deferred_worker(struct work_struct *work) @@ -461,14 +366,11 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); + rhashtable_expand(ht); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - err = rhashtable_shrink(ht); - else if (tbl->nest) - err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + rhashtable_shrink(ht); - if (!err) - err = rhashtable_rehash_table(ht); + err = rhashtable_rehash_table(ht); mutex_unlock(&ht->mutex); @@ -537,8 +439,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, int elasticity; elasticity = ht->elasticity; - pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(head, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; @@ -575,7 +477,6 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, struct rhash_head *obj, void *data) { - struct rhash_head __rcu **pprev; struct bucket_table *new_tbl; struct rhash_head *head; @@ -598,11 +499,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); - pprev = rht_bucket_insert(ht, tbl, hash); - if (!pprev) - return ERR_PTR(-ENOMEM); - - head = rht_dereference_bucket(*pprev, tbl, hash); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { @@ -612,7 +509,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(*pprev, obj); + rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -1078,7 +975,7 @@ void 
rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { - struct bucket_table *tbl; + const struct bucket_table *tbl; unsigned int i; cancel_work_sync(&ht->run_work); @@ -1089,7 +986,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; - for (pos = rht_dereference(*rht_bucket(tbl, i), ht), + for (pos = rht_dereference(tbl->buckets[i], ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); @@ -1110,70 +1007,3 @@ void rhashtable_destroy(struct rhashtable *ht) return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); - -struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - static struct rhash_head __rcu *rhnull = - (struct rhash_head __rcu *)NULLS_MARKER(0); - unsigned int index = hash & ((1 << tbl->nest) - 1); - unsigned int size = tbl->size >> tbl->nest; - unsigned int subhash = hash; - union nested_table *ntbl; - - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); - ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); - subhash >>= tbl->nest; - - while (ntbl && size > (1 << shift)) { - index = subhash & ((1 << shift) - 1); - ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); - size >>= shift; - subhash >>= shift; - } - - if (!ntbl) - return &rhnull; - - return &ntbl[subhash].bucket; - -} -EXPORT_SYMBOL_GPL(rht_bucket_nested); - -struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - unsigned int index = hash & ((1 << tbl->nest) - 1); - unsigned int size = tbl->size >> tbl->nest; - union nested_table *ntbl; - unsigned int shifted; - unsigned int nhash; - - ntbl = (union nested_table 
*)rcu_dereference_raw(tbl->buckets[0]); - hash >>= tbl->nest; - nhash = index; - shifted = tbl->nest; - ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift) ? shifted : 0, nhash); - - while (ntbl && size > (1 << shift)) { - index = hash & ((1 << shift) - 1); - size >>= shift; - hash >>= shift; - nhash |= index << shifted; - shifted += shift; - ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift) ? shifted : 0, - nhash); - } - - if (!ntbl) - return NULL; - - return &ntbl[hash].bucket; - -} -EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/net/tipc/net.c b/net/tipc/net.c index ab8a2d5d1e32..28bf4feeb81c 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -110,10 +110,6 @@ int tipc_net_start(struct net *net, u32 addr) char addr_string[16]; tn->own_addr = addr; - - /* Ensure that the new address is visible before we reinit. */ - smp_mb(); - tipc_named_reinit(net); tipc_sk_reinit(net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 370a5912bcb5..800caaa699a1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -384,6 +384,8 @@ static int tipc_sk_create(struct net *net, struct socket *sock, INIT_LIST_HEAD(&tsk->publications); msg = &tsk->phdr; tn = net_generic(sock_net(sk), tipc_net_id); + tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, + NAMED_H_SIZE, 0); /* Finish initializing socket data structures */ sock->ops = ops; @@ -393,13 +395,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } - - /* Ensure tsk is visible before we read own_addr. 
*/ - smp_mb(); - - tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, - NAMED_H_SIZE, 0); - msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; @@ -2274,27 +2269,24 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, void tipc_sk_reinit(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct rhashtable_iter iter; + const struct bucket_table *tbl; + struct rhash_head *pos; struct tipc_sock *tsk; struct tipc_msg *msg; + int i; - rhashtable_walk_enter(&tn->sk_rht, &iter); - - do { - tsk = ERR_PTR(rhashtable_walk_start(&iter)); - if (tsk) - continue; - - while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { + rcu_read_lock(); + tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); + for (i = 0; i < tbl->size; i++) { + rht_for_each_entry_rcu(tsk, pos, tbl, i, node) { spin_lock_bh(&tsk->sk.sk_lock.slock); msg = &tsk->phdr; msg_set_prevnode(msg, tn->own_addr); msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } - - rhashtable_walk_stop(&iter); - } while (tsk == ERR_PTR(-EAGAIN)); + } + rcu_read_unlock(); } static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)