Recently, ops->init() and ops->dump() of all actions were modified to
always obtain tcf_lock when accessing private action state. Actions that
don't depend on tcf_lock for synchronization with their data path use
non-bh locking API. However, tcf_lock is also used to protect rate
estimator stats in softirq context by timer callback.
Change ops->init() and ops->dump() of all actions to disable bh when using
tcf_lock to prevent deadlock reported by following lockdep warning:
[ 105.470398] ================================
[ 105.475014] WARNING: inconsistent lock state
[ 105.479628] 4.18.0-rc8+ #664 Not tainted
[ 105.483897] --------------------------------
[ 105.488511] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[ 105.494871] swapper/16/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[ 105.500449]
00000000f86c012e (&(&p->tcfa_lock)->rlock){+.?.}, at: est_fetch_counters+0x3c/0xa0
[ 105.509696] {SOFTIRQ-ON-W} state was registered at:
[ 105.514925] _raw_spin_lock+0x2c/0x40
[ 105.519022] tcf_bpf_init+0x579/0x820 [act_bpf]
[ 105.523990] tcf_action_init_1+0x4e4/0x660
[ 105.528518] tcf_action_init+0x1ce/0x2d0
[ 105.532880] tcf_exts_validate+0x1d8/0x200
[ 105.537416] fl_change+0x55a/0x268b [cls_flower]
[ 105.542469] tc_new_tfilter+0x748/0xa20
[ 105.546738] rtnetlink_rcv_msg+0x56a/0x6d0
[ 105.551268] netlink_rcv_skb+0x18d/0x200
[ 105.555628] netlink_unicast+0x2d0/0x370
[ 105.559990] netlink_sendmsg+0x3b9/0x6a0
[ 105.564349] sock_sendmsg+0x6b/0x80
[ 105.568271] ___sys_sendmsg+0x4a1/0x520
[ 105.572547] __sys_sendmsg+0xd7/0x150
[ 105.576655] do_syscall_64+0x72/0x2c0
[ 105.580757] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 105.586243] irq event stamp: 489296
[ 105.590084] hardirqs last enabled at (489296): [<
ffffffffb507e639>] _raw_spin_unlock_irq+0x29/0x40
[ 105.599765] hardirqs last disabled at (489295): [<
ffffffffb507e745>] _raw_spin_lock_irq+0x15/0x50
[ 105.609277] softirqs last enabled at (489292): [<
ffffffffb413a6a3>] irq_enter+0x83/0xa0
[ 105.618001] softirqs last disabled at (489293): [<
ffffffffb413a800>] irq_exit+0x140/0x190
[ 105.626813]
other info that might help us debug this:
[ 105.633976] Possible unsafe locking scenario:
[ 105.640526] CPU0
[ 105.643325] ----
[ 105.646125] lock(&(&p->tcfa_lock)->rlock);
[ 105.650747] <Interrupt>
[ 105.653717] lock(&(&p->tcfa_lock)->rlock);
[ 105.658514]
*** DEADLOCK ***
[ 105.665349] 1 lock held by swapper/16/0:
[ 105.669629] #0:
00000000a640ad99 ((&est->timer)){+.-.}, at: call_timer_fn+0x10b/0x550
[ 105.678200]
stack backtrace:
[ 105.683194] CPU: 16 PID: 0 Comm: swapper/16 Not tainted 4.18.0-rc8+ #664
[ 105.690249] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
[ 105.698626] Call Trace:
[ 105.701421] <IRQ>
[ 105.703791] dump_stack+0x92/0xeb
[ 105.707461] print_usage_bug+0x336/0x34c
[ 105.711744] mark_lock+0x7c9/0x980
[ 105.715500] ? print_shortest_lock_dependencies+0x2e0/0x2e0
[ 105.721424] ? check_usage_forwards+0x230/0x230
[ 105.726315] __lock_acquire+0x923/0x26f0
[ 105.730597] ? debug_show_all_locks+0x240/0x240
[ 105.735478] ? mark_lock+0x493/0x980
[ 105.739412] ? check_chain_key+0x140/0x1f0
[ 105.743861] ? __lock_acquire+0x836/0x26f0
[ 105.748323] ? lock_acquire+0x12e/0x290
[ 105.752516] lock_acquire+0x12e/0x290
[ 105.756539] ? est_fetch_counters+0x3c/0xa0
[ 105.761084] _raw_spin_lock+0x2c/0x40
[ 105.765099] ? est_fetch_counters+0x3c/0xa0
[ 105.769633] est_fetch_counters+0x3c/0xa0
[ 105.773995] est_timer+0x87/0x390
[ 105.777670] ? est_fetch_counters+0xa0/0xa0
[ 105.782210] ? lock_acquire+0x12e/0x290
[ 105.786410] call_timer_fn+0x161/0x550
[ 105.790512] ? est_fetch_counters+0xa0/0xa0
[ 105.795055] ? del_timer_sync+0xd0/0xd0
[ 105.799249] ? __lock_is_held+0x93/0x110
[ 105.803531] ? mark_held_locks+0x20/0xe0
[ 105.807813] ? _raw_spin_unlock_irq+0x29/0x40
[ 105.812525] ? est_fetch_counters+0xa0/0xa0
[ 105.817069] ? est_fetch_counters+0xa0/0xa0
[ 105.821610] run_timer_softirq+0x3c4/0x9f0
[ 105.826064] ? lock_acquire+0x12e/0x290
[ 105.830257] ? __bpf_trace_timer_class+0x10/0x10
[ 105.835237] ? __lock_is_held+0x25/0x110
[ 105.839517] __do_softirq+0x11d/0x7bf
[ 105.843542] irq_exit+0x140/0x190
[ 105.847208] smp_apic_timer_interrupt+0xac/0x3b0
[ 105.852182] apic_timer_interrupt+0xf/0x20
[ 105.856628] </IRQ>
[ 105.859081] RIP: 0010:cpuidle_enter_state+0xd8/0x4d0
[ 105.864395] Code: 46 ff 48 89 44 24 08 0f 1f 44 00 00 31 ff e8 cf ec 46 ff 80 7c 24 07 00 0f 85 1d 02 00 00 e8 9f 90 4b ff fb 66 0f 1f 44 00 00 <4c> 8b 6c 24 08 4d 29 fd 0f 80 36 03 00 00 4c 89 e8 48 ba cf f7 53
[ 105.884288] RSP: 0018:
ffff8803ad94fd20 EFLAGS:
00000246 ORIG_RAX:
ffffffffffffff13
[ 105.892494] RAX:
0000000000000000 RBX:
ffffe8fb300829c0 RCX:
ffffffffb41e19e1
[ 105.899988] RDX:
0000000000000007 RSI:
dffffc0000000000 RDI:
ffff8803ad9358ac
[ 105.907503] RBP:
ffffffffb6636300 R08:
0000000000000004 R09:
0000000000000000
[ 105.914997] R10:
0000000000000000 R11:
0000000000000000 R12:
0000000000000004
[ 105.922487] R13:
ffffffffb6636140 R14:
ffffffffb66362d8 R15:
000000188d36091b
[ 105.929988] ? trace_hardirqs_on_caller+0x141/0x2d0
[ 105.935232] do_idle+0x28e/0x320
[ 105.938817] ? arch_cpu_idle_exit+0x40/0x40
[ 105.943361] ? mark_lock+0x8c1/0x980
[ 105.947295] ? _raw_spin_unlock_irqrestore+0x32/0x60
[ 105.952619] cpu_startup_entry+0xc2/0xd0
[ 105.956900] ? cpu_in_idle+0x20/0x20
[ 105.960830] ? _raw_spin_unlock_irqrestore+0x32/0x60
[ 105.966146] ? trace_hardirqs_on_caller+0x141/0x2d0
[ 105.971391] start_secondary+0x2b5/0x360
[ 105.975669] ? set_cpu_sibling_map+0x1330/0x1330
[ 105.980654] secondary_startup_64+0xa5/0xb0
Taking tcf_lock in sample action with bh disabled causes lockdep to issue a
warning regarding possible irq lock inversion dependency between tcf_lock,
and psample_groups_lock that is taken when holding tcf_lock in sample init:
[ 162.108959] Possible interrupt unsafe locking scenario:
[ 162.116386] CPU0 CPU1
[ 162.121277] ---- ----
[ 162.126162] lock(psample_groups_lock);
[ 162.130447] local_irq_disable();
[ 162.136772] lock(&(&p->tcfa_lock)->rlock);
[ 162.143957] lock(psample_groups_lock);
[ 162.150813] <Interrupt>
[ 162.153808] lock(&(&p->tcfa_lock)->rlock);
[ 162.158608]
*** DEADLOCK ***
In order to prevent potential lock inversion dependency between tcf_lock
and psample_groups_lock, extract call to psample_group_get() from tcf_lock
protected section in sample action init function.
Fixes:
4e232818bd32 ("net: sched: act_mirred: remove dependency on rtnl lock")
Fixes:
764e9a24480f ("net: sched: act_vlan: remove dependency on rtnl lock")
Fixes:
729e01260989 ("net: sched: act_tunnel_key: remove dependency on rtnl lock")
Fixes:
d77284956656 ("net: sched: act_sample: remove dependency on rtnl lock")
Fixes:
e8917f437006 ("net: sched: act_gact: remove dependency on rtnl lock")
Fixes:
b6a2b971c0b0 ("net: sched: act_csum: remove dependency on rtnl lock")
Fixes:
2142236b4584 ("net: sched: act_bpf: remove dependency on rtnl lock")
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
struct tcf_t tm;
int ret;
- spin_lock(&prog->tcf_lock);
+ spin_lock_bh(&prog->tcf_lock);
opt.action = prog->tcf_action;
if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
TCA_ACT_BPF_PAD))
goto nla_put_failure;
- spin_unlock(&prog->tcf_lock);
+ spin_unlock_bh(&prog->tcf_lock);
return skb->len;
nla_put_failure:
- spin_unlock(&prog->tcf_lock);
+ spin_unlock_bh(&prog->tcf_lock);
nlmsg_trim(skb, tp);
return -1;
}
prog = to_bpf(*act);
- spin_lock(&prog->tcf_lock);
+ spin_lock_bh(&prog->tcf_lock);
if (res != ACT_P_CREATED)
tcf_bpf_prog_fill_cfg(prog, &old);
prog->tcf_action = parm->action;
rcu_assign_pointer(prog->filter, cfg.filter);
- spin_unlock(&prog->tcf_lock);
+ spin_unlock_bh(&prog->tcf_lock);
if (res == ACT_P_CREATED) {
tcf_idr_insert(tn, *act);
}
params_new->update_flags = parm->update_flags;
- spin_lock(&p->tcf_lock);
+ spin_lock_bh(&p->tcf_lock);
p->tcf_action = parm->action;
rcu_swap_protected(p->params, params_new,
lockdep_is_held(&p->tcf_lock));
- spin_unlock(&p->tcf_lock);
+ spin_unlock_bh(&p->tcf_lock);
if (params_new)
kfree_rcu(params_new, rcu);
};
struct tcf_t t;
- spin_lock(&p->tcf_lock);
+ spin_lock_bh(&p->tcf_lock);
params = rcu_dereference_protected(p->params,
lockdep_is_held(&p->tcf_lock));
opt.action = p->tcf_action;
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
goto nla_put_failure;
- spin_unlock(&p->tcf_lock);
+ spin_unlock_bh(&p->tcf_lock);
return skb->len;
nla_put_failure:
- spin_unlock(&p->tcf_lock);
+ spin_unlock_bh(&p->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
gact = to_gact(*a);
- spin_lock(&gact->tcf_lock);
+ spin_lock_bh(&gact->tcf_lock);
gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
if (p_parm) {
gact->tcfg_ptype = p_parm->ptype;
}
#endif
- spin_unlock(&gact->tcf_lock);
+ spin_unlock_bh(&gact->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
};
struct tcf_t t;
- spin_lock(&gact->tcf_lock);
+ spin_lock_bh(&gact->tcf_lock);
opt.action = gact->tcf_action;
if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
tcf_tm_dump(&t, &gact->tcf_tm);
if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
goto nla_put_failure;
- spin_unlock(&gact->tcf_lock);
+ spin_unlock_bh(&gact->tcf_lock);
return skb->len;
nla_put_failure:
- spin_unlock(&gact->tcf_lock);
+ spin_unlock_bh(&gact->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
}
m = to_mirred(*a);
- spin_lock(&m->tcf_lock);
+ spin_lock_bh(&m->tcf_lock);
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
if (parm->ifindex) {
dev = dev_get_by_index(net, parm->ifindex);
if (!dev) {
- spin_unlock(&m->tcf_lock);
+ spin_unlock_bh(&m->tcf_lock);
tcf_idr_release(*a, bind);
return -ENODEV;
}
dev_put(dev);
m->tcfm_mac_header_xmit = mac_header_xmit;
}
- spin_unlock(&m->tcf_lock);
+ spin_unlock_bh(&m->tcf_lock);
if (ret == ACT_P_CREATED) {
spin_lock(&mirred_list_lock);
struct net_device *dev;
struct tcf_t t;
- spin_lock(&m->tcf_lock);
+ spin_lock_bh(&m->tcf_lock);
opt.action = m->tcf_action;
opt.eaction = m->tcfm_eaction;
dev = tcf_mirred_dev_dereference(m);
tcf_tm_dump(&t, &m->tcf_tm);
if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
goto nla_put_failure;
- spin_unlock(&m->tcf_lock);
+ spin_unlock_bh(&m->tcf_lock);
return skb->len;
nla_put_failure:
- spin_unlock(&m->tcf_lock);
+ spin_unlock_bh(&m->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
if (event == NETDEV_UNREGISTER) {
spin_lock(&mirred_list_lock);
list_for_each_entry(m, &mirred_list, tcfm_list) {
- spin_lock(&m->tcf_lock);
+ spin_lock_bh(&m->tcf_lock);
if (tcf_mirred_dev_dereference(m) == dev) {
dev_put(dev);
/* Note : no rcu grace period necessary, as
*/
RCU_INIT_POINTER(m->tcfm_dev, NULL);
}
- spin_unlock(&m->tcf_lock);
+ spin_unlock_bh(&m->tcf_lock);
}
spin_unlock(&mirred_list_lock);
}
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
struct psample_group *psample_group;
struct tc_sample *parm;
+ u32 psample_group_num;
struct tcf_sample *s;
bool exists = false;
int ret, err;
tcf_idr_release(*a, bind);
return -EEXIST;
}
- s = to_sample(*a);
- spin_lock(&s->tcf_lock);
- s->tcf_action = parm->action;
- s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
- s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
- psample_group = psample_group_get(net, s->psample_group_num);
+ psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
+ psample_group = psample_group_get(net, psample_group_num);
if (!psample_group) {
- spin_unlock(&s->tcf_lock);
tcf_idr_release(*a, bind);
return -ENOMEM;
}
+
+ s = to_sample(*a);
+
+ spin_lock_bh(&s->tcf_lock);
+ s->tcf_action = parm->action;
+ s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ s->psample_group_num = psample_group_num;
RCU_INIT_POINTER(s->psample_group, psample_group);
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
}
- spin_unlock(&s->tcf_lock);
+ spin_unlock_bh(&s->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
};
struct tcf_t t;
- spin_lock(&s->tcf_lock);
+ spin_lock_bh(&s->tcf_lock);
opt.action = s->tcf_action;
if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
goto nla_put_failure;
- spin_unlock(&s->tcf_lock);
+ spin_unlock_bh(&s->tcf_lock);
return skb->len;
nla_put_failure:
- spin_unlock(&s->tcf_lock);
+ spin_unlock_bh(&s->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
- spin_lock(&t->tcf_lock);
+ spin_lock_bh(&t->tcf_lock);
t->tcf_action = parm->action;
rcu_swap_protected(t->params, params_new,
lockdep_is_held(&t->tcf_lock));
- spin_unlock(&t->tcf_lock);
+ spin_unlock_bh(&t->tcf_lock);
if (params_new)
kfree_rcu(params_new, rcu);
};
struct tcf_t tm;
- spin_lock(&t->tcf_lock);
+ spin_lock_bh(&t->tcf_lock);
params = rcu_dereference_protected(t->params,
lockdep_is_held(&t->tcf_lock));
opt.action = t->tcf_action;
if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
&tm, TCA_TUNNEL_KEY_PAD))
goto nla_put_failure;
- spin_unlock(&t->tcf_lock);
+ spin_unlock_bh(&t->tcf_lock);
return skb->len;
nla_put_failure:
- spin_unlock(&t->tcf_lock);
+ spin_unlock_bh(&t->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
p->tcfv_push_prio = push_prio;
p->tcfv_push_proto = push_proto;
- spin_lock(&v->tcf_lock);
+ spin_lock_bh(&v->tcf_lock);
v->tcf_action = parm->action;
rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
- spin_unlock(&v->tcf_lock);
+ spin_unlock_bh(&v->tcf_lock);
if (p)
kfree_rcu(p, rcu);
};
struct tcf_t t;
- spin_lock(&v->tcf_lock);
+ spin_lock_bh(&v->tcf_lock);
opt.action = v->tcf_action;
p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
opt.v_action = p->tcfv_action;
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
- spin_unlock(&v->tcf_lock);
+ spin_unlock_bh(&v->tcf_lock);
return skb->len;
nla_put_failure:
- spin_unlock(&v->tcf_lock);
+ spin_unlock_bh(&v->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}