Re: [PATCH RFC 3/9] RCU: Preemptible RCU
From: Paul E. McKenney <hidden>
Date: 2007-09-21 23:04:12
Also in:
lkml
On Fri, Sep 21, 2007 at 11:20:48AM -0400, Steven Rostedt wrote:
On Mon, Sep 10, 2007 at 11:34:12AM -0700, Paul E. McKenney wrote:quoted
+ +/* + * PREEMPT_RCU data structures. + */ + +#define GP_STAGES 4 +struct rcu_data { + spinlock_t lock; /* Protect rcu_data fields. */ + long completed; /* Number of last completed batch. */ + int waitlistcount; + struct tasklet_struct rcu_tasklet; + struct rcu_head *nextlist; + struct rcu_head **nexttail; + struct rcu_head *waitlist[GP_STAGES]; + struct rcu_head **waittail[GP_STAGES]; + struct rcu_head *donelist; + struct rcu_head **donetail; +#ifdef CONFIG_RCU_TRACE + struct rcupreempt_trace trace; +#endif /* #ifdef CONFIG_RCU_TRACE */ +}; +struct rcu_ctrlblk { + spinlock_t fliplock; /* Protect state-machine transitions. */ + long completed; /* Number of last completed batch. */ +}; +static DEFINE_PER_CPU(struct rcu_data, rcu_data); +static struct rcu_ctrlblk rcu_ctrlblk = { + .fliplock = SPIN_LOCK_UNLOCKED, + .completed = 0, +}; +static DEFINE_PER_CPU(int [2], rcu_flipctr) = { 0, 0 }; + +/* + * States for rcu_try_flip() and friends. + */ + +enum rcu_try_flip_states { + rcu_try_flip_idle_state, /* "I" */ + rcu_try_flip_waitack_state, /* "A" */ + rcu_try_flip_waitzero_state, /* "Z" */ + rcu_try_flip_waitmb_state /* "M" */ +}; +static enum rcu_try_flip_states rcu_try_flip_state = rcu_try_flip_idle_state; +#ifdef CONFIG_RCU_TRACE +static char *rcu_try_flip_state_names[] = + { "idle", "waitack", "waitzero", "waitmb" }; +#endif /* #ifdef CONFIG_RCU_TRACE */[snip]quoted
+/* + * If a global counter flip has occurred since the last time that we + * advanced callbacks, advance them. Hardware interrupts must be + * disabled when calling this function. + */ +static void __rcu_advance_callbacks(struct rcu_data *rdp) +{ + int cpu; + int i; + int wlc = 0; + + if (rdp->completed != rcu_ctrlblk.completed) { + if (rdp->waitlist[GP_STAGES - 1] != NULL) { + *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; + rdp->donetail = rdp->waittail[GP_STAGES - 1]; + RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); + } + for (i = GP_STAGES - 2; i >= 0; i--) { + if (rdp->waitlist[i] != NULL) { + rdp->waitlist[i + 1] = rdp->waitlist[i]; + rdp->waittail[i + 1] = rdp->waittail[i]; + wlc++; + } else { + rdp->waitlist[i + 1] = NULL; + rdp->waittail[i + 1] = + &rdp->waitlist[i + 1]; + } + } + if (rdp->nextlist != NULL) { + rdp->waitlist[0] = rdp->nextlist; + rdp->waittail[0] = rdp->nexttail; + wlc++; + rdp->nextlist = NULL; + rdp->nexttail = &rdp->nextlist; + RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); + } else { + rdp->waitlist[0] = NULL; + rdp->waittail[0] = &rdp->waitlist[0]; + } + rdp->waitlistcount = wlc; + rdp->completed = rcu_ctrlblk.completed; + } + + /* + * Check to see if this CPU needs to report that it has seen + * the most recent counter flip, thereby declaring that all + * subsequent rcu_read_lock() invocations will respect this flip. + */ + + cpu = raw_smp_processor_id(); + if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { + smp_mb(); /* Subsequent counter accesses must see new value */ + per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; + smp_mb(); /* Subsequent RCU read-side critical sections */ + /* seen -after- acknowledgement. */ + } +}[snip]quoted
+/* + * Attempt a single flip of the counters. Remember, a single flip does + * -not- constitute a grace period. Instead, the interval between + * at least three consecutive flips is a grace period. + * + * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation + * on a large SMP, they might want to use a hierarchical organization of + * the per-CPU-counter pairs. + */ +static void rcu_try_flip(void) +{ + unsigned long oldirq; + + RCU_TRACE_ME(rcupreempt_trace_try_flip_1); + if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) { + RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); + return; + } + + /* + * Take the next transition(s) through the RCU grace-period + * flip-counter state machine. + */ + + switch (rcu_try_flip_state) { + case rcu_try_flip_idle_state: + if (rcu_try_flip_idle()) + rcu_try_flip_state = rcu_try_flip_waitack_state; + break; + case rcu_try_flip_waitack_state: + if (rcu_try_flip_waitack()) + rcu_try_flip_state = rcu_try_flip_waitzero_state; + break; + case rcu_try_flip_waitzero_state: + if (rcu_try_flip_waitzero()) + rcu_try_flip_state = rcu_try_flip_waitmb_state; + break; + case rcu_try_flip_waitmb_state: + if (rcu_try_flip_waitmb()) + rcu_try_flip_state = rcu_try_flip_idle_state; + } + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); +}Paul, Looking further into this, I still think this is a bit of overkill. We go through 20 states from call_rcu to list->func(). On call_rcu we put our stuff on the next list. Before we move stuff from next to wait, we need to go through 4 states. So we have next -> 4 states -> wait[0] -> 4 states -> wait[1] -> 4 states -> wait[2] -> 4 states -> wait[3] -> 4 states -> done. That's 20 states that we go through from the time we add our function to the list to the time it actually gets called. Do we really need the 4 wait lists? Seems a bit overkill to me. What am I missing?
"Nothing kills like overkill!!!" ;-) Seriously, I do expect to be able to squeeze this down over time, but feel the need to be a bit on the cowardly side at the moment. In any case, I will be looking at the scenarios more carefully. If it turns out that GP_STAGES can indeed be cranked down a bit, well, that is an easy change! I just fired off a POWER run with GP_STAGES set to 3, will let you know how it goes. Thanx, Paul