Quiescent-state reporting (the core mechanism):
a. After a CPU (a leaf of the tree) passes through a quiescent state, it reports that fact to its parent rcu_node.
b. Each parent rcu_node waits until all of its children (whether CPUs or other rcu_node structures) have reported a quiescent state.
c. Once an rcu_node has received reports from all of its children, it regards the entire subtree it represents as quiescent, and in turn reports to its own parent.
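The following toy program is a minimal sketch of that upward propagation, not kernel code: the names (struct toy_node, toy_report_qs), the two-leaf layout, and the single-threaded, lock-free model are illustrative assumptions. In the real rcu_node tree the same effect is achieved by clearing bits in ->qsmask under the node's lock and propagating upward via rcu_report_qs_rnp().

/*
 * Toy model of hierarchical quiescent-state reporting.
 * Names and layout are assumptions for illustration only.
 */
#include <stdio.h>

struct toy_node {
	struct toy_node *parent;   /* NULL for the root */
	unsigned long qsmask;      /* one bit per child still owed a QS */
	unsigned long grpmask;     /* this node's bit in parent->qsmask */
};

/* Clear @bit in @node; if the node becomes empty, propagate upward. */
static void toy_report_qs(struct toy_node *node, unsigned long bit)
{
	while (node) {
		node->qsmask &= ~bit;
		if (node->qsmask)        /* still waiting on other children */
			return;
		bit = node->grpmask;     /* whole subtree quiescent: go up */
		node = node->parent;
	}
	printf("grace period complete\n");   /* root emptied */
}

int main(void)
{
	struct toy_node root  = { .parent = NULL,  .qsmask = 0x3 };
	struct toy_node leaf0 = { .parent = &root, .qsmask = 0x3, .grpmask = 0x1 };
	struct toy_node leaf1 = { .parent = &root, .qsmask = 0x1, .grpmask = 0x2 };

	toy_report_qs(&leaf0, 0x1);  /* CPU 0 reports; leaf0 still waits on CPU 1 */
	toy_report_qs(&leaf1, 0x1);  /* leaf1 empties and reports to the root */
	toy_report_qs(&leaf0, 0x2);  /* leaf0 empties, root empties: GP ends */
	return 0;
}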
raw_lockdep_assert_held_rcu_node(rnp);
WARN_ON_ONCE(rdp->mynode != rnp);
WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
/* RCU better not be waiting on newly onlined CPUs! */
WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
	     rdp->grpmask);
/*
 * Decide where to queue the newly blocked task.  In theory,
 * this could be an if-statement.  In practice, when I tried
 * that, it was rather messy.
 */
switch (blkd_state) {
case 0:
case RCU_EXP_TASKS:
case RCU_EXP_TASKS | RCU_GP_BLKD:
case RCU_GP_TASKS:
case RCU_GP_TASKS | RCU_EXP_TASKS:
	/*
	 * Blocking neither GP, or first task blocking the normal
	 * GP but not blocking the already-waiting expedited GP.
	 * Queue at the head of the list to avoid unnecessarily
	 * blocking the already-waiting GPs.
	 */
	list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
	break;
case RCU_EXP_BLKD:
case RCU_GP_BLKD:
case RCU_GP_BLKD | RCU_EXP_BLKD:
case RCU_GP_TASKS | RCU_EXP_BLKD:
case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
	/*
	 * First task arriving that blocks either GP, or first task
	 * arriving that blocks the expedited GP (with the normal
	 * GP already waiting), or a task arriving that blocks
	 * both GPs with both GPs already waiting.  Queue at the
	 * tail of the list to avoid any GP waiting on any of the
	 * already queued tasks that are not blocking it.
	 */
	list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
	break;
case RCU_EXP_TASKS | RCU_EXP_BLKD:
case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD:
	/*
	 * Second or subsequent task blocking the expedited GP.
	 * The task either does not block the normal GP, or is the
	 * first task blocking the normal GP.  Queue just after
	 * the first task blocking the expedited GP.
	 */
	list_add(&t->rcu_node_entry, rnp->exp_tasks);
	break;
case RCU_GP_TASKS | RCU_GP_BLKD:
case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD:
	/*
	 * Second or subsequent task blocking the normal GP.
	 * The task does not block the expedited GP.  Queue just
	 * after the first task blocking the normal GP.
	 */
	list_add(&t->rcu_node_entry, rnp->gp_tasks);
	break;
default:
	/* Yet another exercise in excessive paranoia. */
	WARN_ON_ONCE(1);
	break;
}
/*
 * We have now queued the task.  If it was the first one to
 * block either grace period, update the ->gp_tasks and/or
 * ->exp_tasks pointers, respectively, to reference the newly
 * blocked tasks.
 */
if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
	WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
	WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
}
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
	WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry);
WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask));
WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != !(rnp->expmask & rdp->grpmask));
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
 * Report the quiescent state for the expedited GP.  This expedited
 * GP should not be able to end until we report, so there should be
 * no need to check for a subsequent expedited GP.  (Though we are
 * still in a quiescent state in any case.)
 *
 * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
 */
if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
	rcu_report_exp_rdp(rdp);
else
	WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
}
/*
 * Enqueue the specified callback onto the specified rcu_segcblist
 * structure, updating accounting as needed.  Note that the ->len
 * field may be accessed locklessly, hence the WRITE_ONCE().
 * The ->len field is used by rcu_barrier() and friends to determine
 * if it must post a callback on this structure, and it is OK
 * for rcu_barrier() to sometimes post callbacks needlessly, but
 * absolutely not OK for it to ever miss posting a callback.
 */
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
			   struct rcu_head *rhp)
{
	/* Increment the list's total length count. */
	rcu_segcblist_inc_len(rsclp);
	/* Increment the length of the RCU_NEXT_TAIL segment. */
	rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
	/* Append the callback (rhp) to the tail of the rcu_segcblist (the RCU_NEXT_TAIL segment). */
	rhp->next = NULL;
	WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
	WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
}
/*
 * Handle any core-RCU processing required by a call_rcu() invocation.
 */
static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
			  rcu_callback_t func, unsigned long flags)
{
	/* Enqueue the callback. */
	rcutree_enqueue(rdp, head, func);

	/*
	 * If called from an extended quiescent state, invoke the RCU
	 * core in order to force a re-evaluation of RCU's idleness.
	 */
	if (!rcu_is_watching())
		invoke_rcu_core();
	/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
	if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
		return;
	/* Force the grace period if too many callbacks or too long waiting. */
	if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
		     rdp->qlen_last_fqs_check + qhimark)) {

		/* Are we ignoring a completed grace period? */
		note_gp_changes(rdp);
		/* Start a new grace period if one not already started. */
		if (!rcu_gp_in_progress()) {
			rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
		} else {
			/* Give the grace period a kick. */
			rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
			if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
			    rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
				rcu_force_quiescent_state();
			rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
			rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
		}
	}
}
/**
 * call_rcu() - Queue an RCU callback for invocation after a grace period.
 * By default the callbacks are 'lazy' and are kept hidden from the main
 * ->cblist to prevent starting of grace periods too soon.
 * If you desire grace periods to start very soon, use call_rcu_hurry().
 *
 * @head: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all pre-existing RCU read-side
 * critical sections have completed.  However, the callback function
 * might well execute concurrently with RCU read-side critical sections
 * that started after call_rcu() was invoked.
 *
 * It is perfectly legal to repost an RCU callback, potentially with
 * a different callback function, from within its callback function.
 * The specified function will be invoked after another full grace period
 * has elapsed.  This use case is similar in form to the common practice
 * of reposting a timer from within its own handler.
 *
 * RCU read-side critical sections are delimited by rcu_read_lock()
 * and rcu_read_unlock(), and may be nested.  In addition, but only in
 * v5.0 and later, regions of code across which interrupts, preemption,
 * or softirqs have been disabled also serve as RCU read-side critical
 * sections.  This includes hardware interrupt handlers, softirq handlers,
 * and NMI handlers.
 *
 * Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing RCU read-side critical section.  On systems with more
 * than one CPU, this means that when "func()" is invoked, each CPU is
 * guaranteed to have executed a full memory barrier since the end of its
 * last RCU read-side critical section whose beginning preceded the call
 * to call_rcu().  It also means that each CPU executing an RCU read-side
 * critical section that continues beyond the start of "func()" must have
 * executed a memory barrier after the call_rcu() but before the beginning
 * of that RCU read-side critical section.  Note that these guarantees
 * include CPUs that are offline, idle, or executing in user mode, as
 * well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
 * resulting RCU callback function "func()", then both CPU A and CPU B are
 * guaranteed to execute a full memory barrier during the time interval
 * between the call to call_rcu() and the invocation of "func()" -- even
 * if CPU A and CPU B are the same CPU (but again only if the system has
 * more than one CPU).
 *
 * Implementation of these memory-ordering guarantees is described here:
 * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
 *
 * Specific to call_rcu() (as opposed to the other call_rcu*() functions),
 * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many
 * seconds before starting the grace period needed by the corresponding
 * callback.  This delay can significantly improve energy-efficiency
 * on low-utilization battery-powered devices.  To avoid this delay,
 * in latency-sensitive kernel code, use call_rcu_hurry().
 */
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	__call_rcu_common(head, func, enable_rcu_lazy);
}
EXPORT_SYMBOL_GPL(call_rcu);
This code implements call_rcu, one of the core interfaces of the Linux kernel's RCU (Read-Copy-Update) mechanism: it queues a callback to be invoked after a grace period has elapsed. RCU is an efficient synchronization mechanism used throughout the kernel wherever readers must run concurrently with updaters.
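For orientation, here is a minimal usage sketch of the API: an updater unpublishes an RCU-protected object and lets call_rcu() free it once all pre-existing readers are done. The struct name (my_obj), the callback (my_obj_free_rcu), and the helper (my_obj_retire) are illustrative assumptions, not code from the kernel sources quoted above.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct my_obj {
	int value;
	struct rcu_head rcu;   /* storage used by call_rcu() for queueing */
};

/* Runs after a grace period: no pre-existing reader can still see the object. */
static void my_obj_free_rcu(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kfree(obj);
}

/* Updater side: unpublish the object, then defer the actual free. */
static void my_obj_retire(struct my_obj __rcu **slot)
{
	struct my_obj *old;

	old = rcu_replace_pointer(*slot, NULL, true);   /* unpublish */
	if (old)
		call_rcu(&old->rcu, my_obj_free_rcu);   /* free after a grace period */
}

Note that the struct rcu_head is embedded in the object itself so that it remains valid until the callback runs; this is the usual pattern, and it is why call_rcu() takes a pointer to the rcu_head rather than to the object.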
/*
 * Compute the geometry of the rcu_node tree from the kernel parameters.
 * This cannot replace the definitions in tree.h, because those are
 * needed to size the ->node array in the rcu_state structure.
 */
void rcu_init_geometry(void)
{
	ulong d;
	int i;
	static unsigned long old_nr_cpu_ids;
	int rcu_capacity[RCU_NUM_LVLS];
	static bool initialized;
	if (initialized) {
		/*
		 * Warn if setup_nr_cpu_ids() had not yet been invoked,
		 * unless nr_cpus_ids == NR_CPUS, in which case who cares?
		 */
		WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
		return;
	}
	old_nr_cpu_ids = nr_cpu_ids;
	initialized = true;
	/*
	 * Initialize any unspecified boot parameters.
	 * The default values of jiffies_till_first_fqs and
	 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
	 * value, which is a function of HZ, then adding one for each
	 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
	 */
	d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
	if (jiffies_till_first_fqs == ULONG_MAX)
		jiffies_till_first_fqs = d;
	if (jiffies_till_next_fqs == ULONG_MAX)
		jiffies_till_next_fqs = d;
	adjust_jiffies_till_sched_qs();
	/* Silence gcc 4.8 false positive about array index out of range. */
	if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
		panic("rcu_init_one: rcu_num_lvls out of range");
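To make the geometry concrete, the toy program below re-derives the number of rcu_node levels for an assumed configuration. The values 16 (leaf fanout, CONFIG_RCU_FANOUT_LEAF) and 64 (interior fanout, CONFIG_RCU_FANOUT) are the usual 64-bit defaults, but they and the CPU count are assumptions here; the kernel takes the real values from the Kconfig options or the rcutree.rcu_fanout_leaf= boot parameter.

/*
 * Toy re-derivation of the rcu_node tree depth.
 * Fanout values and CPU count are illustrative assumptions.
 */
#include <stdio.h>

int main(void)
{
	int nr_cpu_ids = 256;          /* assumed CPU count */
	int fanout_leaf = 16;          /* assumed CONFIG_RCU_FANOUT_LEAF */
	int fanout = 64;               /* assumed CONFIG_RCU_FANOUT */
	long capacity = fanout_leaf;   /* CPUs coverable by a one-level tree */
	int levels = 1;

	while (capacity < nr_cpu_ids) {   /* add a level until all CPUs fit */
		capacity *= fanout;
		levels++;
	}
	printf("%d CPUs -> %d rcu_node level(s), capacity %ld CPUs\n",
	       nr_cpu_ids, levels, capacity);
	return 0;
}

With these assumptions, 256 CPUs need ceil(256 / 16) = 16 leaf nodes under a single root, so the tree has two levels; a third level would only be required beyond 64 * 16 = 1024 CPUs.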
	/* An incoming CPU should never be blocking a grace period. */
	if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
		/* rcu_report_qs_rnp() *really* wants some flags to restore */
		unsigned long flags;