linux内存管理（十四）-内存OOM触发分析

2023-10-29

在内存分配路径上，当内存不足的时候会触发kswapd、或者内存规整，极端情况会触发OOM，来获取更多内存。
在内存回收失败之后，会进行OOM，OOM的入口是__alloc_pages_may_oom，文件位于mm/page_alloc.c中：

static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
	const struct alloc_context *ac, unsigned long *did_some_progress)
{
	struct oom_control oc = {//OOM控制参数
		.zonelist = ac->zonelist,
		.nodemask = ac->nodemask,
		.memcg = NULL,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	struct page *page;

	*did_some_progress = 0;

	/*
	 * Acquire the oom lock.  If that fails, somebody else is
	 * making progress for us.
	 */
	//尝试加锁，如果获取不到锁则返回
	if (!mutex_trylock(&oom_lock)) {
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}

	/*
	 * Go through the zonelist yet one more time, keep very high watermark
	 * here, this is only to catch a parallel oom killing, we must fail if
	 * we're still under heavy pressure. But make sure that this reclaim
	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
	 * allocation which will never fail due to oom_lock already held.
	 */
	//尝试再次使用高水位分配内存一次，判断是否需要启动oom
	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
				      ~__GFP_DIRECT_RECLAIM, order,
				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
	if (page)
		goto out;

	/* Coredumps can quickly deplete all memory reserves */
	if (current->flags & PF_DUMPCORE)
		goto out;
	/* The OOM killer will not help higher order allocs */
	//OOM不可以分配高于PAGE_ALLOC_COSTLY_ORDER的阶数，也就是3阶
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;
	/*
	 * We have already exhausted all our reclaim opportunities without any
	 * success so it is time to admit defeat. We will skip the OOM killer
	 * because it is very likely that the caller has a more reasonable
	 * fallback than shooting a random task.
	 */
	//__GFP_NOFAIL是不允许内存申请失败的情况，如果不允许失败则从out退出
	if (gfp_mask & __GFP_RETRY_MAYFAIL)
		goto out;
	/* The OOM killer does not needlessly kill tasks for lowmem */
	//OOM不会为低端内存启动，如果是要分配低端内存则从out退出
	if (ac->high_zoneidx < ZONE_NORMAL)
		goto out;
	if (pm_suspended_storage())
		goto out;
	/*
	 * XXX: GFP_NOFS allocations should rather fail than rely on
	 * other request to make a forward progress.
	 * We are in an unfortunate situation where out_of_memory cannot
	 * do much for this context but let's try it to at least get
	 * access to memory reserved if the current task is killed (see
	 * out_of_memory). Once filesystems are ready to handle allocation
	 * failures more gracefully we should just bail out here.
	 */

	/* The OOM killer may not free memory on a specific node */
	//OOM不会释放特定节点上的内存
	if (gfp_mask & __GFP_THISNODE)
		goto out;

	/* Exhausted what can be done so it's blame time */
	//经过上面各种情况，仍然需要进行OOM处理。调用out_of_memory()。
	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
		*did_some_progress = 1;

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves
		 */
		//对于__GFP_NOFAIL的分配情况，需要降低分配标准到无水线
		if (gfp_mask & __GFP_NOFAIL)
			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
					ALLOC_NO_WATERMARKS, ac);
	}
out:
	mutex_unlock(&oom_lock);
	return page;
}

__alloc_pages_may_oom会先配置OOM参数，再尝试高水位分配内存，成功就返回，失败再判断标志位等判断是否适合oom，适合才会调用out_of_memory进行OOM操作:

bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;
	enum oom_constraint constraint = CONSTRAINT_NONE;

	//在freeze_processes会将其置位，即禁止OOM；因为在冻结过程不允许OOM
	if (oom_killer_disabled)
		return false;

	if (!is_memcg_oom(oc)) {//检查是否有资格杀死进程启动oom
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0)
			/* Got some memory back in the last second. */
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	//如果当前进程将要退出，或者释放内存
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);//标记当前进程作为OOM候选者
		wake_oom_reaper(current);//唤醒OOM reaper去收割进而释放内存
		return true;//由于当前进程由于自身原因将要退出，不需要经过下面的打分和杀死流程
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory lost its gfp context so we have to
	 * make sure exclude 0 mask - all other users should have at least
	 * ___GFP_DIRECT_RECLAIM to get here.
	 */
	//如果内存申请掩码包括__GFP_DS或__GFP_NOFAIL，则不进行OOM收割。
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
		return true;

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA and memcg) that may require different handling.
	 */
	//检查是否需要处理不同的分配限制
	constraint = constrained_alloc(oc);
	if (constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	//检查sysctl_panic_on_oom设置，以及是否由sysrq触发，来决定是否触发panic。
	check_panic_on_oom(oc, constraint);

	//如果设置了sysctl_oom_kill_allocating_task，那么当内存耗尽时，会把当前申请内存分配的进程杀掉。
	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);//遍历所有进程和进程下的线程，查找合适的候选进程
	/* Found nothing?!?! */
	if (!oc->chosen) {//如果没有合适候选进程，并且OOM不是由sysrq触发的，进入panic。
		dump_header(oc, NULL);
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * If we got here due to an actual allocation at the
		 * system level, we cannot survive this and will enter
		 * an endless loop in the allocator. Bail out now.
		 */
		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
			panic("System is deadlocked on memory\n");
	}
	if (oc->chosen && oc->chosen != (void *)-1UL)
		//杀死选中的进程
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory");
	return !!oc->chosen;
}

out_of_memory函数是OOM机制的核心，他主要做了三件事：

调用select_bad_process调挑选最’bad‘的进程
如果没有合适的进程，则调用dump_header，打印OOM信息，找出OOM原因
调用oom_kill_process杀死第一步挑选的进程

我们先看select_bad_process函数怎么选择最差的进程：

static void select_bad_process(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		struct task_struct *p;

		rcu_read_lock();
		
		for_each_process(p)//遍历系统范围内所有进程线程
			if (oom_evaluate_task(p, oc))//评估每个进程的得分
				break;
		rcu_read_unlock();
	}

	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}

select_bad_process主要是遍历系统范围所有进程，并且调用oom_evaluate_task函数对每一个进程评分：

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	unsigned long points;

	//进程1以及内核线程等等不能被kill的线程跳过
	if (oom_unkillable_task(task, NULL, oc->nodemask))
		goto next;

	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves unless
	 * the task has MMF_OOM_SKIP because chances that it would release
	 * any memory is quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		goto abort;
	}

	/*
	 * If task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	//如果任务分配了大量的内存则优先选择触发OOM的进程
	if (oom_task_origin(task)) {
		points = ULONG_MAX;
		goto select;
	}

	//启发式函数，以确定哪个候选任务杀死
	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
	//这里保证只取最高分的进程，所以分数最高者被选中。其他情况则直接跳过
	if (!points || points < oc->chosen_points)
		goto next;

	/* Prefer thread group leaders for display purposes */
	//选择线程组的所属的进程
	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
		goto next;
select:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;//更新OOM选中的进程和当前最高分
	oc->chosen_points = points;
next:
	return 0;
abort:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	oc->chosen = (void *)-1UL;
	return 1;
}

oom_evaluate_task首先会跳过不可以被杀死的内核进程，然后判断如果一个进程是否分配了大量内存空间就直接返回优先杀死该进程并且不用往下走去评分，否则往下走通过oom_badness函数评分后返回，每次返回的分数和OC结构体的分数比较好后把较大的分数和其进程保存在OC中。oom_badness()是给进程打分的函数，可以说是核心中的核心。现在看看oom_badness：

unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long points;
	long adj;

	//如果进程不可被杀，直接跳过
	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	p = find_lock_task_mm(p);//找到进程p，并使用task_lock()锁上
	if (!p)
		return 0;

	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable or have been already oom reaped or the are in
	 * the middle of vfork
	 */
	//获取当前进程的oom_score_adj参数
	adj = (long)p->signal->oom_score_adj;
	//如果当前进程oom_score_adj为最小值，说明此进程不参数'bad'评比，返回0
	if (adj == OOM_SCORE_ADJ_MIN ||
			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
			in_vfork(p)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	//计算oom分数，也就是每个任务的rss、可分页和交换空间使用的RAM比例
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	task_unlock(p);//解锁

	/* Normalize to oom_score_adj units */
	//参数oom_score_adj归一化处理，这里可以看出oom_score_adj对最终分数的影响，
	adj *= totalpages / 1000;
	//将归一化后的adj和points求和，作为当前进程的分数
	points += adj;

	/*
	 * Never return 0 for an eligible task regardless of the root bonus and
	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	return points > 0 ? points : 1;
}

oom_badness首先跳过不可以杀死的进程，然后上进程锁，获取当前进程的oom_score_adj参数，如果oom_score_adj是最低值就不参加评选（我们设置某些进程不可以被杀死，可以通过设置这个参数达到目的的），mm->flags为MMF_OOM_SKIP的进程不参加评选，处于vfork()中的进程也不参加评选，最后就是计算进程的分数了，进程的得分取决于其消耗的内存大小再加上oom_score_adj的归一化处理结果。
select_bad_process调挑选最’bad‘的进程讲完了，现在说说dump_header如何打印OOM信息：

static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	//显示在哪个进程中触发了OOM
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask,
		nodemask_pr_args(oc->nodemask), oc->order,
			current->signal->oom_score_adj);
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");

	cpuset_print_current_mems_allowed();
	dump_stack();//输出当前现场的栈信息
	if (is_memcg_oom(oc))
		mem_cgroup_print_oom_info(oc->memcg, p);
	else {
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);//输出整个系统的内存使用情况
		if (is_dump_unreclaim_slabs())
			dump_unreclaimable_slab();
	}
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc->memcg, oc->nodemask);//显示系统所有进程的内存使用情况
}

static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	struct task_struct *p;
	struct task_struct *task;

	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
	rcu_read_lock();
	for_each_process(p) {
		if (oom_unkillable_task(p, memcg, nodemask))//不可被kill的进程不显示
			continue;

		task = find_lock_task_mm(p);//内核线程等没有自己的mm，也无法被kill，所以不显示
		if (!task) {
			/*
			 * This is a kthread or all of p's threads have already
			 * detached their mm's.  There's no need to report
			 * them; they can't be oom killed anyway.
			 */
			continue;
		}

		//打印输出pid，kuid,tgid,total_vm和rss等信息
		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
			task->pid, from_kuid(&init_user_ns, task_uid(task)),
			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
			mm_pgtables_bytes(task->mm),
			get_mm_counter(task->mm, MM_SWAPENTS),
			task->signal->oom_score_adj, task->comm);
		task_unlock(task);
	}
	rcu_read_unlock();
}

dump_header主要打印了一下几种信息：

触发了OOM的进程信息
现场的栈信息
整个系统的内存使用情况

现在说说oom_kill_process杀死第一步挑选的进程的流程：

static void oom_kill_process(struct oom_control *oc, const char *message)
{
	struct task_struct *p = oc->chosen;
	unsigned int points = oc->chosen_points;
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t;
	struct mem_cgroup *oom_group;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly
	 */
	task_lock(p);
	
	//如果p进程将要退出，或者释放内存
	if (task_will_free_mem(p)) {
		mark_oom_victim(p);//标记p进程作为OOM候选者
		wake_oom_reaper(p);//唤醒OOM reaper去收割进而释放内存
		task_unlock(p);//解锁
		put_task_struct(p);//释放进程内核栈，彻底释放了内核线程所占的系统资源
		return;
	}
	task_unlock(p);

	if (__ratelimit(&oom_rs))
		dump_header(oc, p);//打印系统栈信息、内存信息、所有进程的内存消耗情况

	//打印输出将要kill掉的进程名、pid、score
	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);

	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest oom_badness() score is sacrificed for its
	 * parent.  This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	read_lock(&tasklist_lock);

	/*
	 * The task 'p' might have already exited before reaching here. The
	 * put_task_struct() will free task_struct 'p' while the loop still try
	 * to access the field of 'p', so, get an extra reference.
	 */
	get_task_struct(p);
	//遍历进程下的线程
	for_each_thread(p, t) {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			//如果子进程有自己单独的mm内存空间，则可以被选中代提父进程被kill
			if (process_shares_mm(child, p->mm))
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			//计算子线程的得分情况
			child_points = oom_badness(child,
				oc->memcg, oc->nodemask, oc->totalpages);
			//将得分最高者计为victim，得分为victim_points。
			if (child_points > victim_points) {
				put_task_struct(victim);//释放进程内核栈，彻底释放了内核线程所占的系统资源
				victim = child;
				victim_points = child_points;
				get_task_struct(victim);
			}
		}
	}
	put_task_struct(p);//释放进程内核栈，彻底释放了内核线程所占的系统资源
	read_unlock(&tasklist_lock);

	/*
	 * Do we need to kill the entire memory cgroup?
	 * Or even one of the ancestor memory cgroups?
	 * Check this out before killing the victim task.
	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

	__oom_kill_process(victim);

	/*
	 * If necessary, kill all tasks in the selected memory cgroup.
	 */
	if (oom_group) {
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
		mem_cgroup_put(oom_group);
	}
}

oom_kill_process首先判断要杀死的进程如果要退出或者释放内存，就直接标记这个进程要oom并且唤醒OOM reaper去收割释放内存，否则就遍历进程下的全部线程，计算每一个线程得分，选择分数最高的线程标记，然后调用__oom_kill_process收割：

static void __oom_kill_process(struct task_struct *victim)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;

	p = find_lock_task_mm(victim);
	if (!p) {
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Raise event before sending signal: task reaper must see this */
	//在发送信号之前引发OOM_KILL事件，让任务收割者知道
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	//发送SIGKILL信号给victim进程
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
	mark_oom_victim(victim);//标注TIF_MEMDIE是因为OOM被杀死
	
	//打印被kill进程的内存信息
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any.  They don't get access to memory reserves, though, to avoid
	 * depletion of all memory.  This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * its contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	//遍历相关线程处理共享内存
	for_each_process(p) {
		if (!process_shares_mm(p, mm))//如果子进程没有自己单独的mm内存空间，则跳过
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
					task_pid_nr(victim), victim->comm,
					task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No use_mm() user needs to read from the userspace so we are
		 * ok to reap it.
		 */
		//内核线程跳过
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		wake_oom_reaper(victim);//唤醒OOM Reaper内核线程收割

	mmdrop(mm);//释放mm空间的内存。包括申请的页面、mm结构体等
	put_task_struct(victim);//释放task_struct占用的内存空间，包括cgroup等等。
}

__oom_kill_process第一步通知任务收割者知道现在准备引发OOM_KILL事件，接着发送SIGKILL信号给victim进程，并且标注因为OOM被杀死的标志，然后打印被kill进程的内存信息，最后遍历进程下所有线程处理共享内存，因为共享内存其他进程还需要使用，不可以释放，最后调用wake_oom_reaper唤醒OOM Reaper内核线程收割。我们看看wake_oom_reaper如何唤醒OOM Reaper内核线程收割：

static void wake_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	get_task_struct(tsk);

	spin_lock(&oom_reaper_lock);
	tsk->oom_reaper_list = oom_reaper_list;
	oom_reaper_list = tsk;
	spin_unlock(&oom_reaper_lock);
	trace_wake_reaper(tsk->pid);
	wake_up(&oom_reaper_wait);
}

wake_oom_reaper首先看看oom的队列是否准备好，然后设置oom回收的队列，最后唤醒oom_reaper_wait队列来收割进程。我们再看看oom_reaper_wait是什么队列呢？查找并且分析后发现，原来linux内核有一个函数叫oom_init，他是linux初始化的时候运行的，因为他是subsys_initcall队列中的，我们看看一些oom相关的全局变量和oom_init函数：

static struct task_struct *oom_reaper_th;//oom_reaper内核线程的task_struct指针
//OOM Reaper等待队列，oom_reaper线程在此等待，当有OOM产生的时候唤醒等待队列，并从oom_reaper_list中获取待收割进程结构体
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;//待收割的进程
static DEFINE_SPINLOCK(oom_reaper_lock);//oom线程锁

static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
	return 0;
}
subsys_initcall(oom_init)

oom_reaper_wait这个队列是通过DECLARE_WAIT_QUEUE_HEAD静态初始化的，而oom_init会创建一个内核线程oom_reaper，oom_reaper：

static int oom_reaper(void *unused)
{
	while (true) {
		struct task_struct *tsk = NULL;
		
		//oom_reaper在此睡眠，直到有OOM产生并且通过wake_oom_reaper()唤醒
		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);//收割OOM选中的最bad进程
	}

	return 0;
}

oom_reaper中就一直等待oom_reaper_wait工作队列，这个工作队列就是wake_oom_reaper唤醒的工作队列，等到wake_oom_reaper函数执行并且唤醒oom_reaper_wait工作队列后，第一步是给oom线程上锁，记录oom_reaper_list上要收割的进程后清空oom_reaper_list 队列，最后调用oom_reap_task函数收割进程。oom_reap_task：

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* Retry the down_read_trylock(mmap_sem) a few times */
	//最多重试10次，每次间隔100ms，进行收割
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	if (attempts <= MAX_OOM_REAP_RETRIES ||
	    test_bit(MMF_OOM_SKIP, &mm->flags))
		goto done;

	//收割失败，显示系统hold住的lock信息
	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
		task_pid_nr(tsk), tsk->comm);
	debug_show_all_locks();

done:
	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from OOM killer because it has been either reaped or
	 * somebody can't call up_write(mmap_sem).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop a reference taken by wake_oom_reaper */
	put_task_struct(tsk);
}

可以看到oom_reap_task最多会进行重试10次，每次间隔100ms，执行oom_reap_task_mm进行收割，成功会返回，失败会报错后返回。现在看看oom_reap_task_mm怎么进行收割:

static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = true;

	if (!down_read_trylock(&mm->mmap_sem)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_sem for reading because it serializes against the
	 * down_write();up_write() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		trace_skip_task_reaping(tsk->pid);
		goto out_unlock;
	}

	trace_start_task_reaping(tsk->pid);

	/* failed to reap part of the address space. Try again later */
	//主要收割函数
	ret = __oom_reap_task_mm(mm);
	if (!ret)
		goto out_finish;

	//显示经过oom_reaper之后的victim进程所占用的内存信息
	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
	trace_finish_task_reaping(tsk->pid);
out_unlock:
	up_read(&mm->mmap_sem);

	return ret;
}

oom_reap_task_mm主要通过__oom_reap_task_mm收割进程，__oom_reap_task_mm：

bool __oom_reap_task_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	bool ret = true;

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable. No barriers really needed because unmapping
	 * should imply barriers already and the reader would hit a page fault
	 * if it stumbled over a reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	//遍历进程所有vma区域
	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
		//如果vma是被锁住、巨型页、物理特殊页则跳过
		if (!can_madv_dontneed_vma(vma))
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * which are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
		 */
		//匿名的非共享页
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
			const unsigned long start = vma->vm_start;
			const unsigned long end = vma->vm_end;
			struct mmu_gather tlb;

			tlb_gather_mmu(&tlb, mm, start, end);//初始化 mmu_gather 结构体用于页表拆卸
			
			//通知mmu使vma区域失效
			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
				tlb_finish_mmu(&tlb, start, end);//释放 mmu_gather 结构体
				ret = false;
				continue;
			}
			unmap_page_range(&tlb, vma, start, end, NULL);//系统解除vma映射
			
			//通知mmu使vma区域失效完成
			mmu_notifier_invalidate_range_end(mm, start, end);
			tlb_finish_mmu(&tlb, start, end);//释放 mmu_gather 结构体
		}
	}

	return ret;
}

static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}

__oom_reap_task_mm会遍历进程每一个vma，首先跳过被锁住、巨型页、物理特殊页的vma，然后对匿名的非共享页的vma区域进行下面的操作：

初始化 mmu_gather 结构体用于页表拆卸
通知mmu使vma区域失效
系统解除vma映射
通知mmu使vma区域失效完成
释放 mmu_gather 结构体

到此，oom收割进程，释放内存就结束了。

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

linux内存管理（十四）-内存OOM触发分析的相关文章

iptables通过注释删除特定规则

我需要删除一些具有相同评论的规则例如我有带有 comment test it 的规则所以我可以像这样获得它们的列表 sudo iptables t nat L grep test it 但是我怎样才能删除所有带有注释测试它的 PR
SSE：跨页边界的未对齐加载和存储

我在页面边界旁边执行未对齐加载或存储之前读过某处例如使用 mm loadu si128 mm storeu si128内在函数代码应首先检查整个向量在本例中为 16 个字节是否属于同一页如果不属于同一页则切换到非向量指令我知道
将 jar 作为 Linux 服务运行 - init.d 脚本在启动应用程序时卡住

我目前正在致力于在 Linux VM 上实现一个可运行的 jar 作为后台服务我已经使用了找到的例子here https gist github com shirish4you 5089019作为工作的基础并将 start 方法修改为
查找哪些页面不再与写入时复制共享

假设我在 Linux 中有一个进程我从中fork 另一个相同的过程后forking 因为原始进程将开始写入内存 Linux写时复制机制将为进程提供与分叉进程使用的不同的唯一物理内存页在执行的某个时刻我如何知道原始进程的哪些页面已被写
如何通过ssh检查ubuntu服务器上是否存在php和apache

如何通过ssh检查Ubuntu服务器上apache是否安装了php和mysql 另外如果安装的话在哪个目录如果安装了其他软件包例如 lighttpd 那么它在哪里确定程序是否已安装的另一种方法是使用which命令它将显示您正在搜索
如何使用 GOPATH 的 Samba 服务器位置？

我正在尝试将 GOPATH 设置为共享网络文件夹当我进入 export GOPATH smb path to shared folder I get go GOPATH entry is relative must be absolute
CoAP数据包的大小是多少？

我是这项技术的新手有人可以帮助我了解一些疑问吗 Q 1 CoAP数据包的大小是多少我知道有 4 字节固定标头但是包括标头选项和负载在内的最大大小限制是多少 Q 2 有像MQTT那样的Keep Alive的概念吗它在UDP上工作它
按进程名称过滤并记录 CPU 使用情况

Linux 下有选项吗顶部命令 https www man7 org linux man pages man1 top 1 html我可以在哪里按名称过滤进程并将每秒该进程的 CPU 使用情况写入日志文件 top pgrep 过滤输出top
需要一些建议来开始在 ARM（使用 Linux）平台上编程

我也许很快就会在托管 Linux 发行版的 ARM 平台上工作我不知道哪个发行版我知道该项目涉及视频流但我无法告诉你更多信息其实我只收到通知还没见到任何人我从来没有在这样的平台上工作过所以我的想法是在项目开始之前进行测试
使用 Grep 查找两个短语之间的文本块（包括短语）

是否可以使用 grep 来高亮所有以以下内容开头的文本 mutablePath CGPathCreateMutable 并以以下内容结尾 CGPathAddPath skinMutablePath NULL mutablePath 这两个短
确定我可以向文件句柄写入多少内容；将数据从一个 FH 复制到另一个 FH

如何确定是否可以将给定数量的字节写入文件句柄实际上是套接字或者如何取消读取我从其他文件句柄读取的数据我想要类似的东西 n how much can I write w handle n read r handle buf n a
在 Mac OSX 上交叉编译 x86_64-unknown-linux-gnu 失败

我尝试将我的 Rust 项目之一编译到 x86 64 unknown linux gnu 目标 cargo build target x86 64 unknown linux gnu Compiling deployer v0 1 0 fi
在 C++ linux 中将 STRINGS 写入串口

我知道这个问题遍布互联网但仍然没有任何东西能让我完全解决这个问题我想用 C linux 将数据写入 Propeller 板的串行端口从控制台获取输入时程序运行良好但是当我向它写入字符串时总是返回 ERROR Invalid comm
Capistrano 3 部署无法连接到 GitHub - 权限被拒绝（公钥）

我使用 Capistrano v3 和 capistrano symfony gem 设置了以下部署脚本我正在使用 Ubuntu 14 4 部署到 AWS EC2 实例我正在连接从 AWS 下载的 pem 文件我的deploy rb中
git 错误：无法处理 https

当我尝试使用 git clone 时https xxx https xxx我收到以下错误我不处理协议 https 有人可以帮我吗完整消息 dementrock dementrock A8Se git 克隆https git innosta
使用 plistBuddy 获取值数组

var keychain access groups declare a val usr libexec PlistBuddy c Print var sample plist echo val echo val 0 Ouput Array
如何从 PROC 获取有关子进程的信息

我正在尝试编写一个以几个进程作为参数的程序然后父进程执行每个子进程并打印出一些相关的统计信息示例 generate ls l 将生成一个程序打印出有关 ls l 的一些统计信息特别是其系统时间用户时间和上下文切换次数我不想使用
Gearman，php 扩展问题：使用终端在 .. 中找不到类“GearmanWorker”，但可以在浏览器上使用

我最近在 ubuntu 10 04 上安装了 gearman 并安装了它的 pecl 扩展现在当我在浏览器中运行一个 php 文件时其中包含 client new GearmanWorker die var Dump client I
vagrant ssh -c 并在连接关闭后保持后台进程运行

我正在编写一个脚本来启动和后台流浪机器内的进程似乎每次脚本结束和 ssh 会话结束时后台进程也会结束这是我正在运行的命令 vagrant ssh c cd vagrant src nohup python hello py gt he
嵌入式linux编写AT命令

我在向 GSM 模块写入 AT 命令时遇到问题当我使用 minicom b 115200 D dev ttySP0 term vt100 时它工作完美但我不知道如何在 C 代码中做同样的事情我没有收到任何错误但模块对命令没有反应有

随机推荐

python怎么遍历文件夹_python遍历文件夹,指定遍历深度与忽略目录的方法

背景需要在文件夹中搜索某一文件找到后返回此文件所在目录用最常规的os listdir 方式实现了一版但执行时报错递归超过最大深度于是自己添加了点功能之所有写此函数是为了让它适应不同的项目因为有项目要找的文件在第一层有的在第
趣味面试题

趣味面试题集锦最近看了不少关于求职面试的试题在其中发现了不少有意思的题目特整理后发表上来与大家一起分享为了方便与以后添加的题目区别每次更新会以 A B 的方式标注 A 1 如何将 a b 的值进行交换并且不使用任何中间变量解析
【超详细】使用Git上传本地文件夹到GitHub main branch

最近对Tensorflow Object Detection Tutorial进行更新想把本地建好的文件夹上传到main brach而不是master brach 一新建仓库 Repository 在GitHub上登陆你的账号点击 N
qDebug 学习小结

在qtcentre中看到有网友问这样一个问题 Why this doesn t work qDebug lt lt Test lt lt std endl 第一反应这两个东西本来就不能这样搭配使用啊第二反应额如何解释这个问题呢还真
SQL基础教程系列_基于Postgresql_创建表时的中文乱码问题(附如何修改注册表更改dos encoding方式)

创建表时的中文乱码问题具体分为两种一种是输入进去就不显示中文第二种是当上传到服务器端会返回乱码具体原因可以参考 http www cnblogs com winkey4986 p 6279243 html 一篇博客及 http w
localStorage.getItem

1 localStorage getItem WEB应用的快速发展是的本地存储一些数据也成为一种重要的需求实现的方案也有很多最普通的就是cookie了大家也经常都用但是cookie的缺点是显而易见的其他的方案比如 IE6以上的u
【华为OD机试】最大花费金额 (C++ Python Java)2023 B卷

题目描述双十一众多商品进行打折销售小明想购买自己心仪的一些物品但由于受购买资金限制所以他决定从众多心仪商品中购买三件而且想尽可能的花完资金现在请你设计一个程序帮助小明计算尽可能花费的最大资金数额输入描述输入第一行为一维整型数
【AI with ML】第 1 章：TensorFlow 简介

大家好我是Sonhhxg 柒希望你看完之后能对你有所帮助不足请指正共同学习交流个人主页 Sonhhxg 柒的博客 CSDN博客欢迎各位点赞收藏留言系列专栏机器学习 ML 自然语言处理 NLP 深度学习 DL fore
并发编程（三）——多线程之间如何实现通讯

前言欢迎大家一起来学习多线程大家一起来学习吧并发编程一多线程快速入门并发编程二内存模型并发编程三多线程之间如何实现通讯并发编程四 JUC并发包常用方法介绍并发编程五线程池及原理剖析并发编程六 java中锁
设置PWM占空比中TIM_SetCompare1，TIM_SetCompare2,TIM_SetCompare3,TIM_SetCompare4分别对应引脚和ADC通道对应引脚

这个函数TIM SetCompare1 这个函数有四个分别是TIM SetCompare1 TIM SetCompare2 TIM SetCompare3 TIM SetCompare4 位于CH1那一行的GPIO口使用TIM SetCo
leetcode-283-移动零

移动零给定一个数组 nums 编写一个函数将所有 0 移动到数组的末尾同时保持非零元素的相对顺序示例输入 0 1 0 3 12 输出 1 3 12 0 0 说明必须在原数组上操作不能拷贝额外的数组尽量减少操作次数解法一计算
STM32L4-RS485+DMA中断通信实验+字节丢失处理[寄存器版]

基本设置 MCU采用 STM32L431RCT6 485芯片采用 ADM3485 采用串口经由RS485 使用DMA向串口调试助手传送数据相关配置与前文基本相同 STM32L4 双路RS485自收发通信实验寄存器版 staypt的博客
我看鸿蒙操作系统

华为宣布推出鸿蒙操作系统其实我觉得能理解但也蛮无奈的所谓不得已而为之 google不提供后续版本授权不提供新的支持怎么办硬着头皮也要上有些自媒体说什么安卓慌了 google吓坏了我真的想骂人一群王八蛋为了点击率什么都敢写
日志分析入侵检测--实战

收到客户服务器被入侵消息希望我给他做一次入侵排查刚做完顺手记录一下入侵排查思路 0x01 日志分析 1 爆破ip统计 grep i Failed var log secure awk print NF 3 sort uniq c so
fgets()

fgets 函数简介读字符串函数fgets 的功能是从指定的文件中读一个字符串到字符数组中函数调用的形式为 fgets 字符数组名 n 文件指针要从键盘输入时文件指针的参数为 stdin 其中的n是一个正整数表示从文件中读出的字符串
Springboot中创建拦截器

目录目的实现过程 1 创建拦截器 2 注册拦截器完整代码目的在Springboot项目中创建拦截器在进入Controller层之前拦截请求可对拦截到的请求内容做响应处理如校验请求参数验证证书等操作实现过程 1 创建拦截
PTS测试

PTS性能测试一什么是性能测试性能测试PTS Performance Testing Service 是一款简单易用具备强大的分布式压测能力的SaaS压测平台 PTS可以模拟复杂的业务场景并快速精准地调度不同规模的流量同时提供压
JAVA中String及String常用的方法

String string是表示字符串的字符串类 public class StringDemo public static void main String args 常见面试题 String s new String hello 问如
Assuming drive cache: write through

我也遇到过关机重启就可以了
linux内存管理（十四）-内存OOM触发分析

在内存分配路径上当内存不足的时候会触发kswapd 或者内存规整极端情况会触发OOM 来获取更多内存在内存回收失败之后会进行OOM OOM的入口是 alloc pages may oom 文件位于mm page alloc c中 s

linux内存管理（十四）-内存OOM触发分析

linux内存管理（十四）-内存OOM触发分析 的相关文章

随机推荐

热门标签

linux内存管理（十四）-内存OOM触发分析的相关文章