OOM Killer 持续更新中

2023-11-03

虚拟地址空间的概念大家都很清楚：Linux 认为用户进程调用 malloc 申请了动态内存后，不见得马上就会对这段内存进行读写操作。
Linux使用了拖到最后的分配机制——用时分配机制。
但是，即使是用时分配，也不见得总有内存可分配。当进程太多，显得内存（加上swap）不足时，就有问题了。所有进程使用的内存量PrM是一定的，而系统物理内存量PyM本身又是有限的，也是一定的。

当用时分配机制导致 PrM > PyM 时，Linux的机制是无奈但又必须Kill一些重要性低的进程来回收内存，即OOM Killer（Out of memory killer）。

以arm64 arch 为例：

从文件entry.S开始，此文件的作用有必要搞懂：异常向量表。
arch/arm64/kernel/entry.S

el0_ia:
...
bl do_el0_ia_bp_hardening //bl指令是转移到子程序执行，
//并且事先保存当前位置的下一条指令地址到X30寄存器（lr，AArch64的链接寄存器；32位ARM上对应R14），
//于是子程序可以通过ret指令（跳转到lr中保存的地址）来达到函数返回的效果。
arch/arm64/mm/fault.c :: asmlinkage void __exception do_el0_ia_bp_hardening(){do_mem_abort()}
same file fault.c:: void do_mem_abort(){ const struct fault_info *inf = esr_to_fault_info(esr); inf->fn(addr, esr, regs); }
same file fault.c:: static const struct fault_info fault_info[] = {
{...}, {...}, ..., {do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault"}, ..., {...}
};

kernel/arch/arm64/mm/fault.c中进行线性地址到物理地址的转换时，第一级页表包含的入口地址有误，
此时do_translation_fault(){do_page_fault(addr,esr,regs);}
do_page_fault(){当内存不足时会调用：pagefault_out_of_memory();}
void pagefault_out_of_memory(void){out_of_memory(&oc)}
bool out_of_memory(struct oom_control *oc){select_bad_process(oc)}
static int oom_evaluate_task(struct task_struct *task, void *arg){oom_badness(task,NULL,oc->nodemask, oc->totalpages);}

函数 oom_badness() 的注释说得比较清楚，源码见下面的链接，全文复制如下。

https://github.com/torvalds/linux/blob/master/mm/oom_kill.c

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
*
* Copyright (C) 1998,2000 Rik van Riel
*   Thanks go out to Claus Fischer for some serious inspiration and
*   for goading me into coding this file...
* Copyright (C) 2010 Google, Inc.
*   Rewritten by David Rientjes
*
* The routines in this file are used to kill a process when
* we're seriously out of memory. This gets called from __alloc_pages()
* in mm/page_alloc.c when we really run out of memory.
*
* Since we won't call these routines often (on a well-configured
* machine) this file will double as a 'coding guide' and a signpost
* for newbie kernel hackers. It features several pointers to major
* kernel subsystems and hints as to where to find out what things do.
*/

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>
#include "internal.h"
#include "slab.h"

#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>

/*
 * Panic policy consulted by check_panic_on_oom(): 0 never panics, 2 panics
 * for every constraint type, any other non-zero value panics only when the
 * OOM is unconstrained (CONSTRAINT_NONE).
 */
int sysctl_panic_on_oom;
/*
 * Non-zero: for non-memcg OOMs out_of_memory() kills the allocating task
 * (current), when it is eligible, instead of scanning for a victim.
 */
int sysctl_oom_kill_allocating_task;
/* Non-zero (the default): dump_header() also prints the per-task table. */
int sysctl_oom_dump_tasks = 1;

/*
 * Serializes oom killer invocations (out_of_memory()) from all contexts to
 * prevent over-eager oom killing (e.g. when the oom killer is invoked
 * from different domains).
 *
 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
 * and mark_oom_victim().
 */
DEFINE_MUTEX(oom_lock);

#ifdef CONFIG_NUMA
/**
 * has_intersects_mems_allowed() - check task eligibility for kill
 * @start: task whose threads are examined
 * @mask: nodemask passed to page allocator for mempolicy ooms
 *
 * Walks every thread of @start under rcu_read_lock(). For a mempolicy
 * constrained oom (@mask != NULL) the thread's cpuset is irrelevant and only
 * its mempolicy is compared against @mask, so that unrelated tasks are not
 * needlessly killed. Otherwise only the cpuset mems of the thread are
 * compared against those of current.
 *
 * Return: true as soon as one thread intersects, false if none does.
 */
static bool has_intersects_mems_allowed(struct task_struct *start,
					const nodemask_t *mask)
{
	struct task_struct *tsk;
	bool intersects = false;

	rcu_read_lock();
	for_each_thread(start, tsk) {
		intersects = mask ?
			mempolicy_nodemask_intersects(tsk, mask) :
			cpuset_mems_allowed_intersects(current, tsk);
		if (intersects)
			break;
	}
	rcu_read_unlock();

	return intersects;
}
#else
/* !CONFIG_NUMA variant: every task is reported as intersecting. */
static bool has_intersects_mems_allowed(struct task_struct *tsk,
					const nodemask_t *mask)
{
	return true;
}
#endif /* CONFIG_NUMA */

/*
* The process p may have detached its own ->mm while exiting or through
* use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
* task_lock() held.
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
struct task_struct *t;

rcu_read_lock();

   for_each_thread(p, t) {
       task_lock(t);
       if (likely(t->mm))
           goto found;
       task_unlock(t);
   }
   t = NULL;
found:
   rcu_read_unlock();

return t;
}

/*
 * order == -1 means the oom kill is required by sysrq, otherwise only
 * for display purposes.
 */
static inline bool is_sysrq_oom(struct oom_control *oc)
{
	return -1 == oc->order;
}

static inline bool is_memcg_oom(struct oom_control *oc)
{
return oc->memcg != NULL;
}

/*
 * Return true if the task is not adequate as a candidate victim:
 *  - it is the global init task,
 *  - it is a kernel thread (PF_KTHREAD),
 *  - a memcg is given and p is not a member of it
 *    (the mem_cgroup_out_of_memory() case),
 *  - p may not have freeable memory in nodemask.
 * The checks run in that order and stop at the first one that matches.
 */
static bool oom_unkillable_task(struct task_struct *p,
		struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	return is_global_init(p) ||
	       (p->flags & PF_KTHREAD) ||
	       (memcg && !task_in_mem_cgroup(p, memcg)) ||
	       !has_intersects_mems_allowed(p, nodemask);
}

/*
 * Decide whether unreclaimable slab info should be printed: true when the
 * amount of unreclaimable slab is greater than all user memory (LRU pages,
 * including isolated and unevictable ones).
 */
static bool is_dump_unreclaim_slabs(void)
{
	unsigned long lru_pages;

	lru_pages  = global_node_page_state(NR_ACTIVE_ANON);
	lru_pages += global_node_page_state(NR_INACTIVE_ANON);
	lru_pages += global_node_page_state(NR_ACTIVE_FILE);
	lru_pages += global_node_page_state(NR_INACTIVE_FILE);
	lru_pages += global_node_page_state(NR_ISOLATED_ANON);
	lru_pages += global_node_page_state(NR_ISOLATED_FILE);
	lru_pages += global_node_page_state(NR_UNEVICTABLE);

	return global_node_page_state(NR_SLAB_UNRECLAIMABLE) > lru_pages;
}

/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of which task we should calculate
 * @memcg: task's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 * @totalpages: total present RAM allowed for page allocation
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible. The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 *
 * Return: 0 for a task that must not be chosen, otherwise a score of at
 * least 1.
 */
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long score;
	long adj;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	/* From here on p is the thread that owns the mm, with task_lock held. */
	p = find_lock_task_mm(p);
	if (!p)
		return 0;

	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable, have already been oom reaped, or are in the middle
	 * of vfork.
	 */
	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN ||
	    test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
	    in_vfork(p)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that
	 * each task's rss, pagetable and swap space use.
	 */
	score = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	task_unlock(p);

	/* Normalize to oom_score_adj units */
	adj *= totalpages / 1000;
	score += adj;

	/*
	 * Never return 0 for an eligible task regardless of the root bonus
	 * and oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	if (score > 0)
		return score;
	return 1;
}

/* Printable names of the oom constraints, indexed by enum oom_constraint. */
static const char * const oom_constraint_text[] = {
	[CONSTRAINT_NONE]		= "CONSTRAINT_NONE",
	[CONSTRAINT_CPUSET]		= "CONSTRAINT_CPUSET",
	[CONSTRAINT_MEMORY_POLICY]	= "CONSTRAINT_MEMORY_POLICY",
	[CONSTRAINT_MEMCG]		= "CONSTRAINT_MEMCG",
};

/*
 * Determine the type of allocation constraint and, as a side effect, set
 * oc->totalpages to the amount of memory the constrained allocation could
 * have used.
 */
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
	bool cpuset_limited = false;
	struct zoneref *z;
	struct zone *zone;
	int node;

	if (is_memcg_oom(oc)) {
		/* Never leave totalpages at 0 for a memcg oom. */
		oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
		return CONSTRAINT_MEMCG;
	}

	/* Default to all available memory */
	oc->totalpages = totalram_pages() + total_swap_pages;

	if (!IS_ENABLED(CONFIG_NUMA))
		return CONSTRAINT_NONE;

	if (!oc->zonelist)
		return CONSTRAINT_NONE;

	/*
	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
	 * killing current; a random task has to be killed in this case.
	 * Hopefully, CONSTRAINT_THISNODE... but no way to handle it, now.
	 */
	if (oc->gfp_mask & __GFP_THISNODE)
		return CONSTRAINT_NONE;

	/*
	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
	 * the page allocator means a mempolicy is in effect. Cpuset policy
	 * is enforced in get_page_from_freelist().
	 */
	if (oc->nodemask &&
	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(node, *oc->nodemask)
			oc->totalpages += node_spanned_pages(node);
		return CONSTRAINT_MEMORY_POLICY;
	}

	/* Check this allocation failure is caused by cpuset's wall function */
	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
					high_zoneidx, oc->nodemask) {
		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
			cpuset_limited = true;
	}

	if (!cpuset_limited)
		return CONSTRAINT_NONE;

	oc->totalpages = total_swap_pages;
	for_each_node_mask(node, cpuset_current_mems_allowed)
		oc->totalpages += node_spanned_pages(node);
	return CONSTRAINT_CPUSET;
}

/*
 * Per-task callback of the victim scan. @arg is the struct oom_control of
 * the scan; oc->chosen / oc->chosen_points track the best candidate so far,
 * and a reference is held on oc->chosen.
 *
 * Returns 0 to continue the scan, or 1 to abort it, in which case
 * oc->chosen is set to (void *)-1UL.
 */
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	unsigned long score;

	if (oom_unkillable_task(task, NULL, oc->nodemask))
		return 0;

	/*
	 * This task already has access to memory reserves and is being
	 * killed. Don't allow any other task to have access to the reserves
	 * unless the task has MMF_OOM_SKIP because chances that it would
	 * release any memory is quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			return 0;

		/* Abort the scan: drop the current candidate, if any. */
		if (oc->chosen)
			put_task_struct(oc->chosen);
		oc->chosen = (void *)-1UL;
		return 1;
	}

	if (oom_task_origin(task)) {
		/*
		 * The task is allocating a lot of memory and has been marked
		 * to be killed first if it triggers an oom: select it.
		 */
		score = ULONG_MAX;
	} else {
		score = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
		if (!score || score < oc->chosen_points)
			return 0;

		/* Prefer thread group leaders for display purposes */
		if (score == oc->chosen_points &&
		    thread_group_leader(oc->chosen))
			return 0;
	}

	/* Replace the current candidate with this task. */
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;
	oc->chosen_points = score;
	return 0;
}

/*
* Simple selection loop. We choose the process with the highest number of
* 'points'. In case scan was aborted, oc->chosen is set to -1.
*/
static void select_bad_process(struct oom_control *oc)
{
   if (is_memcg_oom(oc))
       mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
   else {
       struct task_struct *p;

       rcu_read_lock();
       for_each_process(p)
           if (oom_evaluate_task(p, oc))
               break;
       rcu_read_unlock();
   }

oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}

/**
 * dump_tasks - dump current memory state of all system tasks
 * @memcg: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * Dumps the current memory state of all eligible tasks. Tasks not in the same
 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss,
 * pgtables_bytes, swapents, oom_score_adj value, and name.
 */
static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	struct task_struct *proc;
	struct task_struct *owner;

	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");

	rcu_read_lock();
	for_each_process(proc) {
		if (oom_unkillable_task(proc, memcg, nodemask))
			continue;

		/*
		 * NULL means this is a kthread or all of proc's threads have
		 * already detached their mm's. There's no need to report
		 * them; they can't be oom killed anyway.
		 */
		owner = find_lock_task_mm(proc);
		if (!owner)
			continue;

		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
			owner->pid, from_kuid(&init_user_ns, task_uid(owner)),
			owner->tgid, owner->mm->total_vm,
			get_mm_rss(owner->mm),
			mm_pgtables_bytes(owner->mm),
			get_mm_counter(owner->mm, MM_SWAPENTS),
			owner->signal->oom_score_adj, owner->comm);
		task_unlock(owner);
	}
	rcu_read_unlock();
}

/*
 * Print the one line summary of the oom killer context: constraint,
 * nodemask, cpuset and memcg context, then the victim's name, pid and uid.
 */
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
{
	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
		oom_constraint_text[oc->constraint],
		nodemask_pr_args(oc->nodemask));
	cpuset_print_current_mems_allowed();
	mem_cgroup_print_oom_context(oc->memcg, victim);
	pr_cont(",task=%s,pid=%d,uid=%d\n",
		victim->comm,
		victim->pid,
		from_kuid(&init_user_ns, task_uid(victim)));
}

/*
 * Emit the oom report header: who invoked the oom killer and with which
 * allocation parameters, a stack dump, memory state (memcg or global), the
 * optional task table, and, when a victim @p is given, the summary line.
 */
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
		current->signal->oom_score_adj);
	if (oc->order && !IS_ENABLED(CONFIG_COMPACTION))
		pr_warn("COMPACTION is disabled!!!\n");

	dump_stack();

	if (!is_memcg_oom(oc)) {
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
		if (is_dump_unreclaim_slabs())
			dump_unreclaimable_slab();
	} else {
		mem_cgroup_print_oom_meminfo(oc->memcg);
	}

	if (sysctl_oom_dump_tasks)
		dump_tasks(oc->memcg, oc->nodemask);

	if (p)
		dump_oom_summary(oc, p);
}

/*
 * Number of OOM victims in flight: incremented by mark_oom_victim() and
 * decremented by exit_oom_victim().
 */
static atomic_t oom_victims = ATOMIC_INIT(0);
/* Woken by exit_oom_victim() when oom_victims drops to zero. */
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

/* When set, out_of_memory() returns false without killing anything. */
static bool oom_killer_disabled __read_mostly;

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * task->mm can be NULL if the task is the exited group leader. So to
 * determine whether the task is using a particular mm, we examine all the
 * task's threads: the first thread that still has an mm decides the answer.
 *
 * Returns false when no thread of p has an mm.
 */
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		struct mm_struct *thread_mm = READ_ONCE(t->mm);

		if (!thread_mm)
			continue;
		return thread_mm == mm;
	}

	return false;
}

#ifdef CONFIG_MMU
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
/* Task returned by kthread_run() in oom_init(). */
static struct task_struct *oom_reaper_th;
/* The reaper thread sleeps here until oom_reaper_list is non-NULL. */
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
/* Head of the queue of victims, linked through task->oom_reaper_list. */
static struct task_struct *oom_reaper_list;
/* Taken around every update of oom_reaper_list. */
static DEFINE_SPINLOCK(oom_reaper_lock);

/*
 * Unmap what can be cheaply dropped from @mm.
 *
 * Returns true when every eligible vma was processed, false when at least
 * one vma had to be skipped because its mmu notifier could not be
 * invalidated without blocking.
 */
bool __oom_reap_task_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	bool complete = true;

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable. No barriers really needed because unmapping
	 * should imply barriers already and the reader would hit a page fault
	 * if it stumbled over a reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		struct mmu_notifier_range range;
		struct mmu_gather tlb;

		if (!can_madv_dontneed_vma(vma))
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * which are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
		 */
		if (!vma_is_anonymous(vma) && (vma->vm_flags & VM_SHARED))
			continue;

		mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
					vma, mm, vma->vm_start,
					vma->vm_end);
		tlb_gather_mmu(&tlb, mm, range.start, range.end);
		if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
			/* Would block: skip this vma and report a partial reap. */
			tlb_finish_mmu(&tlb, range.start, range.end);
			complete = false;
			continue;
		}
		unmap_page_range(&tlb, vma, range.start, range.end, NULL);
		mmu_notifier_invalidate_range_end(&range);
		tlb_finish_mmu(&tlb, range.start, range.end);
	}

	return complete;
}

/*
 * Reaps the address space of the given task.
 *
 * Returns true on success and false if none or part of the address space
 * has been reclaimed and the caller should retry later.
 */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool reaped = true;

	if (!down_read_trylock(&mm->mmap_sem)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_sem for reading because it serializes against the
	 * down_write();up_write() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		/* Nothing left to do; this still counts as success. */
		trace_skip_task_reaping(tsk->pid);
	} else {
		trace_start_task_reaping(tsk->pid);

		/* On a partial reap stay quiet; the caller tries again later. */
		reaped = __oom_reap_task_mm(mm);
		if (reaped)
			pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
				task_pid_nr(tsk), tsk->comm,
				K(get_mm_counter(mm, MM_ANONPAGES)),
				K(get_mm_counter(mm, MM_FILEPAGES)),
				K(get_mm_counter(mm, MM_SHMEMPAGES)));

		trace_finish_task_reaping(tsk->pid);
	}

	up_read(&mm->mmap_sem);

	return reaped;
}

/* Upper bound on oom_reap_task_mm() attempts per victim. */
#define MAX_OOM_REAP_RETRIES 10

/*
 * Reap one queued victim: try oom_reap_task_mm() up to
 * MAX_OOM_REAP_RETRIES times, sleeping HZ/10 between failed attempts, then
 * unconditionally hide the mm from the OOM killer and drop the task
 * reference.
 */
static void oom_reap_task(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->signal->oom_mm;
	int attempts = 0;

	/* Retry the down_read_trylock(mmap_sem) a few times */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	/* Complain only if every attempt failed and nobody else set the skip bit. */
	if (attempts > MAX_OOM_REAP_RETRIES &&
	    !test_bit(MMF_OOM_SKIP, &mm->flags)) {
		pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
			task_pid_nr(tsk), tsk->comm);
		debug_show_all_locks();
	}

	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from OOM killer because it has been either reaped or
	 * somebody can't call up_write(mmap_sem).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop a reference taken by wake_oom_reaper */
	put_task_struct(tsk);
}

/*
 * Thread function of the oom reaper: wait until the queue is non-empty,
 * pop its head under oom_reaper_lock and reap it. Loops forever.
 */
static int oom_reaper(void *unused)
{
	for (;;) {
		struct task_struct *victim;

		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);

		spin_lock(&oom_reaper_lock);
		victim = oom_reaper_list;
		if (victim != NULL)
			oom_reaper_list = victim->oom_reaper_list;
		spin_unlock(&oom_reaper_lock);

		if (victim)
			oom_reap_task(victim);
	}

	return 0;
}

static void wake_oom_reaper(struct task_struct *tsk)
{
   /* mm is already queued? */
   if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
       return;

get_task_struct(tsk);

   spin_lock(&oom_reaper_lock);
   tsk->oom_reaper_list = oom_reaper_list;
   oom_reaper_list = tsk;
   spin_unlock(&oom_reaper_lock);
   trace_wake_reaper(tsk->pid);
   wake_up(&oom_reaper_wait);
}

/*
 * Start the "oom_reaper" kernel thread at subsys initcall time.
 * NOTE(review): the kthread_run() result is stored without being checked.
 */
static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");

	return 0;
}
subsys_initcall(oom_init)
#else
/* !CONFIG_MMU variant: there is no oom reaper, so this does nothing. */
static inline void wake_oom_reaper(struct task_struct *tsk) { }
#endif /* CONFIG_MMU */

/**
 * mark_oom_victim - mark the given task as OOM victim
 * @tsk: task to mark
 *
 * Has to be called with oom_lock held and never after
 * oom has been disabled already.
 *
 * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
 * under task_lock or operate on the current).
 */
static void mark_oom_victim(struct task_struct *tsk)
{
	struct mm_struct *victim_mm = tsk->mm;

	WARN_ON(oom_killer_disabled);

	/* OOM killer might race with memcg OOM: mark each task only once. */
	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
		return;

	/*
	 * oom_mm is bound to the signal struct life time; only the first
	 * marker installs it and pins the mm.
	 */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, victim_mm)) {
		mmgrab(tsk->signal->oom_mm);
		set_bit(MMF_OOM_VICTIM, &victim_mm->flags);
	}

	/*
	 * Make sure that the task is woken up from uninterruptible sleep
	 * if it is frozen because OOM killer wouldn't be able to free
	 * any memory and livelock. freezing_slow_path will tell the freezer
	 * that TIF_MEMDIE tasks should be ignored.
	 */
	__thaw_task(tsk);
	atomic_inc(&oom_victims);
	trace_mark_victim(tsk->pid);
}

/**
 * exit_oom_victim - note the exit of an OOM victim
 *
 * Clears TIF_MEMDIE on the calling thread and wakes everybody waiting on
 * oom_victims_wait once the number of victims in flight reaches zero.
 */
void exit_oom_victim(void)
{
	clear_thread_flag(TIF_MEMDIE);

	if (atomic_dec_return(&oom_victims) == 0)
		wake_up_all(&oom_victims_wait);
}

/**
 * oom_killer_enable - enable OOM killer
 *
 * Clears oom_killer_disabled and logs the change.
 */
void oom_killer_enable(void)
{
	oom_killer_disabled = false;
	pr_info("OOM killer enabled.\n");
}

/**
 * oom_killer_disable - disable OOM killer
 * @timeout: maximum timeout to wait for oom victims in jiffies
 *
 * Forces all page allocations to fail rather than trigger OOM killer.
 * Will block and wait until all OOM victims are killed or the given
 * timeout expires.
 *
 * The function cannot be called when there are runnable user tasks because
 * the userspace would see unexpected allocation failures as a result. Any
 * new usage of this function should be consulted with MM people.
 *
 * Returns true if successful and false if the OOM killer cannot be
 * disabled.
 */
bool oom_killer_disable(signed long timeout)
{
	signed long remaining;

	/*
	 * Make sure to not race with an ongoing OOM killer. Check that the
	 * current is not killed (possibly due to sharing the victim's memory).
	 */
	if (mutex_lock_killable(&oom_lock))
		return false;
	oom_killer_disabled = true;
	mutex_unlock(&oom_lock);

	remaining = wait_event_interruptible_timeout(oom_victims_wait,
			!atomic_read(&oom_victims), timeout);
	if (remaining > 0) {
		pr_info("OOM killer disabled.\n");
		return true;
	}

	/* Timed out or interrupted while victims remain: undo the disable. */
	oom_killer_enable();
	return false;
}

/*
 * Signal-state part of task_will_free_mem(): true when the whole thread
 * group is exiting, or when @task is the only thread in its group and is
 * itself exiting.
 */
static inline bool __task_will_free_mem(struct task_struct *task)
{
	struct signal_struct *sig = task->signal;

	/*
	 * A coredumping process may sleep for an extended period in exit_mm(),
	 * so the oom killer cannot assume that the process will promptly exit
	 * and release memory.
	 */
	if (sig->flags & SIGNAL_GROUP_COREDUMP)
		return false;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		return true;

	return thread_group_empty(task) && (task->flags & PF_EXITING);
}

/*
 * Checks whether the given task is dying or exiting and likely to
 * release its address space. This means that all threads and processes
 * sharing the same mm have to be killed or exiting.
 * Caller has to make sure that task->mm is stable (hold task_lock or
 * it operates on the current).
 */
static bool task_will_free_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct task_struct *other;
	bool all_dying = true;

	/*
	 * Skip tasks without mm because it might have passed its exit_mm and
	 * exit_oom_victim. oom_reaper could have rescued that but do not rely
	 * on that for now. We can consider find_lock_task_mm in future.
	 */
	if (!mm)
		return false;

	if (!__task_will_free_mem(task))
		return false;

	/*
	 * This task has already been drained by the oom reaper so there are
	 * only small chances it will free some more
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return false;

	/* Nobody else uses the mm, so there is nothing more to check. */
	if (atomic_read(&mm->mm_users) <= 1)
		return true;

	/*
	 * Make sure that all tasks which share the mm with the given tasks
	 * are dying as well to make sure that a) nobody pins its mm and
	 * b) the task is also reapable by the oom reaper.
	 */
	rcu_read_lock();
	for_each_process(other) {
		if (!process_shares_mm(other, mm) ||
		    same_thread_group(task, other))
			continue;

		all_dying = __task_will_free_mem(other);
		if (!all_dying)
			break;
	}
	rcu_read_unlock();

	return all_dying;
}

/*
 * Kill @victim and every other user process sharing its mm.
 *
 * Consumes the caller's reference on @victim. @message prefixes the
 * "Killed process" log line.
 */
static void __oom_kill_process(struct task_struct *victim, const char *message)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;

	p = find_lock_task_mm(victim);
	if (!p) {
		/* No thread of the victim has an mm: nothing to kill. */
		put_task_struct(victim);
		return;
	}
	if (p != victim) {
		/* Continue with the thread that actually holds the mm. */
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Raise event before sending signal: task reaper must see this */
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
	mark_oom_victim(victim);
	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
		message, task_pid_nr(victim), victim->comm,
		K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any. They don't get access to memory reserves, though, to avoid
	 * depletion of all memory. This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * its contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			/* init pins the mm: skip it and do not reap this mm. */
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
				task_pid_nr(victim), victim->comm,
				task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No use_mm() user needs to read from the userspace so we are
		 * ok to reap it.
		 */
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		wake_oom_reaper(victim);

	mmdrop(mm);
	put_task_struct(victim);
}
#undef K

/*
 * Kill provided task unless it's secured by setting
 * oom_score_adj to OOM_SCORE_ADJ_MIN, or it is the global init task.
 * Always returns 0.
 */
static int oom_kill_memcg_member(struct task_struct *task, void *message)
{
	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN ||
	    is_global_init(task))
		return 0;

	/* __oom_kill_process() consumes this reference. */
	get_task_struct(task);
	__oom_kill_process(task, message);

	return 0;
}

/*
 * Kill oc->chosen, or merely help it exit if it is already on its way out.
 * The reference held on oc->chosen is consumed. @message prefixes the kill
 * log line.
 */
static void oom_kill_process(struct oom_control *oc, const char *message)
{
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct task_struct *victim = oc->chosen;
	struct mem_cgroup *oom_group;

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly
	 */
	task_lock(victim);
	if (task_will_free_mem(victim)) {
		mark_oom_victim(victim);
		wake_oom_reaper(victim);
		task_unlock(victim);
		put_task_struct(victim);
		return;
	}
	task_unlock(victim);

	if (__ratelimit(&oom_rs))
		dump_header(oc, victim);

	/*
	 * Do we need to kill the entire memory cgroup?
	 * Or even one of the ancestor memory cgroups?
	 * Check this out before killing the victim task.
	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

	__oom_kill_process(victim, message);

	if (!oom_group)
		return;

	/* If necessary, kill all tasks in the selected memory cgroup. */
	mem_cgroup_print_oom_group(oom_group);
	mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
			      (void*)message);
	mem_cgroup_put(oom_group);
}

/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl
 * and, if so, dumps the oom header and panics.
 */
static void check_panic_on_oom(struct oom_control *oc)
{
	if (likely(!sysctl_panic_on_oom))
		return;

	/*
	 * Any value other than 2 only affects CONSTRAINT_NONE: the kernel
	 * does not panic for cpuset, mempolicy, or memcg allocation
	 * failures.
	 */
	if (sysctl_panic_on_oom != 2 && oc->constraint != CONSTRAINT_NONE)
		return;

	/* Do not panic for oom kills triggered by sysrq */
	if (is_sysrq_oom(oc))
		return;

	dump_header(oc, NULL);
	panic("Out of memory: %s panic_on_oom is enabled\n",
	      sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}

/*
 * Notifier chain invoked by out_of_memory() for non-memcg OOMs before any
 * victim is selected; it is passed a pointer to a "freed" counter.
 */
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

/*
 * Add @nb to the oom notifier chain. Returns the result of
 * blocking_notifier_chain_register().
 */
int register_oom_notifier(struct notifier_block *nb)
{
	int err = blocking_notifier_chain_register(&oom_notify_list, nb);

	return err;
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

/*
 * Remove @nb from the oom notifier chain. Returns the result of
 * blocking_notifier_chain_unregister().
 */
int unregister_oom_notifier(struct notifier_block *nb)
{
	int err = blocking_notifier_chain_unregister(&oom_notify_list, nb);

	return err;
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/*
 * Decide whether the oom_kill_allocating_task shortcut applies, i.e. whether
 * the task that triggered the OOM (current) should be killed directly instead
 * of scanning for a victim. The checks run in a fixed order and stop at the
 * first one that fails.
 */
static bool oom_kill_allocating_task_allowed(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		return false;
	if (!sysctl_oom_kill_allocating_task)
		return false;
	if (!current->mm)
		return false;
	if (oom_unkillable_task(current, NULL, oc->nodemask))
		return false;

	return current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN;
}

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * When memory is exhausted the options are to kill an arbitrary task (bad),
 * to let the system crash (worse), or to pick the victim with some care.
 * The choice does not need to be perfect, only good enough.
 *
 * Return: false if the OOM killer is disabled or no victim was chosen,
 * true otherwise (including the early exits where nothing is killed).
 */
bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;
	const char *reason;

	if (oom_killer_disabled)
		return false;

	if (!is_memcg_oom(oc)) {
		/* Give registered notifiers a chance to release memory. */
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed != 0)
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is already exiting, pick it
	 * right away: letting it allocate helps it exit quickly and give
	 * its memory back.
	 */
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);
		wake_oom_reaper(current);
		return true;
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory() has no gfp context and passes a zero
	 * mask, so a zero mask must be let through here; every other caller
	 * is expected to carry at least ___GFP_DIRECT_RECLAIM.
	 */
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
		return true;

	/*
	 * Work out whether the allocation was constrained (only relevant for
	 * NUMA and memcg), since that may call for different handling. The
	 * nodemask is only kept for mempolicy-constrained allocations.
	 */
	oc->constraint = constrained_alloc(oc);
	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	check_panic_on_oom(oc);

	if (oom_kill_allocating_task_allowed(oc)) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);

	/* Found nothing?!?! */
	if (oc->chosen == NULL) {
		dump_header(oc, NULL);
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * For a genuine system-level allocation there is no way to
		 * survive: the allocator would loop forever. Bail out now.
		 */
		if (!(is_sysrq_oom(oc) || is_memcg_oom(oc)))
			panic("System is deadlocked on memory\n");
	}

	/* A chosen value of (void *)-1UL is a sentinel, not a task to kill. */
	if (oc->chosen != NULL && oc->chosen != (void *)-1UL) {
		reason = is_memcg_oom(oc) ? "Memory cgroup out of memory" :
					    "Out of memory";
		oom_kill_process(oc, reason);
	}

	return oc->chosen != NULL;
}

/*
 * Entry point for the page fault handler when it runs out of memory: try to
 * kill a memory-hogging task. If somebody else already holds oom_lock, an
 * OOM kill is in progress in parallel, so there is nothing to do here.
 */
void pagefault_out_of_memory(void)
{
	/* No allocation context is available on this path. */
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = NULL,
		.gfp_mask = 0,
		.order = 0,
	};

	/* A true return here means there is nothing left for this path to do. */
	if (mem_cgroup_oom_synchronize(true))
		return;

	if (mutex_trylock(&oom_lock)) {
		out_of_memory(&oc);
		mutex_unlock(&oom_lock);
	}
}

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

OOM Killer 持续更新中的相关文章

【Linux内核设计与实现】Linux内核简介

之前在读 APUE 的时候更多的是从上层去了解如何使用Linux系统的API 那个时候就十分喜欢Linux的设计觉得一切都很奇妙最近有些迷茫不知道自己以后更加具体的技术方向在哪所以最近广泛阅读了很多方面的书 C 方面服务端网络编
基础笔记（二）：设计模式摘录

基础笔记二设计模式摘录前言收录一些自己在开发过程中比较常用的模式整理出来以便自己复习毕竟熟才能生巧才能变通举一反三设计模式在大多数灵活性好可扩展性高可移植的优秀程序中都有运用比如界面展现层运用的MVC模式的主要关系就
Linux 下系统调用的三种方法

系统调用 System Call 是操作系统为在用户态运行的进程与硬件设备如CPU 磁盘打印机等进行交互提供的一组接口当用户进程需要发生系统调用时 CPU 通过软中断切换到内核态开始执行内核系统调用函数下面介绍Linux 下三种发
linux内核中GPIO的使用（一）--IO内存

一相关概念使用IO内存将物理地址映射为虚拟地址再通过对虚拟地址的操作来控制硬件所谓的IO内存是指一种编址方式不同cpu平台使用的编址方式不同一种是 IO内存方式也叫统一编址方式是指内存和外设的地址是在同一个地址空间上的如
linux boot 查看module信息

1 查看内置模块信息 lib modules uname r modules builtin 如 cat lib modules linux4 15 0 modules builtin 或者grep y boot config 4 15 0
本博客全文目录索引

本专栏博文索引目录涵盖 C C STL Data Structure Algorithm TCP IP Linux Interface Driver Kernel Netfilter 和 Projects C C 详解C指针 C 对象模
linux内核epoll实现分析

为了更好的分享体验博客搬迁至极客驿站欢迎查阅 epoll与select poll的区别 select poll epoll都是IO多路复用的机制 I O多路复用就通过一种机制可以监视多个描述符一旦某个描述符就绪能够通知程序进行相应
kzalloc 函数详解

用kzalloc申请内存的时候效果等同于先是用 kmalloc 申请空间然后用 memset 来初始化所有申请的元素都被初始化为 0 kzalloc allocate memory The memory is set to zero
imx6ull_kernel_移植

1 文件目录结构 2 顶层Makefile理解 3 kernel启动流程 4 kernel移植心得跟着左神一步一步操作目前没有太多的理解硬件适配中网络驱动和emmc 8线适配讲解的最多文件移植部分基本就是拷贝和粘贴 Q 1 如果要
Linux 高级进程管理

1 让出处理器 Linux提供一个系统调用允许进程主动让出执行权 sched_yield 进程运行得好好的为什么需要这个函数呢有一种情况是用户空间线程的锁定如果一个线程试图取得另一个线程所持有的锁则新的线程应该让出处理器直到该锁变为
linux源代码.tar.xz解压

刚开始学习linux内核在linux内核官网https www kernel org 下载我下载的版本是 linux 2 6 34 14 tar xz 由于我的linux中没有安装 xz的解压缩软件需要下载 http download
gcov 和 perf 使用的基本套路备忘 ubuntu

一源代码 cat helloSeven c include
面试题创作0005，请说明Linux 和 AI的关系（联系和区别）

请说明Linux 和 AI的关系联系和区别可以在AI的业务应用平台服务提供平台设备商集成电路开发等各个跟AI相关的行业来寻找联系和区别
面试题创作0001，请解释mmap的细节

1 请列举Linux的几种IPC工具 2 重解释共享内存的实现原理 3 两个进程A和B共享到的同一页物理内存如果被A进程勾进CPU的Cache 那么B进程访问这段内存数据时将会从内存中访问还是从Cache中访问呢可以X86为例或其
Android中Log信息的输出方法

共两篇文章第一篇讲述了如何在程序中输出Log信息第二篇详细的分析了Log信息的输出机制下面是第一篇转自 http blog 163 com binghaitao 126 blog static 3383532520099309366
sel4白皮书翻译

首发地址 http trialley top pages 53ac44 CSDN地址 https blog csdn net lgfx21 article details 117606097 翻译与转发许可作者 Gernot Heiser
SELinux深入理解

1 简介 SELinux带给Linux的主要价值是提供了一个灵活的可配置的MAC机制 Security Enhanced Linux SELinux 由以下两部分组成 1 Kernel SELinux模块 kernel security
LCD DRM驱动框架分析一

本文是基于rk3566 rk3568平台从概念和框架上对LCD DRM驱动框架进行分析一 DRM Direct Rendering Manager 简介 DRM 是 Linux 目前主流的图形显示框架相比 FB 架构 DRM 更能适应
一次内核hung task分析

http blog chinaunix net uid 14528823 id 4406510 html 1 内核hung task检测机制由来我们知道进程等待IO时经常处于D状态即TASK UNINTERRUPTIBLE状态处于这
工作队列(workqueue)

转载于 http blog csdn net angle birds article details 8448070 项目需要在驱动模块里用内核计时器timer list实现了一个状态机郁闷的是运行时总报错 Scheduling wh

随机推荐

盘点 2012 年没落科技巨头

当我们安然度过2012年12月21日世界末日的谣言已不攻自破时在家电 IT 通信互联网等科技领域一些企业却正在经受着末日征兆的考验甚至正在走向末日终结开篇残酷的选择与金融能源领域的企业不同即使扩大到全球范围也没有一家
Python中字符串切片

在Python中可以对字符串按自己需要切片注意 1 第一个字符串排序为0 最后一个字符串为 1 2 切片时从小切到大 3 切片时不包含最后一个字符举例 str 0123456789 print str 0 3 截取第一位到第三位的
Android面试必刷Framework面试题（附答案），打破面试难点（2023年最新版）

最近收到身边很多人反馈现在的android面试大多数企业除了对求职者的语言和编码等基础能力提出要求外越来越强调对于 Framework 层的理解和 UI 框架的掌控能力而完整的项目经历和多端知识也成了重要的加分项于是小编收拾了一下
【跨模态】【对比学习】CLIP：文本监督CV的预训练(2021)

文章目录前言一整体架构 1 训练 2 测试迁移学习zero shot 3 prompt engineering and ensembling 二实验 1 few shot与zero shot的对比 2 Representation
C#实现多语言切换

代码 https github com tangbb1 C shop tree master 思路描述窗体的language属性修改为自己需要设定语言 localizable属性改为true 在窗体上进行英文编辑即可生成对应的资源文件
osgearth消除近裁剪平面离物体太近时的裁剪问题

This will mitigate near clip plane issues if you zoom in close to the ground LogarithmicDepthBuffer buf buf install view
JavaScript 新增两个原始数据类型Record 和 Tuple

JavaScript即将推出两个新的数据类型 Record 和 Tuple 这俩是啥呢其实就是一个只读的 Object 和 Array 其实在其它语言中已经有类似的数据类型了例如 Python 中也有 Tuple 元组这一类型作用也
优雅的后端参数验证javax.validation

为什么要用validator javax validation的一系列注解可以帮我们完成参数校验免去繁琐的串行校验不然我们的代码就像下面这样 PostMapping save serial public Object save Requ
EDA14--DC脚本实例

这里写目录标题一示例1 同步设计 fifo1 二示例2 异步设计 fifo2 三具体操作 fifo1为例 DC的所有理论知识已经简单的概述完了包括 DC简介概念流程逻辑推断 DC脚本命令最后给出DC的两个示例下面是大的目录
18650锂电池充电方案及保护板电路构思

18650锂电池充电方案及保护板电路构思一电路参数 1 充电电源为USB电源额定电压为5V 2 蓄电池为18650锂电池 3 7V 容量2600mAh 3 负载电机参数二电路板功能要求 1 充电电压保护防止因选用错误的充电器过
JSONObject出现重复引用$ref

现象保存的时候红框里内容是一样的结果JSON toJSONString的时候第二个就变成了 ref 原因 JSONObject 默认开启引用检测重复引用对象时会被 ref代替返回的json对象出现 ref 对象地址值解决办法 S
PhpStorm 基本设置

更换皮肤 File gt Settings gt Appearance gt Theme 字体 File gt Settings gt Editor gt Colors Font gt font 点save as 然后再自定义 typo设置
oracle和sqlite区别,数据库sqlserver与sqlite的区别

sqlserver 与sqlite的区别 sqlserver是大型数据库常用于企业级应用的后台数据存储 sqlite 是轻量级数据库对小数据量的数据存储方便文件型数据库其语法区别大比如查询前10条数据 sqlserver SELE
聊聊Api接口优化的几个方法

转载聊聊Api接口优化的几个方法知乎作为记录用于学习
Vendor ID对照表（不定期更新）

Vendor ID对照表不定期更新可自行查询原网址 8086 Intel Corporation 0731 Jingjia Microelectronics Co Ltd 1DB7 Phytium Technology Co Ltd 1
归一化函数 normalized（）

1 归一化定义与作用归一化就是要把需要处理的数据经过处理后通过某种算法限制在你需要的一定范围内首先归一化是为了后面数据处理的方便其次是保证程序运行时收敛加快归一化的具体作用是归纳统一样本的统计分布性归一化在0 1之间是统计的概
攻防演练场景中的加密流量检测技术

lt 引言 gt 在对抗日益激烈加密手段逐渐成为主流的今天攻防演练场景中的加密流量也已逐渐成为主流对加密流量检测的技术变得愈发重要目前针对攻防演练场景的加密流量检测主要分为解密后检测和不解密检测两大类传统的解密检测拥有可以直接将加
文心一言#帮我生成一段1分钟的短视频AI脚本

视频标题旅行者的一天视频类型旅游视频时长 1分钟视频描述这是一个关于一个旅行者一天的故事他早上起床后先去了一家当地的早餐店品尝了美味的当地早餐接着他去了一家博物馆了解了当地的历史和文化中午他去了一家当地的餐厅品
ubuntu上redis安装启动和停止

在 Ubuntu 上操作 Redis 可以按照以下步骤进行安装 Redis 使用以下命令安装 Redis sudo apt get update sudo apt get install redis server 设置开机自动启动使用以
OOM Killer 持续更新中

虚拟地址空间的概念都门儿清 Linux 认为用户进程调用malloc申请了动态内存后不见得马上就会使用这段内存进行写读操作 Linux使用了拖到最后的分配机制用时分配机制但是即使是用时分配也不见得总有内存可分配当进程太多显得内存

OOM Killer 持续更新中

OOM Killer 持续更新中 的相关文章

随机推荐

热门标签

OOM Killer 持续更新中的相关文章