Linux绑核效率优化

2023-11-20

Linux绑核效率优化

原理概述：

cpu一般有多个物理核心，在运行进程和线程的时候，可以将其绑定（指定）到某一个或者多个核心上运行。这样做的好处是：在核数比较多的机器上，一般是若干个核心为一组共享一块三级缓存（L3 cache），当出现跨cache的数据通信时，效率会比同一个cache内通信慢一些。默认情况下系统调度程序时不会考虑cache的分组，哪个核空闲就用哪一个；把程序绑定到共享同一块cache的核心上，就可以避免这种跨cache的开销。

在Linux上一般会用0-n给cpu进行编号，在目录/sys/devices/system/cpu/下不同的目录存放不同cpu的信息。

假如一个cpu有8个核，0-3共享一个cache，4-7共享一个cache，经测试程序运行时绑核在0,1,2,3会比由系统自行调度效率高出10-14%

在核数低于4核的CPU上，一般都只有一组缓存，这种情况下绑核不会有效率上的提升。

哪几个CPU在同一组并不是固定的，需要查看系统的配置信息。一般/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_map文件里以16进制数的形式记录了与cpu0共享三级缓存的cpu，比如0f，换成二进制就是00001111，代表0-3号cpu与cpu0共享同一块三级缓存。编程获取时一般使用此文件。

人为观察一般查看/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list文件，一般的写法诸如0-5,12-17则代表0-5,12-17号cpu共享同一块三级缓存。

绑核方式

命令绑核：`taskset`

# 平时启动wps
./wps
# 绑核启动WPS
taskset -c 0,1,2,3 ./wps
# 代表将使用cpu0-cpu3执行wps进程

线程绑核

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define __USE_GNU
#include <sched.h>
#include <pthread.h>

/* Number of worker threads that main() creates and joins. */
#define NUM_OF_TASKS 5

/* Bitmap parsed from cpu0's shared_cpu_map by get_cpu_topo(); bind_thread()
 * walks it bit by bit, treating bit i as CPU i. */
long shared_cache_cpus;
/* CPU set filled in by bind_thread() and passed to pthread_setaffinity_np()
 * in thread_func(). */
cpu_set_t cpu_set;

/* Return the value of the hexadecimal digit ch, or -1 if ch is not one. */
static int hex_digit_value(int ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';
    if (ch >= 'a' && ch <= 'f')
        return ch - 'a' + 10;
    if (ch >= 'A' && ch <= 'F')
        return ch - 'A' + 10;
    return -1;
}

/*
 * Read the CPU bitmap of the last-level cache shared with cpu0 and store it
 * in the global shared_cache_cpus.
 *
 * Probes /sys/devices/system/cpu/cpu0/cache/index0 .. index4 and uses the
 * highest index whose shared_cpu_map can be opened. The file holds a
 * hexadecimal bitmap; on larger machines its 32-bit groups are separated by
 * commas, so commas are skipped instead of ending the parse. Only the lowest
 * bits that fit in an unsigned long are kept; higher-numbered CPUs are
 * dropped.
 *
 * Returns 0 on success, -1 if no cache entry can be opened or the file does
 * not contain a valid hexadecimal mask.
 */
int get_cpu_topo(void)
{
    char cache_info_path[100];
    FILE *fp = NULL;

    /* Keep the stream of the highest-numbered cache index that opens. */
    for (int i = 0; i < 5; ++i) {
        snprintf(cache_info_path, sizeof cache_info_path,
                 "/sys/devices/system/cpu/cpu0/cache/index%d/shared_cpu_map", i);
        FILE *candidate = fopen(cache_info_path, "r");
        if (candidate == NULL)
            break;
        if (fp != NULL)
            fclose(fp);
        fp = candidate;
    }
    if (fp == NULL) {
        /* errno still comes from the fopen() that just failed. */
        perror("open file failed!!!");
        return -1;
    }

    unsigned long mask = 0;
    int seen_digit = 0;
    int malformed = 0;
    int ch;

    while ((ch = fgetc(fp)) != EOF && ch != '\n') {
        if (ch == ',')
            continue;

        int value = hex_digit_value(ch);
        if (value < 0) {
            malformed = 1;
            break;
        }
        /* Unsigned shift discards the oldest (highest) bits once the mask
         * is wider than unsigned long, so the low CPUs are always kept. */
        mask = (mask << 4) | (unsigned long)value;
        seen_digit = 1;
    }
    fclose(fp);

    if (malformed || !seen_digit) {
        fprintf(stderr, "invalid shared_cpu_map content\n");
        return -1;
    }

    shared_cache_cpus = (long)mask;
    return 0;
}

/*
 * Build the global cpu_set from the bitmap in shared_cache_cpus: bit i set
 * means CPU i is added to the set. Bits 0..63 are examined, and a trace of
 * the remaining bitmap is printed for every bit.
 *
 * Always returns 0.
 */
int bind_thread(void)
{
    /* Work on an unsigned copy so the right shift never replicates a sign
     * bit and the value matches the printf conversion below. */
    unsigned long remaining = (unsigned long)shared_cache_cpus;

    CPU_ZERO(&cpu_set);
    for (int cpu = 0; cpu < 64; ++cpu) {
        printf("cpu %lu:: ", remaining);
        if (remaining & 1UL) {
            CPU_SET(cpu, &cpu_set);
            printf("bind to cpu: %d\n", cpu);
        }

        remaining >>= 1;
    }

    return 0;
}

/* Run a short dummy computation loop, then sleep for one second. */
void WasteTime(void)
{
    int product = 0;

    /* 1000 iterations of a constant multiplication; the result is unused. */
    for (int remaining = 1000; remaining != 0; --remaining)
        product = 10000 * 10000;

    sleep(1);
}
 
/*
 * Thread entry point: pin the calling thread to the CPUs in the global
 * cpu_set, then call WasteTime() forever. Never returns.
 *
 * param is unused.
 */
void *thread_func(void *param)
{
    (void)param;

    /* pthread_setaffinity_np() reports failure through its return value (an
     * error number), so the result is checked against 0 and decoded with
     * strerror() rather than perror(). */
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
    if (rc != 0)
        fprintf(stderr, "pthread_setaffinity_np: %s\n", strerror(rc));

    /* The affinity is requested once up front; only the workload repeats. */
    for (;;)
        WasteTime();
}

/*
 * Read the cache topology, build the CPU set from it, then start
 * NUM_OF_TASKS worker threads running thread_func() and wait for them.
 *
 * Returns -1 if a thread cannot be created, 0 once every thread has been
 * joined.
 */
int main(void)
{
    pthread_t my_thread[NUM_OF_TASKS];

    if (get_cpu_topo() == 0) {
        bind_thread();
    } else {
        /* bind_thread() is skipped, so the CPU set is never populated. */
        fprintf(stderr, "cache topology unavailable, CPU set not populated\n");
    }

    for (int t = 0; t < NUM_OF_TASKS; ++t) {
        /* pthread_create() returns an error number instead of setting
         * errno, so the message is built with strerror(). */
        int rc = pthread_create(&my_thread[t], NULL, thread_func, NULL);
        if (rc != 0) {
            fprintf(stderr, "pthread_create ERROR!!!: %s\n", strerror(rc));
            return -1;
        }
    }

    for (int t = 0; t < NUM_OF_TASKS; ++t) {
        pthread_join(my_thread[t], NULL);
    }

    return 0;
}

进程绑核

#include <unistd.h>
#include <sched.h>
#include <sys/sysinfo.h>
/* Number of cache index directories (index0 .. index4) probed when looking
 * for the highest cache level of cpu0. */
#define MAX_CPU_CACHE_LEVEL 5

/* Return the value of the hexadecimal digit ch, or -1 if ch is not one. */
static int cpu_mask_hex_value(int ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';
    if (ch >= 'a' && ch <= 'f')
        return ch - 'a' + 10;
    if (ch >= 'A' && ch <= 'F')
        return ch - 'A' + 10;
    return -1;
}

/*
 * Bind the calling process to the CPUs that share cpu0's highest-level
 * cache, as listed in
 * /sys/devices/system/cpu/cpu0/cache/indexN/shared_cpu_map.
 *
 * The map is a hexadecimal bitmap, possibly split into comma-separated
 * groups. It is decoded digit by digit straight into a cpu_set_t, so masks
 * wider than a long are handled; CPUs numbered CPU_SETSIZE or higher are
 * ignored.
 *
 * Returns 0 when the affinity was set or when the machine reports 4 or
 * fewer processors (nothing is done in that case), and -1 when the map
 * cannot be read, is malformed or empty, or sched_setaffinity() fails.
 */
int bindProcess(void)
{
    if (get_nprocs() <= 4)
        return 0;

    char cache_info_path[100] = {'\0'};
    FILE *fp = NULL;

    /* Keep the stream of the highest-numbered cache index that opens. */
    for (int level = 0; level < MAX_CPU_CACHE_LEVEL; ++level) {
        snprintf(cache_info_path, sizeof cache_info_path,
                 "/sys/devices/system/cpu/cpu0/cache/index%d/shared_cpu_map", level);
        FILE *candidate = fopen(cache_info_path, "r");
        if (candidate == NULL)
            break;
        if (fp != NULL)
            fclose(fp);
        fp = candidate;
    }
    if (fp == NULL) {
        /* errno still comes from the fopen() that just failed. */
        perror("open file failed!!!");
        return -1;
    }

    /* One hex digit describes 4 CPUs, so CPU_SETSIZE / 4 digits cover every
     * CPU a cpu_set_t can hold. The array is used as a ring: when the file
     * has more digits, the oldest (most significant) ones are overwritten. */
    enum { MAX_DIGITS = CPU_SETSIZE / 4 };
    unsigned char digits[MAX_DIGITS];
    size_t total = 0;
    int malformed = 0;
    int ch;

    while ((ch = fgetc(fp)) != EOF && ch != '\n') {
        if (ch == ',')
            continue;

        int value = cpu_mask_hex_value(ch);
        if (value < 0) {
            malformed = 1;
            break;
        }
        digits[total % MAX_DIGITS] = (unsigned char)value;
        ++total;
    }
    fclose(fp);

    if (malformed || total == 0) {
        fprintf(stderr, "invalid shared_cpu_map content\n");
        return -1;
    }

    cpu_set_t llc_cpus;
    CPU_ZERO(&llc_cpus);

    /* Walk the digits from least to most significant: digit k covers
     * CPUs 4k .. 4k+3. */
    size_t usable = total < MAX_DIGITS ? total : MAX_DIGITS;
    int selected = 0;
    for (size_t k = 0; k < usable; ++k) {
        unsigned value = digits[(total - 1 - k) % MAX_DIGITS];
        for (unsigned bit = 0; bit < 4; ++bit) {
            if (value & (1u << bit)) {
                CPU_SET(k * 4 + bit, &llc_cpus);
                ++selected;
            }
        }
    }
    if (selected == 0) {
        fprintf(stderr, "shared_cpu_map lists no usable CPU\n");
        return -1;
    }

    if (sched_setaffinity(getpid(), sizeof(llc_cpus), &llc_cpus) != 0) {
        perror("sched_setaffinity");
        return -1;
    }
    return 0;
}

在自己的项目代码中嵌入

只需要在main函数执行Exec前，调用上述bindProcess函数即可

参考资料

• https://en.wikipedia.org/wiki/CPU_cache#Cache_miss
• https://blog.csdn.net/fanyun_01/article/details/102788269
• https://blog.csdn.net/ethercat_i7/article/details/105717152

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)