在 x86-64、skylake 上以可重启序列优化 percpu 2 级位向量

2023-12-08

我很好奇如何最好地优化下面的汇编代码，特别是“跳到此处查看汇编”下的代码块中的部分（以便于 control-f 搜索）。

我正在编写一些代码，HOT HOT HOT 路径基本上是在位向量中查找 0 位并返回该位。

位向量由以下部分组成：

// Two-level bit vector: one 64-bit summary word plus 64 inner 64-bit words.
// NOTE(review): `2l_bitvec` starts with a digit, which is not a valid C
// identifier; presumably illustrative only -- rename before compiling.
struct 2l_bitvec {
       // outer vector: a 0 bit at position i means inner vector v2[i] may
       // still have available slots
       uint64_t v1;

       // inner vectors holding the actual index bits (0 = slot is free)
       uint64_t v2[64];
} 2l_bitvec;

每个CPU都有一个bitvec（或者多个以慢得多的路径数据结构链接在一起）。

为了管理这些位向量内部的一致性，我使用了可重启序列（restartable sequences，rseq；向下滚动一点，可以看到我能找到的最好的手册页）。

由于使用了 rseq（并且这是超级热点的代码），逻辑全部用内联汇编编写。

我想写的C代码如下：

#define LIKELY(X)   __builtin_expect(!!(X), 1)
#define UNLIKELY(X) __builtin_expect(!!(X), 0)

/*
 * Reference C version of the restartable "find and claim a free slot" routine.
 *
 * v1        points at the outer summary word; the 64 inner words follow it
 *           directly in memory (v1[1] .. v1[64]).
 * start_cpu the cpu the caller believes it is running on.
 *
 * Returns the claimed index (64 * inner-vector number + bit number) on
 * success, 4097 if the thread is no longer on start_cpu, and (uint64_t)-1 if
 * every inner vector is full.
 */
uint64_t __attribute__((noinline))
restarting_l2_set_idx(uint64_t * v1, const uint32_t start_cpu) {

// if ever preempted, migrated, or catch a signal return here
catch_something_label:

    if (start_cpu != __rseq_abi.cpu_id_start) {
        return 4097;
    }

    uint64_t temp_v1 = *v1;
    while (LIKELY(temp_v1 != (~(0UL)))) {
        // first inner vector that is not yet marked full in the summary word
        const uint32_t idx_v1  = _tzcnt_u64((~temp_v1));

        uint64_t       temp_v2 = v1[idx_v1 + 1];
        if (LIKELY(temp_v2 != (~(0UL)))) {
            // first free slot inside that inner vector
            const uint32_t idx = _tzcnt_u64(~temp_v2);

            temp_v2 |= ((1UL) << idx);
            // commit: write back to the SAME inner vector that was read
            // (index idx_v1 + 1, not idx + 1); this is the final store
            v1[idx_v1 + 1] = temp_v2;

            return 64 * idx_v1 + idx;
        }
        else {
            // inner vector turned out to be full: record that in the
            // summary word and look for the next candidate
            temp_v1 |= ((1UL) << idx_v1);
            *v1 = temp_v1;
        }
    }

    return -1;
}

有一些rseq设置内容基本上是：

// Emits the critical-section descriptor into section __rseq_cs, aligned to
// `alignment` bytes, and marks it with local label 3.
#define RSEQ_INFO_DEF(alignment)                                               \
    ".pushsection __rseq_cs, \"aw\"\n\t"                                       \
    ".balign " #alignment                                                      \
    "\n\t"                                                                     \
    "3:\n\t"                                                                   \
    ".long 0x0\n"                                                              \
    ".long 0x0\n"                                                              \
    ".quad 1f\n"                                                               \
    ".quad 2f - 1f\n"                                                          \
    ".quad 4f\n"                                                               \
    ".popsection\n\t"

/*
    ".pushsection __rseq_cs, \"aw\"\n\t"    // switch to the descriptor section
    ".balign " #alignment"\n\t"             // alignment at least 32
    "3:\n\t"                                // label of the descriptor struct
                                            // (referred to as rseq_info here)
    ".long 0x0\n"                           // version = 0
    ".long 0x0\n"                           // flags = 0
    ".quad 1f\n"                            // start_ip = 1f (label 1, forward)
    ".quad 2f - 1f\n"                       // post_commit_offset = end label (2)
                                               minus start label (1), i.e. the
                                               length of the critical section
    ".quad 4f\n"                            // abort label = 4f (label 4)
    ".popsection\n\t"                       // return to the previous section
*/


// Records the address of the most recent descriptor (label 3, backward) in
// section __rseq_cs_ptr_array.
#define RSEQ_CS_ARR_DEF()                                                      \
    ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"                             \
    ".quad 3b\n\t"                                                             \
    ".popsection\n\t"

/*
    ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"  // switch to the pointer
                                                       array section
    ".quad 3b\n\t"                                  // store the address of the
                                                       descriptor (label 3)
    ".popsection\n\t"                               // return to the previous
                                                       section
*/

// Arms the critical section: loads the address of the descriptor (label 3)
// into TEMP_REGISTER and stores it at offset 8 of the thread-local
// __rseq_abi. TEMP_REGISTER is clobbered. Relies on V_TO_STR to stringify
// the register operand.
#define RSEQ_PREP_CS_DEF(TEMP_REGISTER)                               \
    "leaq 3b (%%rip), " V_TO_STR(TEMP_REGISTER) "\n\t"                         \
    "movq " V_TO_STR(TEMP_REGISTER) ", %%fs:__rseq_abi@tpoff+8\n\t"          \



/*
    "leaq 3b (%%rip), REGISTER\n\t"                  // get the address of the
                                                        descriptor (label 3)
    "movq REGISTER, %%fs:__rseq_abi@tpoff+8\n\t"     // store it in the pointer
                                                        field at offset 8 of
                                                        __rseq_abi
*/

// Compares the [start_cpu] asm operand with the 32-bit field at offset 4 of
// the thread-local __rseq_abi; flags are left set for a following jnz/jz.
// NOTE(review): the C reference version reads __rseq_abi.cpu_id_start --
// confirm that offset 4 is the field intended here.
#define RSEQ_CMP_CUR_VS_START_CPUS()                                           \
    "cmpl %[start_cpu], %%fs:__rseq_abi@tpoff+4\n\t"

/*
    "cmpl %[start_cpu], %%fs:__rseq_abi@tpoff+4\n\t" // read the cpu field at
                                               offset 4 of __rseq_abi and
                                               compare it to %[start_cpu], which
                                               is passed as param to function
*/


// Opens the abort handler: switches to section __rseq_failure, emits the
// signature bytes and then the abort label (4). The handler body follows the
// macro and must be closed with RSEQ_END_ABORT_DEF().
// sometimes this is better to put in the
// same code section as the critical section
#define RSEQ_START_ABORT_DEF()                                                 \
    ".pushsection __rseq_failure, \"ax\"\n\t"                                  \
    ".byte 0x0f, 0xb9, 0x3d\n\t"                                               \
    ".long 0x53053053\n\t"                                                     \
    "4:\n\t"                                                                   \

/*
  ".pushsection __rseq_failure, \"ax\"\n\t" // switch to the failure section
    ".byte 0x0f, 0xb9, 0x3d\n\t"            // Disassembler-friendly signature:
                                               ud1 <sig>(%rip),%edi
    ".long 0x53053053\n\t"                  // signature value; an invalid
                                               operation placed right before the
                                               abort label to avoid code
                                               injection
    "4:\n\t"                                // abort label
*/

// Closes the abort handler opened by RSEQ_START_ABORT_DEF().
#define RSEQ_END_ABORT_DEF() ".popsection\n\t"

/*
    ".popsection\n\t"   // leave the failure section, back to the previous one
*/

用伪代码表示，包含上述所有 rseq 设置内容的汇编大致如下：

/*
Typical assembly will look as follows:
foo(..., uint32_t start_cpu) 
    RSEQ_INFO_DEF(32) 
    RSEQ_CS_ARR_DEF() 
    RSEQ_PREP_CS_DEF()

    // maybe some setup stuff (or maybe abort)

    "1:\n\t"    

    RSEQ_CMP_CUR_VS_START_CPUS()
    // handle migrated somehow

    <actual critical section here>
    "2:\n\t" (this is end label of critical section)

    // if abort is in another code section
    RSEQ_START_ABORT_DEF()
    <logical for abort here>
        // if this is goto generally jmp %l[abort]
        // otherwise some actual logic (usually set return var)
    RSEQ_END_ABORT_DEF()
    : <output variables, only if NOT goto asm>
    : <input variables> +
     [ start_cpu ] "g"(start_cpu), // always
    : <clobber registers> +
      "memory", "cc" // minimum clobbers
    #ifdef IS_GOTO_ASM
    : <jump labels OUTSIDE of the asm>
    #endif
*/

该汇编代码针对以下事实进行了优化：绝大多数中止是由抢占而不是迁移引起的，因此通常中止后只需跳回去检查当前 cpu 并继续执行（因为比较会成功）。

我使用的汇编代码如下：

跳到此处查看汇编

// Two-step stringification: V_TO_STR(X) expands X first (if it is a macro)
// and then turns the result into a string literal, so values and register
// operands can be spliced into the inline-asm text.
#define PRIMITIVE_V_TO_STR(X) #X
#define V_TO_STR(X) PRIMITIVE_V_TO_STR(X)

// Value returned when the thread is found to be on a different cpu than
// start_cpu; it lies outside the valid index range [0, 4095].
#define _FAILURE_MIGRATED 4097

// inlining the function often breaks stuff, so while testing I am skipping that
// aligning to cache line seems to actually affect performance significantly

/*
 * Claims the lowest free slot of a per-cpu two-level bit vector inside an
 * rseq critical section.
 *
 * v1        points at the outer summary word; the 64 inner words follow it
 *           directly in memory (offset 8 + 8 * i).
 * start_cpu the cpu whose bit vector v1 belongs to.
 *
 * The only store that publishes a claimed slot is the very last instruction
 * of the critical section, so an abort at any earlier point leaves nothing
 * half-done. Every abort restarts from label 6, which re-arms the descriptor
 * and recomputes all scratch registers from memory.
 */
uint64_t __attribute__((noinline))
__attribute__((aligned(64)))
    restarting_2l_set_idx(uint64_t * const v1, const uint32_t start_cpu) {
    // return [0 - 4095] -> success (that is the index)
    // return [4097] -> failure the thread migrated
    // return [-1] -> failure the bit vector is full

    // pinned to rax so the result already sits in the return register
    register uint64_t idx asm("rax");

    // scratch registers, allocated by the compiler. They are write-only from
    // the compiler's point of view, so they are early-clobber OUTPUTS below
    // (an asm statement must never modify an input operand).
    uint64_t * v2;
    uint64_t idx_v1, temp_v1, temp_v2;

    // clang-format off
    asm volatile(
        RSEQ_INFO_DEF(32)
        RSEQ_CS_ARR_DEF()

#ifdef FAST_ABORT
        // skip the inline abort stub on first entry
        "jmp 6f\n\t"

        ".byte 0x0f, 0xb9, 0x3d\n\t"            // Disassembler-friendly signature: ud1 <sig>(%rip),%edi
        ".long 0x53053053\n\t"                  // invalid operation to avoid code injection
        "4:\n\t"                                // abort label (emitted once), falls through to 6:
#endif

        // (re)start point, also reached after every abort
        "6:\n\t"

        // default return value: migrated
        "mov $" V_TO_STR(_FAILURE_MIGRATED) ", %[idx]\n\t"

        // arm the critical section; any register will do as temporary.
        // The kernel clears the rseq_cs pointer when it aborts a critical
        // section and when it preempts the thread outside of one, so this
        // store has to be redone on every restart and has to be the last
        // instruction before the start label.
        RSEQ_PREP_CS_DEF(%[temp_v1])

        // start critical section
        "1:\n\t"

        // check if migrated
        RSEQ_CMP_CUR_VS_START_CPUS()
        // if migrated goto 2: (idx still holds _FAILURE_MIGRATED)
        "jnz 2f\n\t"

        // if not migrated temp_v1 = *v1
        "movq (%[v1]), %[temp_v1]\n\t"

        // start loop: while(temp_v1 != -1)
        "5:\n\t"

        // idx = temp_v1
        "movq %[temp_v1], %[idx]\n\t"

        // The reason we can't do this cmp after notq %[idx]
        // (and use testq) is because
        // 0 is a valid idx to return whereas -1 is not
        // (also why setting idx before the comparison)

        // if (%[v1]) is full leave with idx == -1.
        // This branch is VERY unexpected.
        "cmpq $-1, %[idx]\n\t"
        "jz 2f\n\t"

        // idx = ~temp_v1
        "notq %[idx]\n\t"

        // idx_v1 = tzcnt(idx) (find first one)
        "tzcntq %[idx], %[idx_v1]\n\t"

        // v2 = &v1[idx_v1 + 1], temp_v2 = *v2
        "leaq 8(%[v1],%[idx_v1],8), %[v2]\n\t"
        "movq (%[v2]), %[temp_v2]\n\t"

        // test if temp_v2 is full
        "cmpq $-1, %[temp_v2]\n\t"
        "jz 7f\n\t" // 7f marks the inner vector full in temp_v1 and loops

        // idx = ~temp_v2
        "movq %[temp_v2], %[idx]\n\t"
        "notq %[idx]\n\t"
        // could replace the cmpq $-1, %[temp_v2], jz above with
        // testq %[idx], %[idx], jz here

        // idx = tzcnt(idx)
        "tzcntq %[idx], %[idx]\n\t"
        "jmp 9f\n\t"

        "7:\n\t"
        "btsq %[idx_v1], %[temp_v1]\n\t"

        // this is a completely valid state to be migrated out after
        // (all we have really done is cleaned up v1 vector a bit)
        // because we can be migrated out here we don't check/set if
        // temp_v2 is full as that could lead to invalid state in v1
        "movq %[temp_v1], (%[v1])\n\t"

        // this is } in while loop starting at 5:
        "jmp 5b\n\t"

        // prepare for commit and commit
        "9:\n\t"

        // temp_v2 |= 1UL << idx (done exactly once)
        "btsq %[idx], %[temp_v2]\n\t"

        // prepare success return: idx = 64 * idx_v1 + idx
        "salq $6, %[idx_v1]\n\t"
        "addq %[idx_v1], %[idx]\n\t"

        // commit: must stay the final instruction of the critical section
        "movq %[temp_v2], (%[v2])\n\t"

        // end critical section
        "2:\n\t"

#ifndef FAST_ABORT
        RSEQ_START_ABORT_DEF()
        // given that the critical section is fairly involved
        // it may be worth it to put this in the same code section
        // as critical section for faster aborts
        //
        // restart from 6: so the descriptor is re-armed and idx is reset
        "jmp 6b\n\t"
        RSEQ_END_ABORT_DEF()
#endif

        : [idx] "=&r" (idx),
          [idx_v1] "=&r" (idx_v1),
          [temp_v2] "=&r" (temp_v2),
          [temp_v1] "=&r" (temp_v1),
          [v2] "=&r" (v2)
        : [v1] "r" (v1),               // used as a base register
          [start_cpu] "r" (start_cpu)  // compared against a memory operand
        : "memory", "cc");
    // clang-format on

    return idx;
}

在保证正确之后，我的第一、第二和第三个目标都是让它变得更快。所有优化都必须考虑到代码可能在任何一条指令之后跳转到中止处理（这就是为什么从 temp_v2 到 v2 的存储是临界区的最后一条指令）。如果中止是由线程迁移引起的，则该函数不能再写入任何数据（否则将出现严重的竞争条件）。

如果您想在用户空间中运行/编译它，您需要包含 linux/rseq.h 头文件。一个不错的“hello world”示例可以参考原帖中的链接（here）和/或 librseq。

注意：我将其发布在这里而不是 codereview.SE 上，因为我的主要问题是如何让 restarting_l2_set_idx 临界区中的汇编代码更快。

编辑： @PeterCordes

建议在这里更换 leaq：

        "leaq 8(%[v1],%[idx_v1],8), %[v2]\n\t"
        "movq (%[v2]), %[temp_v2]\n\t"

我把它改成了这个

        "movq %[v1], %[v2]\n\t"         // v2 = v1
        "salq $3, %[idx_v1]\n\t"        // idx_v1 = 8 * idx_v1
        "addq %[idx_v1], %[v2]\n\t"     // v2 += idx_v1 (index by uint64_t)
        "movq 8(%[v2]), %[temp_v2]\n\t" // temp_v2 = *(v + 8)

由于 idx_v1 现在是它所代表的位位置的 8 倍，以下代码也随之改变（每组中前一条是原来的指令，后面是修改后的指令）：

        // in 7: label
        "btsq %[idx_v1], %[temp_v1]\n\t"

        "sarq $3, %[idx_v1]\n\t"
        "btsq %[idx_v1], %[temp_v1]\n\t"

and

        // in 9: label
        "salq $6, %[idx_v1]\n\t"

        "salq $3, %[idx_v1]\n\t"

但我不确定这是否真的带来了性能提升。我认为收益可能会被抵消，因为我确实需要保留 v2 用于提交。

编辑2： @PeterCordes 指出我的编辑很愚蠢：我可以不再使用临时变量 v2，而是用 movq 8(%[v1],%[idx_v1],8), %[temp_v2] 来读取 temp_v2，并用 movq %[temp_v2], 8(%[v1],%[idx_v1],8) 来存储它。抱歉，我的第一次编辑很天真 :(

None

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

在 x86-64、skylake 上以可重启序列优化 percpu 2 级位向量的相关文章

EF Core 返回 null 关系，直到直接访问

我有一些如下所示的模型 public class Mutant public long Id get set Relations public long OriginalCodeId get set public virtual Origi
在 LINQ 查询中进行转换

是否可以在 LINQ 查询中进行强制转换为了编译器的缘故下面的代码并不糟糕但最好将其放入一个查询中 Content content dataStore RootControl as Controls Content List
并行运行多个任务

我有一个代理列表每个代理都会访问不同的站点并从站点中提取所需的数据目前它一次只做一个但我希望同时运行 10 20 个任务这样它就可以一次性从 20 个站点下载而不是只下载一个这是我目前正在做的事情 private async T
无法在 CUDA 中找到 1 到 100 数字的简单和？

我正在研究使用 CUDA 的图像处理算法在我的算法中我想使用 CUDA 内核找到图像所有像素的总和所以我在cuda中制作了内核方法来测量16位灰度图像的所有像素的总和但我得到了错误的答案所以我在cuda中编写了一个简单的程序来查
如何在 C++ 中为指针“this”赋值

在函数中如何分配this一个新的价值您可以分配对象this点于 this XY 但你不能分配直接值this this XY Error Expression is not assignable
每个元素的 asp.net Web 表单自定义错误消息

我创建了一个 Web 应用程序表单以及后端 SQL 插入和查询目前我正在显示所有用户错误消息 div style padding 1em div
从结构调用 C++ 成员函数指针

我找到了有关调用 C 成员函数指针和调用结构中的指针的信息但我需要调用结构内部存在的成员函数指针但我无法获得正确的语法我在类 MyClass 的方法中有以下代码片段 void MyClass run struct int MyClas
自己绘制的WPF自定义滑块

这是我关于堆栈溢出的第一个问题所以不要踢它我在尝试创建 Mac 风格的滑块控件时遇到问题我已经发现这个解决方案 http www codeproject com KB miscctrl MAC Slider aspx我已经在我的解决方
重载算术运算符

赋值运算符可以声明为 T 运算符 const t 在类中但不能以这种方式定义算术运算符它必须是友元函数我不明白为什么你能解释一下吗算术运算符不必须是友元那么你可以这样定义 MyClass MyClass operator con
如何在 C 中链接目标文件？失败并显示“架构 x86_64 的未定义符号”

因此我尝试在我的文件 file2 c 中使用另一个 C file1 c 文件中定义的函数为了做到这一点我包含了 file1 file1 h 的标头但是每当我尝试使用 gcc 编译文件时我都会收到以下错误 Undefined sy
X 轴和 Z 轴上的 Quaternion.Slerp，无 Y 轴

I am trying to rotate the Player about X Y and Z axis The Y axis should not move from last angle Example if I rotate 45
使用 STL 流时如何格式化我自己的对象？

我想将我自己的对象输出到 STL 流但具有自定义格式我想出了这样的东西但由于我之前从未使用过 locale 和 imbue 所以我不知道这是否有意义以及如何实现 MyFacet 和operator 所以我的问题是这是否有意义以及如何
当我尝试传递临时地址作为参数时，它是一个 UB 吗？

对于以下 C 代码 include
main.cpp 是必需的吗？

我试图编译一个程序cmake 我最终删除了我的main cpp文件我刚刚将其复合到另一个包含我的项目名称的文件中即我刚刚将主函数剪切并粘贴到该文件中问题是我有一个main cpp未发现错误不确定是否在C 一个名为main cpp是
将 AutomationID 与 ListView 结合使用

我正在尝试将 AutomationId 附加到列表视图中的项目理想情况下将项目名称绑定到显示的项目
具有多个父项的 Qt 树模型

我想构建一棵树其中一个元素可以引用另一个元素我想要构建的树是像这样的东西 A B C D E F P this is a pointer to C D first child of C E second child of C I fo
在一个解决方案中调用不同项目的方法

1 个解决方案中有 3 个项目我对第一个项目中的主文件进行的主要操作但是我需要调用第三个项目中的方法并使用类例如第三个项目有 public DataClasses1DataContext base global WindowsFor
在 Visual Studio 2012 Express 中设置 C++ 调试环境

我需要调试的应用程序需要设置环境变量这在 Visual Studio 2012 中似乎非常复杂我想做类似的事情 set path c foo c bar c windows c program files application set
为什么存在系统调用

我一直在阅读有关系统调用及其在 Linux 中如何工作的内容我还有更多的阅读要做但我读过的一件事都没有回答那就是为什么我们需要系统调用我知道系统调用是用户空间程序要求内核执行某些操作的请求但我的问题基本上是为什么用户空间程序本
跟踪白色背景中的白球（Python/OpenCV）

我在 Python 3 中使用 OpenCV 来检测白场上的白黑球并给出它的精确 x y 半径和颜色我使用函数 cv2 Canny 和 cv2 findContours 来找到它但问题是 cv2 Canny 并不总是检测到圆的完整

随机推荐

多次调用 window.open() 第一次后失败

我有一个脚本可以循环访问多个网址并在新选项卡中打开它们以前可以用但现在只能打开第一个甚至有一个 w3schools 测试编辑器据说可以打开多个窗口但在第一个窗口之后它也失败了 https www w3schools com jsre
友好地获取ffmpeg信息

每次我尝试使用 ffmpeg 获取有关我的视频文件的一些信息时它都会吐出很多无用的信息和好的信息我在用着ffmpeg i name of the video mpg 有没有可能以友好的方式得到它我的意思是 JSON 会很棒甚至丑陋的
PWA 关闭时发送通知

我制作了一个 PWA 待办事项列表应用程序链接到应用程序使用角度我现在计划添加通知以便在应用程序关闭时可以到达用户由于它是一个离线工作的 PWA 我不能依赖推送通知提前致谢你可以利用Web 定期后台同步 API您也可以离线运行
使用 Android 进行录制时显示不断更新的双精度（频率）

我正在构建一个 Android 应用程序它使用 FFT 算法显示持续音符的频率我正在使用 Jtransform 方法目前我的问题是我无法在屏幕上显示频率以下代码是 fft 频率计算和 AsynchTask 应在文本框中显示频率 im
Python 列表到 XML，反之亦然

我编写了一些将 python 列表转换为 XML 元素的 python 代码它用于与 LabVIEW 交互因此采用了奇怪的 XML 数组格式无论如何这是代码 def pack data create the result eleme
无法转换日期 - LocalDate 减去一天

我对日期及其从 Date 到 LocalDate 的传递有疑问主要错误在于年份为 1700 年的日期我尝试过两个不同的日期每当年份是 1700 年时我就会少一天 Date dto Instant inst dto toInstant
android 如何限制ListView中列表项的显示和按钮显示更多

我正在从数据库获取数据并在 ListView 中使用自定义列表适配器进行显示我只需要在 ListView 中显示 10 个项目在第 10 项之后会显示一个带有文本显示更多的按钮当单击此按钮时会在列表视图中显示数据库中的更多 1
raw 无法解析或不是字段

我正在我的应用程序中构建 MP3 播放器但收到一条错误消息指出 raw 无法解析或不是字段 mMediaPlayer MediaPlayer create this R raw test cbr 我不确定 R raw test cbr
javascript 闭包无法正常工作

看第一段代码 var count 0 function addLinks var count 0 this count var is increasing for var i 0 link i lt 5 i link document cr
如何使用 Microsoft Graph 更新 SharePoint 中的查找字段和用户字段？

我正在寻找一种如何使用 Microsoft graph 更新查找字段和用户字段的方法我可以读取该项目但即使我输入了正确的 ID 值也找不到创建或更新此类字段的方法现在支持通过 Microsoft Graph API 更新查找字段假
带有循环调试打印的 ansible 寄存器不起作用

我有一个简单的剧本应该显示我的服务状态我想查看机器的输出以查看状态是否处于活动状态所以我使用了调试打印如下所示 name name of services shell systemctl status item with items
Flutter：更新列表中的特定索引（Firestore）

究竟如何根据索引更新列表中的特定值例如在以下列表中 0 first name name0 last name lastName0 1 first name name1 last name lastName1 如何仅更新 lastName
glassfish 的基本身份验证失败

首先我对这篇长文表示歉意这是我之前问题的延续 7u21更新后弹出需要验证的窗口关于这个问题但我缩小了搜索范围简而言之自 Java 7u21 以来我的 BASIC 身份验证似乎已被破坏通过 JNLP 文件启动的小程序根本无法稳
使用清晰的代码行异步调用插槽，无需连接到它

我遇到了相当奇怪的错误 QAction trigger导致出现阻塞对话框这导致我的服务器调用trigger卡住例如在对话框关闭之前无法处理套接字信号我想出了一个解决方法我连接信号void triggerWorkaround 到插槽
模态中的文本字段

如果我们使用列表视图当键盘出现时文本字段就会上升但在模式中则不会抱歉我不知道如何正确解释它我只会用图像向您展示模式代码 showModalBottomSheet shape RoundedRectangleBorder bor
ASP.NET Web 应用程序消息框

在 asp net windows 窗体应用程序中您可以在后面的 C 代码中使用 MessageBox Show Here is my message ASP NET Web 应用程序中有类似的东西吗我可以从后面的 C 代码中调用一些向
有人可以澄清 Android 上下文引用吗？

我的误会还在继续任何人都可以引用正确使用的参考资料get Context 我得到了关于使用的相互矛盾的建议getBaseContext getApplicationContext and getContext 我的理解是使用this是一个
如何在VIPS中进行透视扭曲变换？

是否可以执行以下 ImageMagick透视扭曲使用 VIPS 命令如果是这样命令是什么使用ruby vips convert my file png matte virtual pixel transparent distort P
为小字符增大 SKLabelNode 的触摸区域

我在我的游戏中添加了一个老式的高分输入屏幕用户点击每个字母来输入他们的名字每个字母符号或短语 DEL SP 等都是一个SKLabelNode而且点击和非常困难不过字符和一些符号每次点击都会通过通常的方式检测到touchesB
在 x86-64、skylake 上以可重启序列优化 percpu 2 级位向量

我很好奇如何最好地优化下面的程序集特别是跳到此处查看程序集下的代码块中的部分以便于 control f 搜索我正在编写一些代码 HOT HOT HOT 路径基本上是在位向量中查找 0 位并返回该位位向量由以下部分组成 struc

在 x86-64、skylake 上以可重启序列优化 percpu 2 级位向量

在 x86-64、skylake 上以可重启序列优化 percpu 2 级位向量 的相关文章

随机推荐

热门标签

在 x86-64、skylake 上以可重启序列优化 percpu 2 级位向量的相关文章