Thanks for checking! > It turns out that it performs exactly the same number of ...

orlp · on May 19, 2023

> This is surprising, shouldn't the number of comparisons be identical to the standard implementation?

No, to make it branchless (in an efficient way, it can be done regardless of course), you always take ceil(n / 2) as your remaining size, whereas the branchy version chooses either floor(n / 2) or ceil(n / 2) as appropriate.

And in fact, you don't want the number of comparisons to be identical to the standard implementation, for a branchless implementation. Because this makes the branch on loop exit unpredictable.

I did find that by changing the implementation slightly:

    template<typename It, typename T, typename Cmp>
    It lower_bound(It begin, It end, const T& value, Cmp comp) {
        size_t n = 1 + end - begin;
        size_t i = -1;
        while (n > 1) {
            size_t half = n / 2;
            size_t m = i + half;
            if (comp(begin[m], value)) i = m;
            n -= half;
        }
        return begin + i + 1;
    }

that clang gives slightly better code on the Apple M1 (avoiding an unnecessary add in the tight loop):

    ; %bb.0:
        sub x8, x1, x0
        add x8, x8, #4                      ; =4
        asr x8, x8, #2
        cmp x8, #2                          ; =2
        b.lo LBB8_3
    ; %bb.1:
        ldr w10, [x2]
        mov x9, #-1
    LBB8_2:                                 ; =>This Inner Loop Header: Depth=1
        lsr x11, x8, #1
        add x12, x11, x9
        ldr w13, [x0, x12, lsl #2]
        cmp w13, w10
        csel x9, x12, x9, lo
        sub x8, x8, x11
        cmp x8, #1                          ; =1
        b.hi LBB8_2
        b LBB8_4
    LBB8_3:
        mov x9, #-1
    LBB8_4:
        add x8, x0, x9, lsl #2
        add x0, x8, #4                      ; =4
        ret

which results in the classic algorithm beating the overlap method on smaller sizes, but not on larger sizes, on Apple M1: https://i.imgur.com/VjD5EK7.png