cmd/compile: a constant expression is moved into a loop #71443

artem-anisimov-0x7f · 2025-01-26T14:16:21Z

Go version

go version go1.24rc2 linux/amd64

Output of `go env` in your module/workspace:

AR='ar'
CC='gcc'
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_ENABLED='1'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
CXX='g++'
GCCGO='gccgo'
GO111MODULE=''
GOAMD64='v1'
GOARCH='amd64'
GOAUTH='netrc'
GOBIN=''
GOCACHE='/home/artem/.cache/go-build'
GOCACHEPROG=''
GODEBUG=''
GOENV='/home/artem/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFIPS140='off'
GOFLAGS=''
GOGCCFLAGS='-fPIC -m64 -pthread -Wl,--no-gc-sections -fmessage-length=0 -ffile-prefix-map=/tmp/go-build211132318=/tmp/go-build -gno-record-gcc-switches'
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOINSECURE=''
GOMOD='/home/artem/dev/mono/go.mod'
GOMODCACHE='/home/artem/go/pkg/mod'
GOOS='linux'
GOPATH='/home/artem/go'
GOPROXY='http://gomodproxy:1111/'
GOROOT='/usr/local/go'
GOSUMDB='sum.golang.org'
GOTELEMETRY='off'
GOTELEMETRYDIR='/home/artem/.config/go/telemetry'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/go/pkg/tool/linux_amd64'
GOVCS=''
GOVERSION='go1.24rc2'
GOWORK=''
PKG_CONFIG='pkg-config'

What did you do?

Compile the following code:

package main

//go:noinline
func Sum0(xs []byte, mask uint64) (n int, found bool) {
        var sum uint64
        for i, x := range xs {
                sum = (sum << 1) + table[x]
                if (sum & mask) == 0 {
                        return i, true
                }
        }
        return 0, false
}

//go:noinline
func Sum1(xs []byte, mask uint64) (n int, found bool) {
        t := table[:]

        var sum uint64
        for i, x := range xs {
                sum = (sum << 1) + t[x]
                if (sum & mask) == 0 {
                        return i, true
                }
        }
        return 0, false
}

func main() {
        xs := []byte{0, 1, 2, 3, 4, 5, 6, 7}
        _, _ = Sum0(xs, ^uint64(0xff))
        _, _ = Sum1(xs, ^uint64(0xff))
}

var table = [256]uint64{
        0xe80e8d55032474b3, 0x11b25b61f5924e15, 0x03aa5bd82a9eb669, 0xc45a153ef107a38c, 0xeac874b86f0f57b9, 0xa5ccedec95ec79c7, 0xe15a3320ad42ac0a, 0x5ed3583fa63cec15,
        0xcd497bf624a4451d, 0xf9ade5b059683605, 0x773940c03fb11ca1, 0xa36b16e4a6ae15b2, 0x67afd1adb5a89eac, 0xc44c75ee32f0038e, 0x2101790f365c0967, 0x76415c64a222fc4a,
        0x579929249a1e577a, 0xe4762fc41fdbf750, 0xea52198e57dfcdcc, 0xe2535aafe30b4281, 0xcb1a1bd6c77c9056, 0x5a1aa9bfc4612a62, 0x15a728aef8943eb5, 0x2f8f09738a8ec8d9,
        0x200f3dec9fac8074, 0x0fa9a7b1e0d318df, 0x06c0804ffd0d8e3a, 0x630cbc412669dd25, 0x10e34f85f4b10285, 0x2a6fe8164b9b6410, 0xcacb57d857d55810, 0x77f8a3a36ff11b46,
        0x66af517e0dc3003e, 0x76c073c789b4009a, 0x853230dbb529f22a, 0x1e9e9c09a1f77e56, 0x1e871223802ee65d, 0x37fe4588718ff813, 0x10088539f30db464, 0x366f7470b80b72d1,
        0x33f2634d9a6b31db, 0xd43917751d69ea18, 0xa0f492bc1aa7b8de, 0x3f94e5a8054edd20, 0xedfd6e25eb8b1dbf, 0x759517a54f196a56, 0xe81d5006ec7b6b17, 0x8dd8385fa894a6b7,
        0x45f4d5467b0d6f91, 0xa1f894699de22bc8, 0x33829d09ef93e0fe, 0x3e29e250caed603c, 0xf7382cba7f63a45e, 0x970f95412bb569d1, 0xc7fcea456d356b4b, 0x723042513f3e7a57,
        0x17ae7688de3596f1, 0x27ac1fcd7cd23c1a, 0xf429beeb78b3f71f, 0xd0780692fb93a3f9, 0x9f507e28a7c9842f, 0x56001ad536e433ae, 0x7e1dd1ecf58be306, 0x15fee353aa233fc6,
        0xb033a0730b7638e8, 0xeb593ad6bd2406d1, 0x7c86502574d0f133, 0xce3b008d4ccb4be7, 0xf8566e3d383594c8, 0xb2c261e9b7af4429, 0xf685e7e253799dbb, 0x05d33ed60a494cbc,
        0xeaf88d55a4cb0d1a, 0x3ee9368a902415a1, 0x8980fe6a8493a9a4, 0x358ed008cb448631, 0xd0cb7e37b46824b8, 0xe9bc375c0bc94f84, 0xea0bf1d8e6b55bb3, 0xb66a60d0f9f6f297,
        0x66db2cc4807b3758, 0x7e4e014afbca8b4d, 0xa5686a4938b0c730, 0xa5f0d7353d623316, 0x26e38c349242d5e8, 0xeeefa80a29858e30, 0x8915cb912aa67386, 0x4b957a47bfc420d4,
        0xbb53d051a895f7e1, 0x09f5e3235f6911ce, 0x416b98e695cfb7ce, 0x97a08183344c5c86, 0xbf68e0791839a861, 0xea05dde59ed3ed56, 0x0ca732280beda160, 0xac748ed62fe7f4e2,
        0xc686da075cf6e151, 0xe1ba5658f4af05c8, 0xe9ff09fbeb67cc35, 0xafaea9470323b28d, 0x0291e8db5bb0ac2a, 0x342072a9bbee77ae, 0x03147eed6b3d0a9c, 0x21379d4de31dbadb,
        0x2388d965226fb986, 0x52c96988bfebabfa, 0xa6fc29896595bc2d, 0x38fa4af70aa46b8b, 0xa688dd13939421ee, 0x99d5275d9b1415da, 0x453d31bb4fe73631, 0xde51debc1fbe3356,
        0x75a3c847a06c622f, 0xe80e32755d272579, 0x5444052250d8ec0d, 0x8f17dfda19580a3b, 0xf6b3e9363a185e42, 0x7a42adec6868732f, 0x32cb6a07629203a2, 0x1eca8957defe56d9,
        0x9fa85e4bc78ff9ed, 0x20ff07224a499ca7, 0x3fa6295ff9682c70, 0xe3d5b1e3ce993eff, 0xa341209362e0b79a, 0x64bd9eae5712ffe8, 0xceebb537babbd12a, 0x5586ef404315954f,
        0x46c3085c938ab51a, 0xa82ccb9199907cee, 0x8c51b6690a3523c8, 0xc4dbd4c9ae518332, 0x979898dbb23db7b2, 0x1b5b585e6f672a9d, 0xce284da7c4903810, 0x841166e8bb5f1c4f,
        0xb7d884a3fceca7d0, 0xa76468f5a4572374, 0xc10c45f49ee9513d, 0x68f9a5663c1908c9, 0x0095a13476a6339d, 0xd1d7516ffbe9c679, 0xfd94ab0c9726f938, 0x627468bbdb27c959,
        0xedc3f8988e4a8c9a, 0x58efd33f0dfaa499, 0x21e37d7e2ef4ac8b, 0x297f9ab5586259c6, 0xda3ba4dc6cb9617d, 0xae11d8d9de2284d2, 0xcfeed88cb3729865, 0xefc2f9e4f03e2633,
        0x8226393e8f0855a4, 0xd6e25fd7acf3a767, 0x435784c3bfd6d14a, 0xf97142e6343fe757, 0xd73b9fe826352f85, 0x6c3ac444b5b2bd76, 0xd8e88f3e9fd4a3fd, 0x31e50875c36f3460,
        0xa824f1bf88cf4d44, 0x54a4d2c8f5f25899, 0xbff254637ce3b1e6, 0xa02cfe92561b3caa, 0x7bedb4edee9f0af7, 0x879c0620ac49a102, 0xa12c4ccd23b332e7, 0x09a5ff47bf94ed1e,
        0x7b62f43cd3046fa0, 0xaa3af0476b9c2fb9, 0x22e55301abebba8e, 0x3a6035c42747bd58, 0x1705373106c8ec07, 0xb1f660de828d0628, 0x065fe82d89ca563d, 0xf555c2d8074d516d,
        0x6bb6c186b423ee99, 0x54a807be6f3120a8, 0x8a3c7fe2f88860b8, 0xbeffc344f5118e81, 0xd686e80b7d1bd268, 0x661aef4ef5e5e88b, 0x5bf256c654cd1dda, 0x9adb1ab85d7640f4,
        0x68449238920833a2, 0x843279f4cebcb044, 0xc8710cdefa93f7bb, 0x236943294538f3e6, 0x80d7d136c486d0b4, 0x61653956b28851d3, 0x3f843be9a9a956b5, 0xf73cfbbf137987e5,
        0xcf0cb6dee8ceac2c, 0x50c401f52f185cae, 0xbdbe89ce735c4c1c, 0xeef3ade9c0570bc7, 0xbe8b066f8f64cbf6, 0x5238d6131705dcb9, 0x20219086c950e9f6, 0x634468d9ed74de02,
        0x0aba4b3d705c7fa5, 0x3374416f725a6672, 0xe7378bdf7beb3bc6, 0x0f7b6a1b1cee565b, 0x234e4c41b0c33e64, 0x4efa9a0c3f21fe28, 0x1167fc551643e514, 0x9f81a69d3eb01fa4,
        0xdb75c22b12306ed0, 0xe25055d738fc9686, 0x9f9f167a3f8507bb, 0x195f8336d3fbe4d3, 0x8442b6feffdcb6f6, 0x1e07ed24746ffde9, 0x140e31462d555266, 0x8bd0ce515ae1406e,
        0x2c0be0042b5584b3, 0x35a23d0e15d45a60, 0xc14f1ba147d9bc83, 0xbbf168691264b23f, 0xad2cc7b57e589ade, 0x9501963154c7815c, 0x9664afa6b8d67d47, 0x7f9e5101fea0a81c,
        0x45ecffb610d25bfd, 0x3157f7aecf9b6ab3, 0xc43ca6f88d87501d, 0x9576ff838dee38dc, 0x93f21afe0ce1c7d7, 0xceac699df343d8f9, 0x2fec49e29f03398d, 0x8805ccd5730281ed,
        0xf9fc16fc750a8e59, 0x35308cc771adf736, 0x4a57b7c9ee2b7def, 0x03a4c6cdc937a02a, 0x6c9a8a269fc8c4fc, 0x4681decec7a03f43, 0x342eecded1353ef9, 0x8be0552d8413a867,
        0xc7b4ac51beda8be8, 0xebcc64fb719842c0, 0xde8e4c7fb6d40c1c, 0xcc8263b62f9738b1, 0xd3cfc0f86511929a, 0x466024ce8bb226ea, 0x459ff690253a3c18, 0x98b27e9d91284c9c,
        0x75c3ae8aa3af373d, 0xfbf8f8e79a866ffc, 0x32327f59d0662799, 0x8228b57e729e9830, 0x065ceb7a18381b58, 0xd2177671a31dc5ff, 0x90cd801f2f8701f9, 0x9d714428471c65fe,
}

What did you see happen?

The second function loads the global var table into a local var before the loop, and never changes it. I'd expect that load to happen once in the generated code, yet Sum0() and Sum1() compile to the same code and load table in every loop iteration:

0000000000468f20 <main.Sum0>:
  468f20:       48 89 44 24 08          mov    %rax,0x8(%rsp)
  468f25:       31 c9                   xor    %ecx,%ecx
  468f27:       31 d2                   xor    %edx,%edx
  468f29:       eb 03                   jmp    468f2e <main.Sum0+0xe>
  468f2b:       48 ff c1                inc    %rcx
  468f2e:       48 39 cb                cmp    %rcx,%rbx
  468f31:       7e 21                   jle    468f54 <main.Sum0+0x34>
  468f33:       0f b6 34 08             movzbl (%rax,%rcx,1),%esi
  468f37:       4c 8d 05 82 9c 08 00    lea    0x89c82(%rip),%r8        # 4f2bc0 <main.table>
  468f3e:       49 8b 34 f0             mov    (%r8,%rsi,8),%rsi
  468f42:       48 8d 14 56             lea    (%rsi,%rdx,2),%rdx
  468f46:       48 85 d7                test   %rdx,%rdi
  468f49:       75 e0                   jne    468f2b <main.Sum0+0xb>
  468f4b:       48 89 c8                mov    %rcx,%rax
  468f4e:       bb 01 00 00 00          mov    $0x1,%ebx
  468f53:       c3                      ret
  468f54:       31 c0                   xor    %eax,%eax
  468f56:       31 db                   xor    %ebx,%ebx
  468f58:       c3                      ret

0000000000468f60 <main.Sum1>:
  468f60:       48 89 44 24 08          mov    %rax,0x8(%rsp)
  468f65:       31 c9                   xor    %ecx,%ecx
  468f67:       31 d2                   xor    %edx,%edx
  468f69:       eb 03                   jmp    468f6e <main.Sum1+0xe>
  468f6b:       48 ff c1                inc    %rcx
  468f6e:       48 39 cb                cmp    %rcx,%rbx
  468f71:       7e 21                   jle    468f94 <main.Sum1+0x34>
  468f73:       0f b6 34 08             movzbl (%rax,%rcx,1),%esi
  468f77:       4c 8d 05 42 9c 08 00    lea    0x89c42(%rip),%r8        # 4f2bc0 <main.table>   <-- !!!
  468f7e:       49 8b 34 f0             mov    (%r8,%rsi,8),%rsi
  468f82:       48 8d 14 56             lea    (%rsi,%rdx,2),%rdx
  468f86:       48 85 d7                test   %rdx,%rdi
  468f89:       75 e0                   jne    468f6b <main.Sum1+0xb>
  468f8b:       48 89 c8                mov    %rcx,%rax
  468f8e:       bb 01 00 00 00          mov    $0x1,%ebx
  468f93:       c3                      ret
  468f94:       31 c0                   xor    %eax,%eax
  468f96:       31 db                   xor    %ebx,%ebx
  468f98:       c3                      ret

What did you expect to see?

Do not move constant expressions into a loop, evaluate them only once.

Even in the case of Sum0() repeated reloads of table look dubious to me. Another goroutine may change table concurrently, but in the absence of locking in Sum0() I'd argue that hoisting the load out of the loop is as racy as reloading it every time. Moreover, table is private to the package so the compiler can know that no code in the package writes to it, nor takes its address, so table is a constant.

The text was updated successfully, but these errors were encountered:

gabyhelp · 2025-01-26T14:28:55Z

Related Issues

_{(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in this discussion.)}

randall77 · 2025-01-26T16:47:12Z

I'd expect that load to happen once in the generated code

That's not a load, it is just an address calculation.

The compiler doesn't try to keep those outside the loop currently. We could, I guess, but there's a tension there between computing the address just once, and using an additional register to hold it.

The worries about raciness don't apply here, as table[:] is just an address calculation, not a load. If table was defined as a slice instead of an array, then the compiler needs to be more careful (and it is).

randall77 · 2025-01-26T16:48:20Z

Do you have a benchmark that demonstrates that the LEA inside the loop makes things slower?

artem-anisimov-0x7f · 2025-01-27T16:12:54Z

Here is the C + asm version of the same loop. sum3() has lea outside the loop, sum4() has lea inside the loop. The benchmark calculates sum3() and sum4() 16 times over an array of 128M bytes. On a Ryzen 3950x, I get the following results:

$ gcc -o test -O2 -fPIE test.c
$ ./test
sum3: 1075.885ms
sum4: 1187.926ms

This is a 10.4% more time because of a lea inside the loop.

The code for the benchmark:

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <err.h>

static uint64_t table[];

uint64_t sum0(uint8_t *xs, uint64_t len, uint64_t mask)
{
        uint64_t sum = 0;
        for (uint64_t i = 0; i < len; ++i) {
                sum = (sum << 1) + table[xs[i]];
                if ((sum & mask) == 0)
                        return i + 1;
        }
        return ~0;
}

uint64_t __attribute__((naked)) sum3(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "jmp sum3start                   \n\t"
                "nop                             \n\t"

                "sum3loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum3start:                      \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum3notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%r8d \n\t"
                "mov (%%r9, %%r8, 8), %%r8       \n\t"
                "lea (%%r8, %%rcx, 2), %%rcx     \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum3loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum3notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

uint64_t __attribute__((naked)) sum4(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "jmp sum4start                   \n\t"
                ".byte 0x0f, 0x1f, 0x84, 0x00    \n\t"
                ".byte 0x00, 0x00, 0x00, 0x00    \n\t"

                "sum4loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum4start:                      \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum4notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%r8d \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "mov (%%r9, %%r8, 8), %%r8       \n\t"
                "lea (%%r8, %%rcx, 2), %%rcx     \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum4loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum4notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

static uint8_t* generate(uint64_t len)
{
        uint8_t *xs = malloc(len);
        xs[0] = 0;

        uint8_t *x = &xs[1];
        uint64_t ctr = 0;
        for (uint64_t i = 1; i < len; ++i) {
                ctr = ctr * 6364136223846793005 + 1442695040888963407;
                *x++ = ctr;
        }

        return xs;
}

static void compare(uint8_t *xs, uint64_t len, uint64_t mask)
{
        while (len > 0) {
                uint64_t a0 = sum0(xs, len, mask),
                        a3 = sum3(xs, len, mask),
                        a4 = sum4(xs, len, mask);

                if (a0 != a3)
                        errx(1, "sum0() and sum3() disagree");
                if (a0 != a4)
                        errx(1, "sum0() and sum4() disagree");

                if (a0 != -1ULL) {
                        xs += a0;
                        len -= a0;
                } else {
                        break;
                }
        }
}

static void measure(const char *name, uint64_t (*fn)(uint8_t *xs, uint64_t len, uint64_t mask), uint8_t *xs, uint64_t len)
{
        struct timespec begin, end;
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &begin);
        for (int i = 0; i < 16; ++i)
                (void) fn(xs, len, -1ULL);
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);

        long long sec = end.tv_sec - begin.tv_sec;
        long long nsec = end.tv_nsec - begin.tv_nsec;
        long long delta = (sec * 1000 * 1000) + (nsec / 1000);

        printf("%s: %.3lfms\n", name, delta / 1000.0);
}

int main()
{
        const uint64_t len = 128 * 1024 * 1024;
        const uint64_t mask = 0x7fULL;

        uint8_t *xs = generate(len);
        compare(xs, len, mask);

        measure("sum3", &sum3, xs, len);
        measure("sum4", &sum4, xs, len);

        return 0;
}

static uint64_t table[] = {
        0xe80e8d55032474b3, 0x11b25b61f5924e15, 0x03aa5bd82a9eb669, 0xc45a153ef107a38c, 0xeac874b86f0f57b9, 0xa5ccedec95ec79c7, 0xe15a3320ad42ac0a, 0x5ed3583fa63cec15,
        0xcd497bf624a4451d, 0xf9ade5b059683605, 0x773940c03fb11ca1, 0xa36b16e4a6ae15b2, 0x67afd1adb5a89eac, 0xc44c75ee32f0038e, 0x2101790f365c0967, 0x76415c64a222fc4a,
        0x579929249a1e577a, 0xe4762fc41fdbf750, 0xea52198e57dfcdcc, 0xe2535aafe30b4281, 0xcb1a1bd6c77c9056, 0x5a1aa9bfc4612a62, 0x15a728aef8943eb5, 0x2f8f09738a8ec8d9,
        0x200f3dec9fac8074, 0x0fa9a7b1e0d318df, 0x06c0804ffd0d8e3a, 0x630cbc412669dd25, 0x10e34f85f4b10285, 0x2a6fe8164b9b6410, 0xcacb57d857d55810, 0x77f8a3a36ff11b46,
        0x66af517e0dc3003e, 0x76c073c789b4009a, 0x853230dbb529f22a, 0x1e9e9c09a1f77e56, 0x1e871223802ee65d, 0x37fe4588718ff813, 0x10088539f30db464, 0x366f7470b80b72d1,
        0x33f2634d9a6b31db, 0xd43917751d69ea18, 0xa0f492bc1aa7b8de, 0x3f94e5a8054edd20, 0xedfd6e25eb8b1dbf, 0x759517a54f196a56, 0xe81d5006ec7b6b17, 0x8dd8385fa894a6b7,
        0x45f4d5467b0d6f91, 0xa1f894699de22bc8, 0x33829d09ef93e0fe, 0x3e29e250caed603c, 0xf7382cba7f63a45e, 0x970f95412bb569d1, 0xc7fcea456d356b4b, 0x723042513f3e7a57,
        0x17ae7688de3596f1, 0x27ac1fcd7cd23c1a, 0xf429beeb78b3f71f, 0xd0780692fb93a3f9, 0x9f507e28a7c9842f, 0x56001ad536e433ae, 0x7e1dd1ecf58be306, 0x15fee353aa233fc6,
        0xb033a0730b7638e8, 0xeb593ad6bd2406d1, 0x7c86502574d0f133, 0xce3b008d4ccb4be7, 0xf8566e3d383594c8, 0xb2c261e9b7af4429, 0xf685e7e253799dbb, 0x05d33ed60a494cbc,
        0xeaf88d55a4cb0d1a, 0x3ee9368a902415a1, 0x8980fe6a8493a9a4, 0x358ed008cb448631, 0xd0cb7e37b46824b8, 0xe9bc375c0bc94f84, 0xea0bf1d8e6b55bb3, 0xb66a60d0f9f6f297,
        0x66db2cc4807b3758, 0x7e4e014afbca8b4d, 0xa5686a4938b0c730, 0xa5f0d7353d623316, 0x26e38c349242d5e8, 0xeeefa80a29858e30, 0x8915cb912aa67386, 0x4b957a47bfc420d4,
        0xbb53d051a895f7e1, 0x09f5e3235f6911ce, 0x416b98e695cfb7ce, 0x97a08183344c5c86, 0xbf68e0791839a861, 0xea05dde59ed3ed56, 0x0ca732280beda160, 0xac748ed62fe7f4e2,
        0xc686da075cf6e151, 0xe1ba5658f4af05c8, 0xe9ff09fbeb67cc35, 0xafaea9470323b28d, 0x0291e8db5bb0ac2a, 0x342072a9bbee77ae, 0x03147eed6b3d0a9c, 0x21379d4de31dbadb,
        0x2388d965226fb986, 0x52c96988bfebabfa, 0xa6fc29896595bc2d, 0x38fa4af70aa46b8b, 0xa688dd13939421ee, 0x99d5275d9b1415da, 0x453d31bb4fe73631, 0xde51debc1fbe3356,
        0x75a3c847a06c622f, 0xe80e32755d272579, 0x5444052250d8ec0d, 0x8f17dfda19580a3b, 0xf6b3e9363a185e42, 0x7a42adec6868732f, 0x32cb6a07629203a2, 0x1eca8957defe56d9,
        0x9fa85e4bc78ff9ed, 0x20ff07224a499ca7, 0x3fa6295ff9682c70, 0xe3d5b1e3ce993eff, 0xa341209362e0b79a, 0x64bd9eae5712ffe8, 0xceebb537babbd12a, 0x5586ef404315954f,
        0x46c3085c938ab51a, 0xa82ccb9199907cee, 0x8c51b6690a3523c8, 0xc4dbd4c9ae518332, 0x979898dbb23db7b2, 0x1b5b585e6f672a9d, 0xce284da7c4903810, 0x841166e8bb5f1c4f,
        0xb7d884a3fceca7d0, 0xa76468f5a4572374, 0xc10c45f49ee9513d, 0x68f9a5663c1908c9, 0x0095a13476a6339d, 0xd1d7516ffbe9c679, 0xfd94ab0c9726f938, 0x627468bbdb27c959,
        0xedc3f8988e4a8c9a, 0x58efd33f0dfaa499, 0x21e37d7e2ef4ac8b, 0x297f9ab5586259c6, 0xda3ba4dc6cb9617d, 0xae11d8d9de2284d2, 0xcfeed88cb3729865, 0xefc2f9e4f03e2633,
        0x8226393e8f0855a4, 0xd6e25fd7acf3a767, 0x435784c3bfd6d14a, 0xf97142e6343fe757, 0xd73b9fe826352f85, 0x6c3ac444b5b2bd76, 0xd8e88f3e9fd4a3fd, 0x31e50875c36f3460,
        0xa824f1bf88cf4d44, 0x54a4d2c8f5f25899, 0xbff254637ce3b1e6, 0xa02cfe92561b3caa, 0x7bedb4edee9f0af7, 0x879c0620ac49a102, 0xa12c4ccd23b332e7, 0x09a5ff47bf94ed1e,
        0x7b62f43cd3046fa0, 0xaa3af0476b9c2fb9, 0x22e55301abebba8e, 0x3a6035c42747bd58, 0x1705373106c8ec07, 0xb1f660de828d0628, 0x065fe82d89ca563d, 0xf555c2d8074d516d,
        0x6bb6c186b423ee99, 0x54a807be6f3120a8, 0x8a3c7fe2f88860b8, 0xbeffc344f5118e81, 0xd686e80b7d1bd268, 0x661aef4ef5e5e88b, 0x5bf256c654cd1dda, 0x9adb1ab85d7640f4,
        0x68449238920833a2, 0x843279f4cebcb044, 0xc8710cdefa93f7bb, 0x236943294538f3e6, 0x80d7d136c486d0b4, 0x61653956b28851d3, 0x3f843be9a9a956b5, 0xf73cfbbf137987e5,
        0xcf0cb6dee8ceac2c, 0x50c401f52f185cae, 0xbdbe89ce735c4c1c, 0xeef3ade9c0570bc7, 0xbe8b066f8f64cbf6, 0x5238d6131705dcb9, 0x20219086c950e9f6, 0x634468d9ed74de02,
        0x0aba4b3d705c7fa5, 0x3374416f725a6672, 0xe7378bdf7beb3bc6, 0x0f7b6a1b1cee565b, 0x234e4c41b0c33e64, 0x4efa9a0c3f21fe28, 0x1167fc551643e514, 0x9f81a69d3eb01fa4,
        0xdb75c22b12306ed0, 0xe25055d738fc9686, 0x9f9f167a3f8507bb, 0x195f8336d3fbe4d3, 0x8442b6feffdcb6f6, 0x1e07ed24746ffde9, 0x140e31462d555266, 0x8bd0ce515ae1406e,
        0x2c0be0042b5584b3, 0x35a23d0e15d45a60, 0xc14f1ba147d9bc83, 0xbbf168691264b23f, 0xad2cc7b57e589ade, 0x9501963154c7815c, 0x9664afa6b8d67d47, 0x7f9e5101fea0a81c,
        0x45ecffb610d25bfd, 0x3157f7aecf9b6ab3, 0xc43ca6f88d87501d, 0x9576ff838dee38dc, 0x93f21afe0ce1c7d7, 0xceac699df343d8f9, 0x2fec49e29f03398d, 0x8805ccd5730281ed,
        0xf9fc16fc750a8e59, 0x35308cc771adf736, 0x4a57b7c9ee2b7def, 0x03a4c6cdc937a02a, 0x6c9a8a269fc8c4fc, 0x4681decec7a03f43, 0x342eecded1353ef9, 0x8be0552d8413a867,
        0xc7b4ac51beda8be8, 0xebcc64fb719842c0, 0xde8e4c7fb6d40c1c, 0xcc8263b62f9738b1, 0xd3cfc0f86511929a, 0x466024ce8bb226ea, 0x459ff690253a3c18, 0x98b27e9d91284c9c,
        0x75c3ae8aa3af373d, 0xfbf8f8e79a866ffc, 0x32327f59d0662799, 0x8228b57e729e9830, 0x065ceb7a18381b58, 0xd2177671a31dc5ff, 0x90cd801f2f8701f9, 0x9d714428471c65fe,
};

There is one difference. movzbl is one byte longer in my version because it moves into %r8d instead of %esi, but I think that should not affect the result.

artem-anisimov-0x7f · 2025-01-27T17:35:50Z

There is another interesting thing. Consider the following code. It has the same loop as Sum0() in my initial example, but it convinces the register allocator to insert a move that could be avoided (I have minimised it from a real example to keep the move):

type S struct {
        minSize  uint
        maxSize  uint
        normSize uint

        maskS uint64
        maskL uint64

        scanned uint
        sum     uint64
}

//go:noinline
func (s *S) Sum(xs []byte) (uint, bool) {
        l := uint(len(xs))
        sum := s.sum

        var i uint
        if s.scanned < s.minSize {
                i = s.minSize - s.scanned
        }

        n := min(l, s.normSize-s.scanned)
        mask := s.maskS
        for ; i < n; i++ {
                sum = (sum << 1) + table[xs[i]]
                if (sum & mask) == 0 {
                        return i + 1, true
                }
        }

        if i+s.scanned == s.maxSize {
                return s.maxSize, true
        }

        return l, false
}

The loop compiles to the following code:

  468fd3:       48 ff c6                inc    %rsi
  468fd6:       49 89 f8                mov    %rdi,%r8
  468fd9:       48 39 ce                cmp    %rcx,%rsi
  468fdc:       73 23                   jae    469001 <main.(*S).Sum+0x61>
  468fde:       0f b6 3c 33             movzbl (%rbx,%rsi,1),%edi
  468fe2:       4c 8d 1d d7 9b 08 00    lea    0x89bd7(%rip),%r11        # 4f2bc0 <main.table>
  468fe9:       49 8b 3c fb             mov    (%r11,%rdi,8),%rdi
  468fed:       4a 8d 3c 47             lea    (%rdi,%r8,2),%rdi
  468ff1:       49 85 f9                test   %rdi,%r9
  468ff4:       75 dd                   jne    468fd3 <main.(*S).Sum+0x33>

mov %rdi, %r8 is only needed because of movzbl at 468fde and mov at 468fe9. Those two instructions could use %r8 instead of %rdi.

The C + asm code for this version of the loop (sum1() has lea outside the loop and sum2() has lea inside the loop):

uint64_t __attribute__((naked)) sum1(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "jmp sum1start                   \n\t"
                "nop                             \n\t"

                "sum1loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum1start:                      \n\t"
                "mov %%rcx, %%r10                \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum1notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%ecx \n\t"
                "mov (%%r9, %%rcx, 8), %%rcx     \n\t"
                "lea (%%rcx, %%r10, 2), %%rcx    \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum1loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum1notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

uint64_t __attribute__((naked)) sum2(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "jmp sum2start                   \n\t"
                ".byte 0x0f, 0x1f, 0x84, 0x00    \n\t"
                ".byte 0x00, 0x00, 0x00, 0x00    \n\t"

                "sum2loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum2start:                      \n\t"
                "mov %%rcx, %%r10                \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum2notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%ecx \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "mov (%%r9, %%rcx, 8), %%rcx     \n\t"
                "lea (%%rcx, %%r10, 2), %%rcx    \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum2loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum2notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

The difference of the run time between the two is

$ gcc -o test -O2 -fPIE test.c
$ ./test
sum1: 1063.284ms
sum2: 1291.752ms

For this version of the loop, the difference between lea inside and outside the loop is no longer 10.4% as in the previous comment, but 21.5%.

Jorropo · 2025-01-29T11:15:39Z

I couldn't believe that a random LEA would slow down this benchmark, it has no dependency and in real life you almost always have one free ALU or AGU unit to execute the LEA in parallel of other instructions ahead of when you need it.
So I proceeded to create a benchmark proving this is a loop alignment issue as this almost always explain weird performance differences.

Source code: https://go.dev/play/p/dwi-EzN2oTo

You also need to patch the assembler to use <ABIInternal>:

diff --git a/src/cmd/asm/internal/asm/parse.go b/src/cmd/asm/internal/asm/parse.go
index 638f4e2fc4..0bc8d1e3ce 100644
--- a/src/cmd/asm/internal/asm/parse.go
+++ b/src/cmd/asm/internal/asm/parse.go
@@ -68,7 +68,7 @@ func NewParser(ctxt *obj.Link, ar *arch.Arch, lexer lex.TokenReader) *Parser {
                labels:      make(map[string]*obj.Prog),
                dataAddr:    make(map[string]int64),
                errorWriter: os.Stderr,
-               allowABI:    ctxt != nil && objabi.LookupPkgSpecial(ctxt.Pkgpath).AllowAsmABI,
+               allowABI:    true,
                pkgPrefix:   pkgPrefix,
        }
 }

This test a 2×4 configuration space:

Alignment	LEA in body	LEA in header
0	`Sum0`	`Sum1`
64	`Sum2`	`Sum3`
32	`Sum4`	`Sum5`
16	`Sum6`	`Sum7`

Here are results for 10 interleaved runs:

goos: linux
goarch: amd64
pkg: aaa
cpu: AMD Ryzen 5 3600 6-Core Processor              
        │   result    │
        │   sec/op    │
Sum0-12   9.583n ± 1%
Sum1-12   8.495n ± 1%
Sum2-12   9.574n ± 1%
Sum3-12   8.753n ± 1%
Sum4-12   9.533n ± 1%
Sum5-12   8.718n ± 1%
Sum6-12   9.544n ± 1%
Sum7-12   8.630n ± 1%
geomean   9.092n

Alignment	LEA in body	LEA in header
0	9.5	8.4
64	9.5	8.7
32	9.5	8.7
16	9.5	8.6

If anything my conclusion is that for this exact function, having the LEA in the body is in fact significantly slower.

gopherbot added the compiler/runtime Issues related to the Go compiler and/or runtime. label Jan 26, 2025

gabyhelp added the BugReport Issues describing a possible bug in the Go implementation. label Jan 26, 2025

randall77 added the Performance label Jan 26, 2025

cagedmantis added this to the Backlog milestone Jan 27, 2025

cagedmantis added the NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. label Jan 27, 2025

mknyszek added this to Go Compiler / Runtime Jan 29, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cmd/compile: a constant expression is moved into a loop #71443

cmd/compile: a constant expression is moved into a loop #71443

artem-anisimov-0x7f commented Jan 26, 2025

gabyhelp commented Jan 26, 2025

randall77 commented Jan 26, 2025

randall77 commented Jan 26, 2025

artem-anisimov-0x7f commented Jan 27, 2025

artem-anisimov-0x7f commented Jan 27, 2025

Jorropo commented Jan 29, 2025 •

edited

Loading

cmd/compile: a constant expression is moved into a loop #71443

cmd/compile: a constant expression is moved into a loop #71443

Comments

artem-anisimov-0x7f commented Jan 26, 2025

Go version

Output of go env in your module/workspace:

What did you do?

What did you see happen?

What did you expect to see?

gabyhelp commented Jan 26, 2025

randall77 commented Jan 26, 2025

randall77 commented Jan 26, 2025

artem-anisimov-0x7f commented Jan 27, 2025

artem-anisimov-0x7f commented Jan 27, 2025

Jorropo commented Jan 29, 2025 • edited Loading

Output of `go env` in your module/workspace:

Jorropo commented Jan 29, 2025 •

edited

Loading