cmd/compile: pair loads and stores on arm64
Look for possible paired load/store operations on arm64.
I don't expect this to be a lot faster, but it will save
binary space and, indirectly through the icache, at least
a bit of time.
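
For example (an illustrative sketch, not taken from this CL), in

	type point struct{ x, y int64 }

	func sum(p *point) int64 {
		return p.x + p.y
	}

the two 8-byte loads of p.x and p.y are at offsets 0 and 8 from the
same pointer, so they can be emitted as a single LDP instead of two
LDR instructions.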

Change-Id: I4dd73b0e6329c4659b7453998f9b75320fcf380b
Reviewed-on: https://go-review.googlesource.com/c/go/+/629256
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: David Chase <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
randall77 committed Feb 13, 2025
1 parent 0b88a87 commit 20d7c57
Showing 4 changed files with 463 additions and 5 deletions.
1 change: 1 addition & 0 deletions src/cmd/compile/internal/ssa/compile.go
@@ -488,6 +488,7 @@ var passes = [...]pass{
{name: "lower", fn: lower, required: true},
{name: "addressing modes", fn: addressingModes, required: false},
{name: "late lower", fn: lateLower, required: true},
{name: "pair", fn: pair},
{name: "lowered deadcode for cse", fn: deadcode}, // deadcode immediately before CSE avoids CSE making dead values live again
{name: "lowered cse", fn: cse},
{name: "elim unread autos", fn: elimUnreadAutos},
357 changes: 357 additions & 0 deletions src/cmd/compile/internal/ssa/pair.go
@@ -0,0 +1,357 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
	"cmd/compile/internal/ir"
	"cmd/compile/internal/types"
	"slices"
)

// The pair pass finds memory operations that can be paired up
// into single 2-register memory instructions.
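// For example, two 8-byte loads at offsets o and o+8 from the same
// pointer become one LDP, with the two results recovered via Select0
// and Select1; two adjacent stores likewise become one STP.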
func pair(f *Func) {
	// Only arm64 for now. This pass is fairly arch-specific.
	switch f.Config.arch {
	case "arm64":
	default:
		return
	}
	pairLoads(f)
	pairStores(f)
}

type pairableLoadInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op
}

// All pairableLoad ops must take 2 arguments, a pointer and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableLoads = map[Op]pairableLoadInfo{
	OpARM64MOVDload:  {8, OpARM64LDP},
	OpARM64MOVWload:  {4, OpARM64LDPW},
	OpARM64FMOVDload: {8, OpARM64FLDPD},
	OpARM64FMOVSload: {4, OpARM64FLDPS},
}

type pairableStoreInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op
}

// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
// They must also take an offset in Aux/AuxInt.
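// For example, two MOVDstores to offsets o and o+8 off the same
// pointer can be replaced by a single STP at offset o.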
var pairableStores = map[Op]pairableStoreInfo{
	OpARM64MOVDstore:  {8, OpARM64STP},
	OpARM64MOVWstore:  {4, OpARM64STPW},
	OpARM64FMOVDstore: {8, OpARM64FSTPD},
	OpARM64FMOVSstore: {4, OpARM64FSTPS},
	// TODO: storezero variants.
}

// offsetOk returns true if a pair instruction should be used
// for the offset Aux+off, when the data width (of the
// unpaired instructions) is width.
// This function is best-effort. The compiled function must
// still work if offsetOk always returns true.
// TODO: this is currently arm64-specific.
func offsetOk(aux Aux, off, width int64) bool {
	if true {
		// Seems to generate slightly smaller code if we just
		// always allow this rewrite.
		//
		// Without pairing, we have 2 load instructions, like:
		//   LDR 88(R0), R1
		//   LDR 96(R0), R2
		// With pairing we have, best case:
		//   LDP 88(R0), R1, R2
		// but maybe we need an adjuster if out of range or unaligned:
		//   ADD R0, $88, R27
		//   LDP (R27), R1, R2
		// Even with the adjuster, it is at least no worse.
		//
		// A similar situation occurs when accessing globals.
		// Two loads from globals require 4 instructions,
		// two ADRP and two LDR. With pairing, we need
		// ADRP+ADD+LDP, three instructions.
		//
		// With pairing, it looks like the critical path might
		// be a little bit longer. But it should never be more
		// instructions.
		// TODO: see if that longer critical path causes any
		// regressions.
		return true
	}
	if aux != nil {
		if _, ok := aux.(*ir.Name); !ok {
			// Offset is probably too big (globals).
			return false
		}
		// We let *ir.Names pass here, as
		// they are probably small offsets from SP.
		// There's no guarantee that we're in range
		// in that case though (we don't know the
		// stack frame size yet), so the assembler
		// might need to issue fixup instructions.
		// Assume some small frame size.
		if off >= 0 {
			off += 120
		}
		// TODO: figure out how often this helps vs. hurts.
	}
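	// LDP/STP offsets must fit in a 7-bit signed immediate scaled
	// by the element width: multiples of 4 in [-256,252], or
	// multiples of 8 in [-512,504].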
	switch width {
	case 4:
		if off >= -256 && off <= 252 && off%4 == 0 {
			return true
		}
	case 8:
		if off >= -512 && off <= 504 && off%8 == 0 {
			return true
		}
	}
	return false
}

func pairLoads(f *Func) {
	var loads []*Value

	// Registry of aux values for sorting.
	auxIDs := map[Aux]int{}
	auxID := func(aux Aux) int {
		id, ok := auxIDs[aux]
		if !ok {
			id = len(auxIDs)
			auxIDs[aux] = id
		}
		return id
	}
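	// Aux values have no inherent order, so first-seen IDs are an
	// arbitrary but consistent tiebreaker; all the sort below needs
	// is a total order in which loads with equal Auxes end up adjacent.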

	for _, b := range f.Blocks {
		// Find loads.
		loads = loads[:0]
		clear(auxIDs)
		for _, v := range b.Values {
			info := pairableLoads[v.Op]
			if info.width == 0 {
				continue // not pairable
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // not advisable
			}
			loads = append(loads, v)
		}
		if len(loads) < 2 {
			continue
		}

		// Sort to put pairable loads together.
		slices.SortFunc(loads, func(x, y *Value) int {
			// First sort by op, ptr, and memory arg.
			if x.Op != y.Op {
				return int(x.Op - y.Op)
			}
			if x.Args[0].ID != y.Args[0].ID {
				return int(x.Args[0].ID - y.Args[0].ID)
			}
			if x.Args[1].ID != y.Args[1].ID {
				return int(x.Args[1].ID - y.Args[1].ID)
			}
			// Then sort by aux. (nil first, then by aux ID)
			if x.Aux != nil {
				if y.Aux == nil {
					return 1
				}
				a, b := auxID(x.Aux), auxID(y.Aux)
				if a != b {
					return a - b
				}
			} else if y.Aux != nil {
				return -1
			}
			// Then sort by offset, low to high.
			return int(x.AuxInt - y.AuxInt)
		})
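		// After sorting, pairable loads share op, pointer, memory,
		// and aux, and sit at consecutive offsets, so scanning
		// adjacent entries below finds every candidate pair.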

		// Look for pairable loads.
		for i := 0; i < len(loads)-1; i++ {
			x := loads[i]
			y := loads[i+1]
			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
				continue
			}
			if x.Aux != y.Aux {
				continue
			}
			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
				continue
			}

			// Commit point.

			// Make the 2-register load.
			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

			// Modify x to be (Select0 load). Similar for y.
			x.reset(OpSelect0)
			x.SetArgs1(load)
			y.reset(OpSelect1)
			y.SetArgs1(load)

			i++ // Skip y next time around the loop.
		}
	}
}

func pairStores(f *Func) {
	last := f.Cache.allocBoolSlice(f.NumValues())
	defer f.Cache.freeBoolSlice(last)

	// prevStore returns the previous store in the
	// same block, or nil if there are none.
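	// (Stores in a block are linked through their memory arguments,
	// so following MemoryArg steps back one store at a time.)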
	prevStore := func(v *Value) *Value {
		if v.Op == OpInitMem || v.Op == OpPhi {
			return nil
		}
		m := v.MemoryArg()
		if m.Block != v.Block {
			return nil
		}
		return m
	}

	for _, b := range f.Blocks {
		// Find last store in block, so we can
		// walk the stores last to first.
		// Last to first helps ensure that the rewrites we
		// perform do not get in the way of subsequent rewrites.
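		// Mark every memory-producing value, then unmark each one
		// that has a successor in the block's store chain; the value
		// left marked is the block's last store.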
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				last[v.ID] = true
			}
		}
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				if m := prevStore(v); m != nil {
					last[m.ID] = false
				}
			}
		}
		var lastMem *Value
		for _, v := range b.Values {
			if last[v.ID] {
				lastMem = v
				break
			}
		}

		// Check all stores, from last to first.
	memCheck:
		for v := lastMem; v != nil; v = prevStore(v) {
			info := pairableStores[v.Op]
			if info.width == 0 {
				continue // Not pairable.
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // Not advisable to pair.
			}
			ptr := v.Args[0]
			val := v.Args[1]
			mem := v.Args[2]
			off := v.AuxInt
			aux := v.Aux

			// Look for earlier store we can combine with.
			lowerOk := true
			higherOk := true
			count := 10 // max lookback distance
			for w := prevStore(v); w != nil; w = prevStore(w) {
				if w.Uses != 1 {
					// We can't combine stores if the earlier
					// store has any use besides the next one
					// in the store chain.
					// (Unless we could check the aliasing of
					// all those other uses.)
					continue memCheck
				}
				if w.Op == v.Op &&
					w.Args[0] == ptr &&
					w.Aux == aux &&
					(lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
					// This op is mergeable with v.

					// Commit point.

					// ptr val1 val2 mem
					args := []*Value{ptr, val, w.Args[1], mem}
					if w.AuxInt == off-info.width {
						args[1], args[2] = args[2], args[1]
						off -= info.width
					}
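					// (If w stored to the lower address, its value
					// now comes first and the pair takes w's offset.)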
					v.reset(info.pair)
					v.AddArgs(args...)
					v.Aux = aux
					v.AuxInt = off
					v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)

					// Make w just a memory copy.
					wmem := w.MemoryArg()
					w.reset(OpCopy)
					w.SetArgs1(wmem)
					continue memCheck
				}
				if count--; count == 0 {
					// Only look back so far.
					// This keeps us in O(n) territory, and it
					// also prevents us from keeping values
					// in registers for too long (and thus
					// needing to spill them).
					continue memCheck
				}
				// We're now looking at a store w which is currently
				// between the store v that we're intending to merge into,
				// and the store we'll eventually find to merge with it.
				// Make sure this store doesn't alias with the one
				// we'll be moving.
				var width int64
				switch w.Op {
				case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
					width = 8
				case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
					width = 4
				case OpARM64MOVHstore, OpARM64MOVHstorezero:
					width = 2
				case OpARM64MOVBstore, OpARM64MOVBstorezero:
					width = 1
				case OpCopy:
					continue // this was a store we merged earlier
				default:
					// Can't reorder with any other memory operations.
					// (atomics, calls, ...)
					continue memCheck
				}

				// We only allow reordering with respect to other
				// writes to the same pointer and aux, so we can
				// compute the exact aliasing relationship.
				if w.Args[0] != ptr || w.Aux != aux {
					continue memCheck
				}
				if overlap(w.AuxInt, width, off-info.width, info.width) {
					// Aliases with slot before v's location.
					lowerOk = false
				}
				if overlap(w.AuxInt, width, off+info.width, info.width) {
					// Aliases with slot after v's location.
					higherOk = false
				}
				if !higherOk && !lowerOk {
					continue memCheck
				}
			}
		}
	}
}