cmd/compile: pair loads and stores on arm64
Look for possible paired load/store operations on arm64.
I don't expect this to be a lot faster, but it will save
binary space and, indirectly through the icache, at least
a bit of time.
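
For example (an illustrative sketch, not taken from this CL), in

	type point struct{ x, y int64 }

	func sum(p *point) int64 {
		return p.x + p.y
	}

the two 8-byte loads of p.x and p.y are at offsets 0 and 8 from the
same pointer, so they can be emitted as a single LDP instead of two
LDR instructions.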

Change-Id: I4dd73b0e6329c4659b7453998f9b75320fcf380b
Reviewed-on: https://go-review.googlesource.com/c/go/+/629256
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: David Chase <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
randall77 committed Feb 13, 2025
1 parent 0b88a87 commit 20d7c57
Showing 4 changed files with 463 additions and 5 deletions.
1 change: 1 addition & 0 deletions src/cmd/compile/internal/ssa/compile.go
@@ -488,6 +488,7 @@ var passes = [...]pass{
{name: "lower", fn: lower, required: true},
{name: "addressing modes", fn: addressingModes, required: false},
{name: "late lower", fn: lateLower, required: true},
{name: "pair", fn: pair},
{name: "lowered deadcode for cse", fn: deadcode}, // deadcode immediately before CSE avoids CSE making dead values live again
{name: "lowered cse", fn: cse},
{name: "elim unread autos", fn: elimUnreadAutos},
357 changes: 357 additions & 0 deletions src/cmd/compile/internal/ssa/pair.go
@@ -0,0 +1,357 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
	"cmd/compile/internal/ir"
	"cmd/compile/internal/types"
	"slices"
)

// The pair pass finds memory operations that can be paired up
// into single 2-register memory instructions.
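// For example, two 8-byte loads at offsets o and o+8 from the same
// pointer become one LDP, with the two results recovered via Select0
// and Select1; two adjacent stores likewise become one STP.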
func pair(f *Func) {
	// Only arm64 for now. This pass is fairly arch-specific.
	switch f.Config.arch {
	case "arm64":
	default:
		return
	}
	pairLoads(f)
	pairStores(f)
}

type pairableLoadInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op
}

// All pairableLoad ops must take 2 arguments, a pointer and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableLoads = map[Op]pairableLoadInfo{
	OpARM64MOVDload:  {8, OpARM64LDP},
	OpARM64MOVWload:  {4, OpARM64LDPW},
	OpARM64FMOVDload: {8, OpARM64FLDPD},
	OpARM64FMOVSload: {4, OpARM64FLDPS},
}

type pairableStoreInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op
}

// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
// They must also take an offset in Aux/AuxInt.
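// For example, two MOVDstores to offsets o and o+8 off the same
// pointer can be replaced by a single STP at offset o.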
var pairableStores = map[Op]pairableStoreInfo{
	OpARM64MOVDstore:  {8, OpARM64STP},
	OpARM64MOVWstore:  {4, OpARM64STPW},
	OpARM64FMOVDstore: {8, OpARM64FSTPD},
	OpARM64FMOVSstore: {4, OpARM64FSTPS},
	// TODO: storezero variants.
}

// offsetOk returns true if a pair instruction should be used
// for the offset Aux+off, when the data width (of the
// unpaired instructions) is width.
// This function is best-effort. The compiled function must
// still work if offsetOk always returns true.
// TODO: this is currently arm64-specific.
func offsetOk(aux Aux, off, width int64) bool {
	if true {
		// Seems to generate slightly smaller code if we just
		// always allow this rewrite.
		//
		// Without pairing, we have 2 load instructions, like:
		//   LDR 88(R0), R1
		//   LDR 96(R0), R2
		// With pairing we have, best case:
		//   LDP 88(R0), R1, R2
		// but maybe we need an adjuster if out of range or unaligned:
		//   ADD R0, $88, R27
		//   LDP (R27), R1, R2
		// Even with the adjuster, it is at least no worse.
		//
		// A similar situation occurs when accessing globals.
		// Two loads from globals require 4 instructions,
		// two ADRP and two LDR. With pairing, we need
		// ADRP+ADD+LDP, three instructions.
		//
		// With pairing, it looks like the critical path might
		// be a little bit longer. But it should never be more
		// instructions.
		// TODO: see if that longer critical path causes any
		// regressions.
		return true
	}
	if aux != nil {
		if _, ok := aux.(*ir.Name); !ok {
			// Offset is probably too big (globals).
			return false
		}
		// We let *ir.Names pass here, as
		// they are probably small offsets from SP.
		// There's no guarantee that we're in range
		// in that case though (we don't know the
		// stack frame size yet), so the assembler
		// might need to issue fixup instructions.
		// Assume some small frame size.
		if off >= 0 {
			off += 120
		}
		// TODO: figure out how often this helps vs. hurts.
	}
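	// LDP/STP offsets must fit in a 7-bit signed immediate scaled
	// by the element width: multiples of 4 in [-256,252], or
	// multiples of 8 in [-512,504].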
	switch width {
	case 4:
		if off >= -256 && off <= 252 && off%4 == 0 {
			return true
		}
	case 8:
		if off >= -512 && off <= 504 && off%8 == 0 {
			return true
		}
	}
	return false
}

func pairLoads(f *Func) {
	var loads []*Value

	// Registry of aux values for sorting.
	auxIDs := map[Aux]int{}
	auxID := func(aux Aux) int {
		id, ok := auxIDs[aux]
		if !ok {
			id = len(auxIDs)
			auxIDs[aux] = id
		}
		return id
	}
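	// Aux values have no inherent order, so first-seen IDs are an
	// arbitrary but consistent tiebreaker; all the sort below needs
	// is a total order in which loads with equal Auxes end up adjacent.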

	for _, b := range f.Blocks {
		// Find loads.
		loads = loads[:0]
		clear(auxIDs)
		for _, v := range b.Values {
			info := pairableLoads[v.Op]
			if info.width == 0 {
				continue // not pairable
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // not advisable
			}
			loads = append(loads, v)
		}
		if len(loads) < 2 {
			continue
		}

		// Sort to put pairable loads together.
		slices.SortFunc(loads, func(x, y *Value) int {
			// First sort by op, ptr, and memory arg.
			if x.Op != y.Op {
				return int(x.Op - y.Op)
			}
			if x.Args[0].ID != y.Args[0].ID {
				return int(x.Args[0].ID - y.Args[0].ID)
			}
			if x.Args[1].ID != y.Args[1].ID {
				return int(x.Args[1].ID - y.Args[1].ID)
			}
			// Then sort by aux. (nil first, then by aux ID)
			if x.Aux != nil {
				if y.Aux == nil {
					return 1
				}
				a, b := auxID(x.Aux), auxID(y.Aux)
				if a != b {
					return a - b
				}
			} else if y.Aux != nil {
				return -1
			}
			// Then sort by offset, low to high.
			return int(x.AuxInt - y.AuxInt)
		})
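		// After sorting, pairable loads share op, pointer, memory,
		// and aux, and sit at consecutive offsets, so scanning
		// adjacent entries below finds every candidate pair.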

		// Look for pairable loads.
		for i := 0; i < len(loads)-1; i++ {
			x := loads[i]
			y := loads[i+1]
			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
				continue
			}
			if x.Aux != y.Aux {
				continue
			}
			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
				continue
			}

			// Commit point.

			// Make the 2-register load.
			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

			// Modify x to be (Select0 load). Similar for y.
			x.reset(OpSelect0)
			x.SetArgs1(load)
			y.reset(OpSelect1)
			y.SetArgs1(load)

			i++ // Skip y next time around the loop.
		}
	}
}

func pairStores(f *Func) {
	last := f.Cache.allocBoolSlice(f.NumValues())
	defer f.Cache.freeBoolSlice(last)

	// prevStore returns the previous store in the
	// same block, or nil if there are none.
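	// (Stores in a block are linked through their memory arguments,
	// so following MemoryArg steps back one store at a time.)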
	prevStore := func(v *Value) *Value {
		if v.Op == OpInitMem || v.Op == OpPhi {
			return nil
		}
		m := v.MemoryArg()
		if m.Block != v.Block {
			return nil
		}
		return m
	}

	for _, b := range f.Blocks {
		// Find last store in block, so we can
		// walk the stores last to first.
		// Last to first helps ensure that the rewrites we
		// perform do not get in the way of subsequent rewrites.
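		// Mark every memory-producing value, then unmark each one
		// that has a successor in the block's store chain; the value
		// left marked is the block's last store.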
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				last[v.ID] = true
			}
		}
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				if m := prevStore(v); m != nil {
					last[m.ID] = false
				}
			}
		}
		var lastMem *Value
		for _, v := range b.Values {
			if last[v.ID] {
				lastMem = v
				break
			}
		}

		// Check all stores, from last to first.
	memCheck:
		for v := lastMem; v != nil; v = prevStore(v) {
			info := pairableStores[v.Op]
			if info.width == 0 {
				continue // Not pairable.
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // Not advisable to pair.
			}
			ptr := v.Args[0]
			val := v.Args[1]
			mem := v.Args[2]
			off := v.AuxInt
			aux := v.Aux

			// Look for earlier store we can combine with.
			lowerOk := true
			higherOk := true
			count := 10 // max lookback distance
			for w := prevStore(v); w != nil; w = prevStore(w) {
				if w.Uses != 1 {
					// We can't combine stores if the earlier
					// store has any use besides the next one
					// in the store chain.
					// (Unless we could check the aliasing of
					// all those other uses.)
					continue memCheck
				}
				if w.Op == v.Op &&
					w.Args[0] == ptr &&
					w.Aux == aux &&
					(lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
					// This op is mergeable with v.

					// Commit point.

					// ptr val1 val2 mem
					args := []*Value{ptr, val, w.Args[1], mem}
					if w.AuxInt == off-info.width {
						args[1], args[2] = args[2], args[1]
						off -= info.width
					}
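					// (If w stored to the lower address, its value
					// now comes first and the pair takes w's offset.)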
					v.reset(info.pair)
					v.AddArgs(args...)
					v.Aux = aux
					v.AuxInt = off
					v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)

					// Make w just a memory copy.
					wmem := w.MemoryArg()
					w.reset(OpCopy)
					w.SetArgs1(wmem)
					continue memCheck
				}
				if count--; count == 0 {
					// Only look back so far.
					// This keeps us in O(n) territory, and it
					// also prevents us from keeping values
					// in registers for too long (and thus
					// needing to spill them).
					continue memCheck
				}
				// We're now looking at a store w which is currently
				// between the store v that we're intending to merge into,
				// and the store we'll eventually find to merge with it.
				// Make sure this store doesn't alias with the one
				// we'll be moving.
				var width int64
				switch w.Op {
				case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
					width = 8
				case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
					width = 4
				case OpARM64MOVHstore, OpARM64MOVHstorezero:
					width = 2
				case OpARM64MOVBstore, OpARM64MOVBstorezero:
					width = 1
				case OpCopy:
					continue // this was a store we merged earlier
				default:
					// Can't reorder with any other memory operations.
					// (atomics, calls, ...)
					continue memCheck
				}

				// We only allow reordering with respect to other
				// writes to the same pointer and aux, so we can
				// compute the exact aliasing relationship.
				if w.Args[0] != ptr || w.Aux != aux {
					continue memCheck
				}
				if overlap(w.AuxInt, width, off-info.width, info.width) {
					// Aliases with slot before v's location.
					lowerOk = false
				}
				if overlap(w.AuxInt, width, off+info.width, info.width) {
					// Aliases with slot after v's location.
					higherOk = false
				}
				if !higherOk && !lowerOk {
					continue memCheck
				}
			}
		}
	}
}