-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
cmd/compile: pair loads and stores on arm64
Look for possible paired load/store operations on arm64. I don't expect this would be a lot faster, but it will save binary space, and indirectly through the icache at least a bit of time. Change-Id: I4dd73b0e6329c4659b7453998f9b75320fcf380b Reviewed-on: https://go-review.googlesource.com/c/go/+/629256 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: David Chase <[email protected]> Auto-Submit: Keith Randall <[email protected]> Reviewed-by: Keith Randall <[email protected]>
- Loading branch information
Showing
4 changed files
with
463 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,357 @@ | ||
// Copyright 2016 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package ssa | ||
|
||
import ( | ||
"cmd/compile/internal/ir" | ||
"cmd/compile/internal/types" | ||
"slices" | ||
) | ||
|
||
// The pair pass finds memory operations that can be paired up | ||
// into single 2-register memory instructions. | ||
func pair(f *Func) { | ||
// Only arm64 for now. This pass is fairly arch-specific. | ||
switch f.Config.arch { | ||
case "arm64": | ||
default: | ||
return | ||
} | ||
pairLoads(f) | ||
pairStores(f) | ||
} | ||
|
||
// pairableLoadInfo describes how to combine two adjacent loads of the
// same op into a single 2-register load instruction.
type pairableLoadInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op    // the 2-register load op to rewrite the pair into
}
|
||
// pairableLoads maps each single-register load op that we know how to
// pair to the corresponding 2-register (load-pair) op.
// All pairableLoad ops must take 2 arguments, a pointer and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableLoads = map[Op]pairableLoadInfo{
	OpARM64MOVDload:  {8, OpARM64LDP},
	OpARM64MOVWload:  {4, OpARM64LDPW},
	OpARM64FMOVDload: {8, OpARM64FLDPD},
	OpARM64FMOVSload: {4, OpARM64FLDPS},
}
|
||
// pairableStoreInfo describes how to combine two adjacent stores of the
// same op into a single 2-register store instruction.
type pairableStoreInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op    // the 2-register store op to rewrite the pair into
}
|
||
// pairableStores maps each single-register store op that we know how to
// pair to the corresponding 2-register (store-pair) op.
// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableStores = map[Op]pairableStoreInfo{
	OpARM64MOVDstore:  {8, OpARM64STP},
	OpARM64MOVWstore:  {4, OpARM64STPW},
	OpARM64FMOVDstore: {8, OpARM64FSTPD},
	OpARM64FMOVSstore: {4, OpARM64FSTPS},
	// TODO: storezero variants.
}
|
||
// offsetOk returns true if a pair instruction should be used
// for the offset Aux+off, when the data width (of the
// unpaired instructions) is width.
// This function is best-effort. The compiled function must
// still work if offsetOk always returns true.
// TODO: this is currently arm64-specific.
func offsetOk(aux Aux, off, width int64) bool {
	// NOTE(review): this early return makes everything below it dead
	// code; it is deliberately kept around so the heuristic can be
	// re-enabled by flipping this condition for experiments.
	if true {
		// Seems to generate slightly smaller code if we just
		// always allow this rewrite.
		//
		// Without pairing, we have 2 load instructions, like:
		//   LDR 88(R0), R1
		//   LDR 96(R0), R2
		// with pairing we have, best case:
		//   LDP 88(R0), R1, R2
		// but maybe we need an adjuster if out of range or unaligned:
		//   ADD R0, $88, R27
		//   LDP (R27), R1, R2
		// Even with the adjuster, it is at least no worse.
		//
		// A similar situation occurs when accessing globals.
		// Two loads from globals requires 4 instructions,
		// two ADRP and two LDR. With pairing, we need
		// ADRP+ADD+LDP, three instructions.
		//
		// With pairing, it looks like the critical path might
		// be a little bit longer. But it should never be more
		// instructions.
		// TODO: see if that longer critical path causes any
		// regressions.
		return true
	}
	if aux != nil {
		if _, ok := aux.(*ir.Name); !ok {
			// Offset is probably too big (globals).
			return false
		}
		// We let *ir.Names pass here, as
		// they are probably small offsets from SP.
		// There's no guarantee that we're in range
		// in that case though (we don't know the
		// stack frame size yet), so the assembler
		// might need to issue fixup instructions.
		// Assume some small frame size.
		if off >= 0 {
			off += 120
		}
		// TODO: figure out how often this helps vs. hurts.
	}
	// Offsets must fit the paired instruction's immediate: an
	// element-width-aligned value in [-64*width, 63*width]
	// (presumably the signed 7-bit scaled immediate of the
	// pair instructions — TODO confirm against the ISA).
	switch width {
	case 4:
		if off >= -256 && off <= 252 && off%4 == 0 {
			return true
		}
	case 8:
		if off >= -512 && off <= 504 && off%8 == 0 {
			return true
		}
	}
	return false
}
|
||
// pairLoads rewrites pairs of loads within a block that access adjacent
// memory (same op, pointer, memory arg, and aux, with offsets exactly
// one element width apart) into a single 2-register load, replacing the
// originals with Select0/Select1 of the new tuple-typed value.
func pairLoads(f *Func) {
	var loads []*Value

	// Registry of aux values for sorting.
	// Assigns each distinct Aux a small integer ID, in order of first
	// appearance, so the sort below has a total order on aux values.
	auxIDs := map[Aux]int{}
	auxID := func(aux Aux) int {
		id, ok := auxIDs[aux]
		if !ok {
			id = len(auxIDs)
			auxIDs[aux] = id
		}
		return id
	}

	for _, b := range f.Blocks {
		// Find loads.
		// Reuse the loads slice and aux registry across blocks.
		loads = loads[:0]
		clear(auxIDs)
		for _, v := range b.Values {
			info := pairableLoads[v.Op]
			if info.width == 0 {
				continue // not pairable
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // not advisable
			}
			loads = append(loads, v)
		}
		if len(loads) < 2 {
			continue // need at least two candidates to form a pair
		}

		// Sort to put pairable loads together.
		// After sorting, candidates that can pair are adjacent:
		// same op/ptr/mem/aux, ordered by ascending offset.
		slices.SortFunc(loads, func(x, y *Value) int {
			// First sort by op, ptr, and memory arg.
			if x.Op != y.Op {
				return int(x.Op - y.Op)
			}
			if x.Args[0].ID != y.Args[0].ID {
				return int(x.Args[0].ID - y.Args[0].ID)
			}
			if x.Args[1].ID != y.Args[1].ID {
				return int(x.Args[1].ID - y.Args[1].ID)
			}
			// Then sort by aux. (nil first, then by aux ID)
			if x.Aux != nil {
				if y.Aux == nil {
					return 1
				}
				a, b := auxID(x.Aux), auxID(y.Aux)
				if a != b {
					return a - b
				}
			} else if y.Aux != nil {
				return -1
			}
			// Then sort by offset, low to high.
			return int(x.AuxInt - y.AuxInt)
		})

		// Look for pairable loads.
		// Scan adjacent sorted entries; each load joins at most one pair.
		for i := 0; i < len(loads)-1; i++ {
			x := loads[i]
			y := loads[i+1]
			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
				continue
			}
			if x.Aux != y.Aux {
				continue
			}
			// y must start exactly where x ends.
			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
				continue
			}

			// Commit point.

			// Make the 2-register load.
			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

			// Modify x to be (Select0 load). Similar for y.
			x.reset(OpSelect0)
			x.SetArgs1(load)
			y.reset(OpSelect1)
			y.SetArgs1(load)

			i++ // Skip y next time around the loop.
		}
	}
}
|
||
// pairStores rewrites pairs of stores within a block that access
// adjacent memory into a single 2-register store. It walks each block's
// store chain from last to first; for each pairable store it looks back
// a bounded distance for a mergeable earlier store, checking that any
// intervening stores cannot alias the slot being merged.
func pairStores(f *Func) {
	// last[id] will be true for values that end a block's store chain.
	last := f.Cache.allocBoolSlice(f.NumValues())
	defer f.Cache.freeBoolSlice(last)

	// prevStore returns the previous store in the
	// same block, or nil if there are none.
	prevStore := func(v *Value) *Value {
		if v.Op == OpInitMem || v.Op == OpPhi {
			return nil
		}
		m := v.MemoryArg()
		if m.Block != v.Block {
			return nil
		}
		return m
	}

	for _, b := range f.Blocks {
		// Find last store in block, so we can
		// walk the stores last to first.
		// Last to first helps ensure that the rewrites we
		// perform do not get in the way of subsequent rewrites.
		// First mark every memory-producing value as a candidate ...
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				last[v.ID] = true
			}
		}
		// ... then clear any value that some later store consumes,
		// leaving only the chain's final store marked.
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				if m := prevStore(v); m != nil {
					last[m.ID] = false
				}
			}
		}
		var lastMem *Value
		for _, v := range b.Values {
			if last[v.ID] {
				lastMem = v
				break
			}
		}

		// Check all stores, from last to first.
	memCheck:
		for v := lastMem; v != nil; v = prevStore(v) {
			info := pairableStores[v.Op]
			if info.width == 0 {
				continue // Not pairable.
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // Not advisable to pair.
			}
			ptr := v.Args[0]
			val := v.Args[1]
			mem := v.Args[2]
			off := v.AuxInt
			aux := v.Aux

			// Look for earlier store we can combine with.
			// lowerOk/higherOk track whether merging with a store at
			// off-width / off+width is still alias-safe.
			lowerOk := true
			higherOk := true
			count := 10 // max lookback distance
			for w := prevStore(v); w != nil; w = prevStore(w) {
				if w.Uses != 1 {
					// We can't combine stores if the earlier
					// store has any use besides the next one
					// in the store chain.
					// (Unless we could check the aliasing of
					// all those other uses.)
					continue memCheck
				}
				if w.Op == v.Op &&
					w.Args[0] == ptr &&
					w.Aux == aux &&
					(lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
					// This op is mergeable with v.

					// Commit point.

					// ptr val1 val2 mem
					args := []*Value{ptr, val, w.Args[1], mem}
					if w.AuxInt == off-info.width {
						// w stores the lower half; swap values and
						// use w's (lower) offset for the pair.
						args[1], args[2] = args[2], args[1]
						off -= info.width
					}
					v.reset(info.pair)
					v.AddArgs(args...)
					v.Aux = aux
					v.AuxInt = off
					v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)

					// Make w just a memory copy.
					wmem := w.MemoryArg()
					w.reset(OpCopy)
					w.SetArgs1(wmem)
					continue memCheck
				}
				if count--; count == 0 {
					// Only look back so far.
					// This keeps us in O(n) territory, and it
					// also prevents us from keeping values
					// in registers for too long (and thus
					// needing to spill them).
					continue memCheck
				}
				// We're now looking at a store w which is currently
				// between the store v that we're intending to merge into,
				// and the store we'll eventually find to merge with it.
				// Make sure this store doesn't alias with the one
				// we'll be moving.
				var width int64
				switch w.Op {
				case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
					width = 8
				case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
					width = 4
				case OpARM64MOVHstore, OpARM64MOVHstorezero:
					width = 2
				case OpARM64MOVBstore, OpARM64MOVBstorezero:
					width = 1
				case OpCopy:
					continue // this was a store we merged earlier
				default:
					// Can't reorder with any other memory operations.
					// (atomics, calls, ...)
					continue memCheck
				}

				// We only allow reordering with respect to other
				// writes to the same pointer and aux, so we can
				// compute the exact the aliasing relationship.
				if w.Args[0] != ptr || w.Aux != aux {
					continue memCheck
				}
				if overlap(w.AuxInt, width, off-info.width, info.width) {
					// Aliases with slot before v's location.
					lowerOk = false
				}
				if overlap(w.AuxInt, width, off+info.width, info.width) {
					// Aliases with slot after v's location.
					higherOk = false
				}
				if !higherOk && !lowerOk {
					continue memCheck
				}
			}
		}
	}
}
Oops, something went wrong.