cmd/compile/internal/ssa: strengthen phiopt pass

The current phiopt pass just transforms the following code
  x := false
  if b { x = true}
into
  x = b

But we find code in runtime.atoi like this:
  neg := false
  if s[0] == '-' {
    neg = true
    s = s[1:]
  }

The current phiopt pass does not covert it into code like:
  neg := s[0] == '-'
  if neg { s = s[1:] }

Therefore, this patch strengthens the phiopt pass so that the
boolean Phi value "neg" can be replaced with a copy of control
value "s[0] == '-'", thereby using "cmp+cset" instead of a branch.

But in some cases even replacing the boolean Phis cannot eliminate
this branch. In the following case, this patch replaces "d" with a
copy of "a<0", but the regalloc pass will insert the "Load {c}"
value into an empty block to split the live ranges, which causes
the branch to not be eliminated.

For example:
  func test(a, b, c int) (bool, int) {
    d := false
    if (a<0) {
      if (b<0) {
        c = c+1
      }
      d = true
    }
    return d, c
  }

The optimized assembly code:
  MOVD "".a(FP), R0
  TBZ $63, R0, 48
  MOVD "".c+16(FP), R1
  ADD $1, R1, R2
  MOVD "".b+8(FP), R3
  CMP ZR, R3
  CSEL LT, R2, R1, R1
  CMP ZR, R0
  CSET LT, R0
  MOVB R0, "".~r3+24(FP)
  MOVD R1, "".~r4+32(FP)
  RET (R30)
  MOVD "".c+16(FP), R1
  JMP 28

The benchmark:

name          old time/op            new time/op            delta
pkg:cmd/compile/internal/ssa goos:linux goarch:arm64
PhioptPass  117783.250000ns +- 1%  117219.111111ns +- 1%   ~  (p=0.074  n=8+9)

Statistical data from compilecmp tool:

compilecmp local/master -> HEAD
local/master (a826f7dc45): debug/dwarf: support DW_FORM_rnglistx aka formRnglistx
HEAD (e57e003c10): cmd/compile/internal/ssa: strengthen phiopt pass

benchstat -geomean  /tmp/2516644532 /tmp/1075915815
completed 50 of 50, estimated time remaining 0s (ETA 7:10PM)
name                      old time/op       new time/op       delta
Template                        554ms _ 3%        553ms _ 3%    ~     (p=0.986 n=49+48)
Unicode                         252ms _ 4%        249ms _ 4%  -1.33%  (p=0.002 n=47+49)
GoTypes                         3.16s _ 3%        3.18s _ 3%  +0.77%  (p=0.022 n=44+48)
Compiler                        257ms _ 4%        258ms _ 4%    ~     (p=0.121 n=50+49)
SSA                             24.2s _ 4%        24.2s _ 5%    ~     (p=0.694 n=49+50)
Flate                           338ms _ 4%        338ms _ 4%    ~     (p=0.592 n=43+46)
GoParser                        506ms _ 3%        507ms _ 3%    ~     (p=0.942 n=49+50)
Reflect                         1.37s _ 4%        1.37s _ 5%    ~     (p=0.408 n=50+50)
Tar                             486ms _ 3%        487ms _ 4%    ~     (p=0.911 n=47+50)
XML                             619ms _ 2%        619ms _ 3%    ~     (p=0.368 n=46+48)
LinkCompiler                    1.29s _31%        1.32s _23%    ~     (p=0.306 n=49+44)
ExternalLinkCompiler            3.39s _10%        3.36s _ 6%    ~     (p=0.311 n=48+46)
LinkWithoutDebugCompiler        846ms _37%        793ms _24%  -6.29%  (p=0.040 n=50+49)
[Geo mean]                      974ms             971ms       -0.36%

name                      old user-time/op  new user-time/op  delta
Template                        910ms _12%        893ms _13%    ~     (p=0.098 n=49+49)
Unicode                         495ms _28%        492ms _18%    ~     (p=0.562 n=50+46)
GoTypes                         4.42s _15%        4.39s _13%    ~     (p=0.684 n=49+50)
Compiler                        419ms _22%        422ms _16%    ~     (p=0.579 n=48+50)
SSA                             36.5s _ 7%        36.6s _ 8%    ~     (p=0.465 n=50+47)
Flate                           521ms _21%        523ms _16%    ~     (p=0.889 n=50+47)
GoParser                        810ms _12%        792ms _15%    ~     (p=0.149 n=50+50)
Reflect                         1.98s _13%        2.02s _13%    ~     (p=0.144 n=47+50)
Tar                             826ms _15%        806ms _19%    ~     (p=0.115 n=49+49)
XML                             988ms _14%       1003ms _14%    ~     (p=0.179 n=50+50)
LinkCompiler                    1.79s _ 8%        1.84s _11%  +2.81%  (p=0.001 n=49+49)
ExternalLinkCompiler            3.69s _ 4%        3.71s _ 3%    ~     (p=0.261 n=50+50)
LinkWithoutDebugCompiler        838ms _10%        827ms _11%    ~     (p=0.323 n=50+48)
[Geo mean]                      1.44s             1.44s       -0.05%

name                      old alloc/op      new alloc/op      delta
Template                       39.0MB _ 1%       39.0MB _ 1%    ~     (p=0.445 n=50+49)
Unicode                        28.5MB _ 0%       28.5MB _ 0%    ~     (p=0.460 n=50+50)
GoTypes                         169MB _ 1%        169MB _ 1%    ~     (p=0.092 n=48+50)
Compiler                       23.4MB _ 1%       23.4MB _ 1%  -0.19%  (p=0.032 n=50+49)
SSA                            1.54GB _ 0%       1.55GB _ 1%  +0.14%  (p=0.001 n=50+50)
Flate                          23.8MB _ 1%       23.8MB _ 2%    ~     (p=0.702 n=49+49)
GoParser                       35.4MB _ 1%       35.4MB _ 1%    ~     (p=0.786 n=50+50)
Reflect                        85.3MB _ 1%       85.3MB _ 1%    ~     (p=0.298 n=50+50)
Tar                            34.6MB _ 2%       34.6MB _ 2%    ~     (p=0.683 n=50+50)
XML                            44.5MB _ 3%       44.0MB _ 2%  -1.05%  (p=0.000 n=50+46)
LinkCompiler                    136MB _ 0%        136MB _ 0%  +0.01%  (p=0.005 n=50+50)
ExternalLinkCompiler            128MB _ 0%        128MB _ 0%    ~     (p=0.179 n=50+50)
LinkWithoutDebugCompiler       84.3MB _ 0%       84.3MB _ 0%  +0.01%  (p=0.006 n=50+50)
[Geo mean]                     70.7MB            70.6MB       -0.07%

name                      old allocs/op     new allocs/op     delta
Template                         410k _ 0%         410k _ 0%    ~     (p=0.606 n=48+49)
Unicode                          310k _ 0%         310k _ 0%    ~     (p=0.674 n=50+50)
GoTypes                         1.81M _ 0%        1.81M _ 0%    ~     (p=0.674 n=50+50)
Compiler                         202k _ 0%         202k _ 0%  +0.02%  (p=0.046 n=50+50)
SSA                             16.3M _ 0%        16.3M _ 0%  +0.10%  (p=0.000 n=50+50)
Flate                            244k _ 0%         244k _ 0%    ~     (p=0.834 n=49+50)
GoParser                         380k _ 0%         380k _ 0%    ~     (p=0.410 n=50+50)
Reflect                         1.08M _ 0%        1.08M _ 0%    ~     (p=0.782 n=48+50)
Tar                              368k _ 0%         368k _ 0%    ~     (p=0.585 n=50+49)
XML                              453k _ 0%         453k _ 0%  -0.01%  (p=0.025 n=49+49)
LinkCompiler                     713k _ 0%         713k _ 0%  +0.01%  (p=0.044 n=50+50)
ExternalLinkCompiler             794k _ 0%         794k _ 0%  +0.01%  (p=0.000 n=50+49)
LinkWithoutDebugCompiler         251k _ 0%         251k _ 0%    ~     (p=0.092 n=47+50)
[Geo mean]                       615k              615k       +0.01%

name                      old maxRSS/op     new maxRSS/op     delta
Template                        37.0M _ 4%        37.2M _ 3%    ~     (p=0.062 n=48+48)
Unicode                         36.9M _ 5%        37.3M _ 4%  +1.10%  (p=0.021 n=50+47)
GoTypes                         94.3M _ 3%        94.9M _ 4%  +0.69%  (p=0.022 n=45+46)
Compiler                        33.4M _ 3%        33.4M _ 5%    ~     (p=0.964 n=49+50)
SSA                              741M _ 3%         738M _ 3%    ~     (p=0.164 n=50+50)
Flate                           28.5M _ 6%        28.8M _ 4%  +1.07%  (p=0.009 n=50+49)
GoParser                        35.0M _ 3%        35.3M _ 4%  +0.83%  (p=0.010 n=50+48)
Reflect                         57.2M _ 6%        57.1M _ 4%    ~     (p=0.815 n=50+49)
Tar                             34.9M _ 3%        35.0M _ 3%    ~     (p=0.134 n=49+48)
XML                             39.5M _ 5%        40.0M _ 3%  +1.35%  (p=0.001 n=50+48)
LinkCompiler                     220M _ 2%         220M _ 2%    ~     (p=0.547 n=49+48)
ExternalLinkCompiler             235M _ 2%         236M _ 2%    ~     (p=0.538 n=47+44)
LinkWithoutDebugCompiler         179M _ 1%         179M _ 1%    ~     (p=0.775 n=50+50)
[Geo mean]                      74.9M             75.2M       +0.43%

name                      old text-bytes    new text-bytes    delta
HelloSize                       784kB _ 0%        784kB _ 0%  +0.01%  (p=0.000 n=50+50)

name                      old data-bytes    new data-bytes    delta
HelloSize                      13.1kB _ 0%       13.1kB _ 0%    ~     (all equal)

name                      old bss-bytes     new bss-bytes     delta
HelloSize                       206kB _ 0%        206kB _ 0%    ~     (all equal)

name                      old exe-bytes     new exe-bytes     delta
HelloSize                      1.28MB _ 0%       1.28MB _ 0%  +0.00%  (p=0.000 n=50+50)

file      before    after     _       %
addr2line 4006300   4004484   -1816   -0.045%
api       5029956   5029324   -632    -0.013%
asm       4936311   4939423   +3112   +0.063%
buildid   2595059   2595291   +232    +0.009%
cgo       4401029   4397333   -3696   -0.084%
compile   22246677  22246863  +186    +0.001%
cover     4443825   4443065   -760    -0.017%
dist      3366078   3365838   -240    -0.007%
doc       3776391   3776615   +224    +0.006%
fix       3218800   3218648   -152    -0.005%
link      6365321   6365345   +24     +0.000%
nm        3923625   3923857   +232    +0.006%
objdump   4295569   4295041   -528    -0.012%
pack      2390745   2389217   -1528   -0.064%
pprof     12870094  12866942  -3152   -0.024%
test2json 2587265   2587073   -192    -0.007%
trace     9612629   9613981   +1352   +0.014%
vet       6791008   6792072   +1064   +0.016%
total     106856682 106850412 -6270   -0.006%

Update #37608

Change-Id: Ic6206b22fd1faf570be9fd3c2511aa6c4ce38cdb
Reviewed-on: https://go-review.googlesource.com/c/go/+/252937
Trust: fannie zhang <Fannie.Zhang@arm.com>
Run-TryBot: fannie zhang <Fannie.Zhang@arm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
fanzha02 2020-07-28 10:29:06 +08:00 коммит произвёл fannie zhang
Родитель 23ffb5b9ae
Коммит ba6bd967d2
4 изменённых файлов: 211 добавлений и 2 удалений

Просмотреть файл

@ -0,0 +1,32 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
"math/rand"
"testing"
)
var d int
//go:noinline
func fn(a, b int) bool {
c := false
if a > 0 {
if b < 0 {
d = d + 1
}
c = true
}
return c
}
func BenchmarkPhioptPass(b *testing.B) {
for i := 0; i < b.N; i++ {
a := rand.Perm(i/10 + 10)
for i := 1; i < len(a)/2; i++ {
fn(a[i]-a[i-1], a[i+len(a)/2-2]-a[i+len(a)/2-1])
}
}
}

Просмотреть файл

@ -46,7 +46,6 @@ func phiopt(f *Func) {
continue continue
} }
// b0 is the if block giving the boolean value. // b0 is the if block giving the boolean value.
// reverse is the predecessor from which the truth value comes. // reverse is the predecessor from which the truth value comes.
var reverse int var reverse int
if b0.Succs[0].b == pb0 && b0.Succs[1].b == pb1 { if b0.Succs[0].b == pb0 && b0.Succs[1].b == pb1 {
@ -120,6 +119,141 @@ func phiopt(f *Func) {
} }
} }
} }
// strengthen phi optimization.
// Main use case is to transform:
// x := false
// if c {
// x = true
// ...
// }
// into
// x := c
// if x { ... }
//
// For example, in SSA code a case appears as
// b0
// If c -> b, sb0
// sb0
// If d -> sd0, sd1
// sd1
// ...
// sd0
// Plain -> b
// b
// x = (OpPhi (ConstBool [true]) (ConstBool [false]))
//
// In this case we can also replace x with a copy of c.
//
// The optimization idea:
// 1. block b has a phi value x, x = OpPhi (ConstBool [true]) (ConstBool [false]),
// and len(b.Preds) is equal to 2.
// 2. find the common dominator(b0) of the predecessors(pb0, pb1) of block b, and the
// dominator(b0) is a If block.
// Special case: one of the predecessors(pb0 or pb1) is the dominator(b0).
// 3. the successors(sb0, sb1) of the dominator need to dominate the predecessors(pb0, pb1)
// of block b respectively.
// 4. replace this boolean Phi based on dominator block.
//
// b0(pb0) b0(pb1) b0
// | \ / | / \
// | sb1 sb0 | sb0 sb1
// | ... ... | ... ...
// | pb1 pb0 | pb0 pb1
// | / \ | \ /
// b b b
//
var lca *lcaRange
for _, b := range f.Blocks {
if len(b.Preds) != 2 || len(b.Values) == 0 {
// TODO: handle more than 2 predecessors, e.g. a || b || c.
continue
}
for _, v := range b.Values {
// find a phi value v = OpPhi (ConstBool [true]) (ConstBool [false]).
// TODO: v = OpPhi (ConstBool [true]) (Arg <bool> {value})
if v.Op != OpPhi {
continue
}
if v.Args[0].Op != OpConstBool || v.Args[1].Op != OpConstBool {
continue
}
if v.Args[0].AuxInt == v.Args[1].AuxInt {
continue
}
pb0 := b.Preds[0].b
pb1 := b.Preds[1].b
if pb0.Kind == BlockIf && pb0 == sdom.Parent(b) {
// special case: pb0 is the dominator block b0.
// b0(pb0)
// | \
// | sb1
// | ...
// | pb1
// | /
// b
// if another successor sb1 of b0(pb0) dominates pb1, do replace.
ei := b.Preds[0].i
sb1 := pb0.Succs[1-ei].b
if sdom.IsAncestorEq(sb1, pb1) {
convertPhi(pb0, v, ei)
break
}
} else if pb1.Kind == BlockIf && pb1 == sdom.Parent(b) {
// special case: pb1 is the dominator block b0.
// b0(pb1)
// / |
// sb0 |
// ... |
// pb0 |
// \ |
// b
// if another successor sb0 of b0(pb0) dominates pb0, do replace.
ei := b.Preds[1].i
sb0 := pb1.Succs[1-ei].b
if sdom.IsAncestorEq(sb0, pb0) {
convertPhi(pb1, v, ei-1)
break
}
} else {
// b0
// / \
// sb0 sb1
// ... ...
// pb0 pb1
// \ /
// b
//
// Build data structure for fast least-common-ancestor queries.
if lca == nil {
lca = makeLCArange(f)
}
b0 := lca.find(pb0, pb1)
if b0.Kind != BlockIf {
break
}
sb0 := b0.Succs[0].b
sb1 := b0.Succs[1].b
var reverse int
if sdom.IsAncestorEq(sb0, pb0) && sdom.IsAncestorEq(sb1, pb1) {
reverse = 0
} else if sdom.IsAncestorEq(sb1, pb0) && sdom.IsAncestorEq(sb0, pb1) {
reverse = 1
} else {
break
}
if len(sb0.Preds) != 1 || len(sb1.Preds) != 1 {
// we can not replace phi value x in the following case.
// if gp == nil || sp < lo { x = true}
// if a || b { x = true }
// so the if statement can only have one condition.
break
}
convertPhi(b0, v, reverse)
}
}
}
} }
func phioptint(v *Value, b0 *Block, reverse int) { func phioptint(v *Value, b0 *Block, reverse int) {
@ -174,3 +308,16 @@ func phioptint(v *Value, b0 *Block, reverse int) {
f.Warnl(v.Block.Pos, "converted OpPhi bool -> int%d", v.Type.Size()*8) f.Warnl(v.Block.Pos, "converted OpPhi bool -> int%d", v.Type.Size()*8)
} }
} }
// b is the If block giving the boolean value.
// v is the phi value v = (OpPhi (ConstBool [true]) (ConstBool [false])).
// reverse is the predecessor from which the truth value comes.
func convertPhi(b *Block, v *Value, reverse int) {
f := b.Func
ops := [2]Op{OpNot, OpCopy}
v.reset(ops[v.Args[reverse].AuxInt])
v.AddArg(b.Controls[0])
if f.pass.debug > 0 {
f.Warnl(b.Pos, "converted OpPhi to %v", v.Op)
}
}

Просмотреть файл

@ -178,6 +178,12 @@ func (t SparseTree) Child(x *Block) *Block {
return t[x.ID].child return t[x.ID].child
} }
// Parent returns the parent of x in the dominator tree, or
// nil if x is the function's entry.
func (t SparseTree) Parent(x *Block) *Block {
return t[x.ID].parent
}
// isAncestorEq reports whether x is an ancestor of or equal to y. // isAncestorEq reports whether x is an ancestor of or equal to y.
func (t SparseTree) IsAncestorEq(x, y *Block) bool { func (t SparseTree) IsAncestorEq(x, y *Block) bool {
if x == y { if x == y {

Просмотреть файл

@ -1,4 +1,4 @@
// +build amd64 s390x // +build amd64 s390x arm64
// errorcheck -0 -d=ssa/phiopt/debug=3 // errorcheck -0 -d=ssa/phiopt/debug=3
// Copyright 2016 The Go Authors. All rights reserved. // Copyright 2016 The Go Authors. All rights reserved.
@ -104,5 +104,29 @@ func f7and(a bool, b bool) bool {
return a && b // ERROR "converted OpPhi to AndB$" return a && b // ERROR "converted OpPhi to AndB$"
} }
//go:noinline
func f8(s string) (string, bool) {
neg := false
if s[0] == '-' { // ERROR "converted OpPhi to Copy$"
neg = true
s = s[1:]
}
return s, neg
}
var d int
//go:noinline
func f9(a, b int) bool {
c := false
if a < 0 { // ERROR "converted OpPhi to Copy$"
if b < 0 {
d = d + 1
}
c = true
}
return c
}
func main() { func main() {
} }