diff --git a/NEWS.md b/NEWS.md index 7ccab87d92019..e2dc6990a08b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -47,7 +47,7 @@ Compiler/Runtime improvements * Julia-level SROA (Scalar Replacement of Aggregates) has been improved, i.e. allowing elimination of `getfield` call with constant global field ([#42355]), enabling elimination of mutable struct with uninitialized fields ([#43208]), improving performance ([#43232]), handling more nested `getfield` - calls ([#43239]). + calls ([#43239], [#43267]). * Abstract callsite can now be inlined or statically resolved as far as the callsite has a single matching method ([#43113]). diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl index 1df68da4e2b3d..a1d9735d7f52a 100644 --- a/base/compiler/ssair/passes.jl +++ b/base/compiler/ssair/passes.jl @@ -169,7 +169,7 @@ function collect_leaves(compact::IncrementalCompact, @nospecialize(val), @nospec end function simple_walk(compact::IncrementalCompact, @nospecialize(defssa#=::AnySSAValue=#), - callback = (@nospecialize(pi), @nospecialize(idx)) -> false) + callback = (@nospecialize(x), @nospecialize(idx)) -> false) while true if isa(defssa, OldSSAValue) if already_inserted(compact, defssa) @@ -337,10 +337,29 @@ struct LiftedValue end const LiftedLeaves = IdDict{Any, Union{Nothing,LiftedValue}} +# NOTE we use `IdSet{Int}` instead of `BitSet` for in these passes since they work on IR after inlining, +# which can be very large sometimes, and program counters in question are often very sparse +const SPCSet = IdSet{Int} + +mutable struct NestedLoads + maybe::Union{Nothing,SPCSet} + NestedLoads() = new(nothing) +end +function record_nested_load!(nested_loads::NestedLoads, pc::Int) + maybe = nested_loads.maybe + maybe === nothing && (maybe = nested_loads.maybe = SPCSet()) + push!(maybe::SPCSet, pc) +end +function is_nested_load(nested_loads::NestedLoads, pc::Int) + maybe = nested_loads.maybe + maybe === nothing && return false + return pc in maybe::SPCSet +end + # try to compute lifted values that can replace `getfield(x, field)` call # where `x` is an immutable struct that are defined at any of `leaves` -function lift_leaves(compact::IncrementalCompact, - @nospecialize(result_t), field::Int, leaves::Vector{Any}) +function lift_leaves!(compact::IncrementalCompact, leaves::Vector{Any}, + @nospecialize(result_t), field::Int, nested_loads::NestedLoads) # For every leaf, the lifted value lifted_leaves = LiftedLeaves() maybe_undef = false @@ -390,11 +409,19 @@ function lift_leaves(compact::IncrementalCompact, ocleaf = simple_walk(compact, ocleaf) end ocdef, _ = walk_to_def(compact, ocleaf) - if isexpr(ocdef, :new_opaque_closure) && isa(field, Int) && 1 ≤ field ≤ length(ocdef.args)-5 + if isexpr(ocdef, :new_opaque_closure) && 1 ≤ field ≤ length(ocdef.args)-5 lift_arg!(compact, leaf, cache_key, ocdef, 5+field, lifted_leaves) continue end return nothing + elseif is_known_call(def, getfield, compact) + if isa(leaf, SSAValue) + struct_typ = unwrap_unionall(widenconst(argextype(def.args[2], compact))) + if ismutabletype(struct_typ) + record_nested_load!(nested_loads, leaf.id) + end + end + return nothing else typ = argextype(leaf, compact) if !isa(typ, Const) @@ -588,7 +615,7 @@ function perform_lifting!(compact::IncrementalCompact, end val = lifted_val.x if isa(val, AnySSAValue) - callback = (@nospecialize(pi), @nospecialize(idx)) -> true + callback = (@nospecialize(x), @nospecialize(idx)) -> true val = simple_walk(compact, val, callback) end push!(new_node.values, val) @@ -619,10 +646,6 @@ function perform_lifting!(compact::IncrementalCompact, return stmt_val # N.B. should never happen end -# NOTE we use `IdSet{Int}` instead of `BitSet` for in these passes since they work on IR after inlining, -# which can be very large sometimes, and program counters in question are often very sparse -const SPCSet = IdSet{Int} - """ sroa_pass!(ir::IRCode) -> newir::IRCode @@ -641,10 +664,11 @@ its argument). In a case when all usages are fully eliminated, `struct` allocation may also be erased as a result of succeeding dead code elimination. """ -function sroa_pass!(ir::IRCode) +function sroa_pass!(ir::IRCode, optional_opts::Bool = true) compact = IncrementalCompact(ir) defuses = nothing # will be initialized once we encounter mutability in order to reduce dynamic allocations lifting_cache = IdDict{Pair{AnySSAValue, Any}, AnySSAValue}() + nested_loads = NestedLoads() # tracks nested `getfield(getfield(...)::Mutable, ...)::Immutable` for ((_, idx), stmt) in compact # check whether this statement is `getfield` / `setfield!` (or other "interesting" statement) isa(stmt, Expr) || continue @@ -672,7 +696,7 @@ function sroa_pass!(ir::IRCode) preserved_arg = stmt.args[pidx] isa(preserved_arg, SSAValue) || continue let intermediaries = SPCSet() - callback = function (@nospecialize(pi), @nospecialize(ssa)) + callback = function (@nospecialize(x), @nospecialize(ssa)) push!(intermediaries, ssa.id) return false end @@ -700,7 +724,9 @@ function sroa_pass!(ir::IRCode) if defuses === nothing defuses = IdDict{Int, Tuple{SPCSet, SSADefUse}}() end - mid, defuse = get!(defuses, defidx, (SPCSet(), SSADefUse())) + mid, defuse = get!(defuses, defidx) do + SPCSet(), SSADefUse() + end push!(defuse.ccall_preserve_uses, idx) union!(mid, intermediaries) end @@ -710,16 +736,17 @@ function sroa_pass!(ir::IRCode) compact[idx] = form_new_preserves(stmt, preserved, new_preserves) end continue - # TODO: This isn't the best place to put these - elseif is_known_call(stmt, typeassert, compact) - canonicalize_typeassert!(compact, idx, stmt) - continue - elseif is_known_call(stmt, (===), compact) - lift_comparison!(compact, idx, stmt, lifting_cache) - continue - # elseif is_known_call(stmt, isa, compact) - # TODO do a similar optimization as `lift_comparison!` for `===` else + if optional_opts + # TODO: This isn't the best place to put these + if is_known_call(stmt, typeassert, compact) + canonicalize_typeassert!(compact, idx, stmt) + elseif is_known_call(stmt, (===), compact) + lift_comparison!(compact, idx, stmt, lifting_cache) + # elseif is_known_call(stmt, isa, compact) + # TODO do a similar optimization as `lift_comparison!` for `===` + end + end continue end @@ -745,7 +772,7 @@ function sroa_pass!(ir::IRCode) if ismutabletype(struct_typ) isa(val, SSAValue) || continue let intermediaries = SPCSet() - callback = function (@nospecialize(pi), @nospecialize(ssa)) + callback = function (@nospecialize(x), @nospecialize(ssa)) push!(intermediaries, ssa.id) return false end @@ -755,7 +782,9 @@ function sroa_pass!(ir::IRCode) if defuses === nothing defuses = IdDict{Int, Tuple{SPCSet, SSADefUse}}() end - mid, defuse = get!(defuses, def.id, (SPCSet(), SSADefUse())) + mid, defuse = get!(defuses, def.id) do + SPCSet(), SSADefUse() + end if is_setfield push!(defuse.defs, idx) else @@ -777,7 +806,7 @@ function sroa_pass!(ir::IRCode) isempty(leaves) && continue result_t = argextype(SSAValue(idx), compact) - lifted_result = lift_leaves(compact, result_t, field, leaves) + lifted_result = lift_leaves!(compact, leaves, result_t, field, nested_loads) lifted_result === nothing && continue lifted_leaves, any_undef = lifted_result @@ -813,18 +842,21 @@ function sroa_pass!(ir::IRCode) used_ssas = copy(compact.used_ssas) simple_dce!(compact, (x::SSAValue) -> used_ssas[x.id] -= 1) ir = complete(compact) - sroa_mutables!(ir, defuses, used_ssas) - return ir + return sroa_mutables!(ir, defuses, used_ssas, nested_loads) else simple_dce!(compact) return complete(compact) end end -function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse}}, used_ssas::Vector{Int}) - # initialization of domtree is delayed to avoid the expensive computation in many cases - local domtree = nothing - for (idx, (intermediaries, defuse)) in defuses +function sroa_mutables!(ir::IRCode, + defuses::IdDict{Int, Tuple{SPCSet, SSADefUse}}, used_ssas::Vector{Int}, + nested_loads::NestedLoads) + domtree = nothing # initialization of domtree is delayed to avoid the expensive computation in many cases + nested_mloads = NestedLoads() # tracks nested `getfield(getfield(...)::Mutable, ...)::Mutable` + any_eliminated = false + # NOTE eliminate from innermost definitions, so that we can track elimination of nested `getfield` + for (idx, (intermediaries, defuse)) in sort!(collect(defuses); by=first, rev=true) intermediaries = collect(intermediaries) # Check if there are any uses we did not account for. If so, the variable # escapes and we cannot eliminate the allocation. This works, because we're guaranteed @@ -839,7 +871,19 @@ function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse nleaves == nuses_total || continue # Find the type for this allocation defexpr = ir[SSAValue(idx)] - isexpr(defexpr, :new) || continue + isa(defexpr, Expr) || continue + if !isexpr(defexpr, :new) + if is_known_call(defexpr, getfield, ir) + val = defexpr.args[2] + if isa(val, SSAValue) + struct_typ = unwrap_unionall(widenconst(argextype(val, ir))) + if ismutabletype(struct_typ) + record_nested_load!(nested_mloads, idx) + end + end + end + continue + end newidx = idx typ = ir.stmts[newidx][:type] if isa(typ, UnionAll) @@ -919,6 +963,10 @@ function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse # Now go through all uses and rewrite them for stmt in du.uses ir[SSAValue(stmt)] = compute_value_for_use(ir, domtree, allblocks, du, phinodes, fidx, stmt) + if !any_eliminated + any_eliminated |= (is_nested_load(nested_loads, stmt) || + is_nested_load(nested_mloads, stmt)) + end end if !isbitstype(ftyp) if preserve_uses !== nothing @@ -955,6 +1003,7 @@ function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse @label skip end + return any_eliminated ? sroa_pass!(compact!(ir), false) : ir end function form_new_preserves(origex::Expr, intermediates::Vector{Int}, new_preserves::Vector{Any}) diff --git a/test/compiler/irpasses.jl b/test/compiler/irpasses.jl index 124f6317baeed..3c6443a8b2286 100644 --- a/test/compiler/irpasses.jl +++ b/test/compiler/irpasses.jl @@ -90,6 +90,9 @@ iscall(pred::Function, @nospecialize(x)) = Meta.isexpr(x, :call) && pred(x.args[ struct ImmutableXYZ; x; y; z; end mutable struct MutableXYZ; x; y; z; end +struct ImmutableOuter{T}; x::T; y::T; z::T; end +mutable struct MutableOuter{T}; x::T; y::T; z::T; end + # should optimize away very basic cases let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) @@ -198,9 +201,8 @@ let src = code_typed1((Bool,Bool,Any,Any)) do c1, c2, x, y @test any(isnew, src.code) end -# should include a simple alias analysis -struct ImmutableOuter{T}; x::T; y::T; z::T; end -mutable struct MutableOuter{T}; x::T; y::T; z::T; end +# alias analysis +# -------------- let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) outer = ImmutableOuter(xyz, xyz, xyz) @@ -227,9 +229,11 @@ let src = code_typed1((Any,Any,Any)) do x, y, z x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=y=# Core.Argument(4)] end end - -# FIXME our analysis isn't yet so powerful at this moment: may be unable to handle nested objects well -# OK: mutable(immutable(...)) case +# FIXME? in order to handle nested mutable `getfield` calls, we run SROA iteratively until +# any nested mutable `getfield` calls become no longer eliminatable: +# it's probably not the most efficient option and we may want to introduce some sort of +# alias analysis and eliminates all the loads at once. +# mutable(immutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) t = (xyz,) @@ -260,21 +264,48 @@ let # this is a simple end to end test case, which demonstrates allocation elimi # compiled code for `simple_sroa`, otherwise everything can be folded even without SROA @test @allocated(simple_sroa(s)) == 0 end -# FIXME: immutable(mutable(...)) case +# immutable(mutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) outer = MutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test_broken !any(isnew, src.code) + @test !any(isnew, src.code) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=y=# Core.Argument(4)] + end end -# FIXME: mutable(mutable(...)) case +# mutable(mutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) outer = MutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test_broken !any(isnew, src.code) + @test !any(isnew, src.code) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=y=# Core.Argument(4)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + inner = MutableOuter(xyz, xyz, xyz) + outer = MutableOuter(inner, inner, inner) + outer.x.x.x, outer.y.y.y, outer.z.z.z + end + @test !any(isnew, src.code) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=y=# Core.Argument(4)] + end +end +let # NOTE `sroa_mutables!` eliminate from innermost definitions, so that it should be able + # to fully eliminate this insanely nested example + src = code_typed1((Int,)) do x + (Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref((x))))))))))))[][][][][][][][][][] + end + @test !any(isnew, src.code) end # should work nicely with inlining to optimize away a complicated case