Stream: helpdesk (published)

Topic: ✔ Benchmarking function with `Threads.@threads`


view this post on Zulip Dale Black (Jul 18 2021 at 01:48):

I have been working on a parallelized function that uses Threads.@threads. The function works perfectly fine on its own, but when I try to benchmark the function I get this error. Is there anything I need to take into account when using BenchmarkTools.jl for threaded functions? I also implement a @views macro which might be important to mention?

Normal:

sedt(x, dt, v, z, nthreads)
5×4 Matrix{Float32}:
 1.0  1.0  5.0  10.0
 0.0  0.0  2.0   5.0
 1.0  1.0  1.0   2.0
 1.0  4.0  2.0   1.0
 1.0  9.0  5.0   2.0

Timing:

@benchmark sedt($x, $dt, $v, $z, $nthreads)
TaskFailedException

    nested task error: BoundsError: attempt to access 4-element view(::Matrix{Int64}, 5, :) with eltype Int64 at index [5]
    Stacktrace:
     [1] throw_boundserror(A::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, I::Tuple{Int64})
       @ Base ./abstractarray.jl:651
     [2] checkbounds
       @ ./abstractarray.jl:616 [inlined]
     [3] getindex
       @ ./subarray.jl:302 [inlined]
     [4] squared_euclidean_distance_transform(f::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, dt::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, v::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, z::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true})
       @ DistanceTransforms ~/.julia/packages/DistanceTransforms/1wk55/src/squared_euclidean_distance_transform.jl:36
     [5] macro expansion
       @ ./In[121]:7 [inlined]
     [6] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})(onethread::Bool)
       @ Main ./threadingconstructs.jl:81
     [7] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})()
       @ Main ./threadingconstructs.jl:48

Stacktrace:
  [1] wait
    @ ./task.jl:322 [inlined]
  [2] threading_run(func::Function)
    @ Base.Threads ./threadingconstructs.jl:34
  [3] macro expansion
    @ ./threadingconstructs.jl:93 [inlined]
  [4] sedt(img::Matrix{Int64}, dt::Matrix{Float32}, v::Matrix{Int64}, z::Matrix{Float32}, nthreads::Int64)
    @ Main ./In[121]:6
  [5] var"##core#576"(x#571::Matrix{Int64}, dt#572::Matrix{Float32}, v#573::Matrix{Int64}, z#574::Matrix{Float32}, nthreads#575::Int64)
    @ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:479
  [6] var"##sample#577"(__params::BenchmarkTools.Parameters)
    @ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:485
  [7] _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:98
  [8] #invokelatest#2
    @ ./essentials.jl:710 [inlined]
  [9] #run_result#45
    @ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:33 [inlined]
 [10] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:116
 [11] #warmup#54
    @ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168 [inlined]
 [12] warmup(item::BenchmarkTools.Benchmark)
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168
 [13] top-level scope
    @ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:387
 [14] eval
    @ ./boot.jl:360 [inlined]
 [15] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1094

view this post on Zulip chriselrod (Jul 18 2021 at 06:15):

Do you have a minimal example?

view this post on Zulip Dale Black (Jul 18 2021 at 14:13):

function squared_euclidean_distance_transform(f::AbstractArray{T,1}, dt, v, z) where {T}
    n = length(f)
    k = 1
    z[1] = -1.0f12
    z[2] = 1.0f12

    # Lower envelope operation
    for q in 2:n
        while true
            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
            if s  z[k]
                k -= 1
            else
                k += 1
                v[k] = q
                z[k] = s
                z[k + 1] = 1.0f12
                break
            end
        end
    end

    # Distance transform operation
    k = 1
    for q in 1:n
        while z[k + 1] < q
            k = k + 1
        end
        dt[q] = (q - v[k])^2 + f[v[k]]
    end
    return dt
end

function sedt(img::AbstractArray{T,2}, dt, v, z, nthreads) where {T}
    if nthreads  1
        squared_euclidean_distance_transform(img, dt, v, z)
    else
        rows, columns = size(img)
        Threads.@threads for x in 1:rows
            @views squared_euclidean_distance_transform(img[x, :], dt[x, :], v[x, :], z[x, :])
        end

        Threads.@threads for y in 1:columns
            @views squared_euclidean_distance_transform(img[:, y], dt[:, y], v[:, y], z[:, y])
        end
        return dt
    end
end

x = Array([
    1 0 1 1
    0 0 1 1
    1 1 1 1
    1 0 0 1
    1 1 1 1
])
dt = Array{Float32}(undef, size(x))
v = ones(Int64, size(x))
z = zeros(Float32, size(x) .+ 1)
nthreads = Threads.nthreads();

sedt(x, dt, v, z, nthreads)

# Returns
5×4 Matrix{Float32}:
 1.0  1.0  5.0  10.0
 0.0  0.0  2.0   5.0
 1.0  1.0  1.0   2.0
 1.0  4.0  2.0   1.0
 1.0  9.0  5.0   2.0

@benchmark sedt($x, $dt, $v, $z, $nthreads)

# Returns
TaskFailedException

    nested task error: BoundsError: attempt to access 4-element view(::Matrix{Int64}, 5, :) with eltype Int64 at index [5]
    Stacktrace:
     [1] throw_boundserror(A::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, I::Tuple{Int64})
       @ Base ./abstractarray.jl:651
     [2] checkbounds
       @ ./abstractarray.jl:616 [inlined]
     [3] getindex
       @ ./subarray.jl:302 [inlined]
     [4] squared_euclidean_distance_transform(f::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, dt::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, v::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, z::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true})
       @ DistanceTransforms ~/.julia/packages/DistanceTransforms/1wk55/src/squared_euclidean_distance_transform.jl:36
     [5] macro expansion
       @ ./In[121]:7 [inlined]
     [6] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})(onethread::Bool)
       @ Main ./threadingconstructs.jl:81
     [7] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})()
       @ Main ./threadingconstructs.jl:48

Stacktrace:
  [1] wait
    @ ./task.jl:322 [inlined]
  [2] threading_run(func::Function)
    @ Base.Threads ./threadingconstructs.jl:34
  [3] macro expansion
    @ ./threadingconstructs.jl:93 [inlined]
  [4] sedt(img::Matrix{Int64}, dt::Matrix{Float32}, v::Matrix{Int64}, z::Matrix{Float32}, nthreads::Int64)
    @ Main ./In[121]:6
  [5] var"##core#576"(x#571::Matrix{Int64}, dt#572::Matrix{Float32}, v#573::Matrix{Int64}, z#574::Matrix{Float32}, nthreads#575::Int64)
    @ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:479
  [6] var"##sample#577"(__params::BenchmarkTools.Parameters)
    @ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:485
  [7] _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:98
  [8] #invokelatest#2
    @ ./essentials.jl:710 [inlined]
  [9] #run_result#45
    @ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:33 [inlined]
 [10] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:116
 [11] #warmup#54
    @ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168 [inlined]
 [12] warmup(item::BenchmarkTools.Benchmark)
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168
 [13] top-level scope
    @ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:387
 [14] eval
    @ ./boot.jl:360 [inlined]
 [15] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1094

view this post on Zulip Kwaku Oskin (Jul 18 2021 at 14:20):

You probably have race conditions on z

view this post on Zulip Kwaku Oskin (Jul 18 2021 at 14:23):

At least it should be reset between calculations, shouldn't it?

view this post on Zulip Kwaku Oskin (Jul 18 2021 at 14:26):

Ah, maybe not.

view this post on Zulip Jameson Nash (Jul 21 2021 at 01:42):

Did you swap rows and columns?

view this post on Zulip Dale Black (Jul 21 2021 at 20:25):

Jameson Nash said:

Did you swap rows and columns?

Can you explain that a little more? The squared_euclidean_distance_transform function that is 1D works on either rows or columns and then the 2D threaded sedtcalls to the 1D function on both the rows and the columns of a given 2D dt array. Is that at all what you are referring to or am I missing something?

view this post on Zulip Takafumi Arakaki (tkf) (Jul 22 2021 at 02:37):

Is it specific to Threads.@threads? Did you try it without Threads.@threads?

Also, since you have the initializations

v = ones(Int64, size(x))
z = zeros(Float32, size(x) .+ 1)

I think you might need it to do before each call to sedt?

@benchmark sedt($x, $dt, fill!($v, 1), fill!($z, 0), $nthreads)

view this post on Zulip Dale Black (Jul 23 2021 at 03:53):

Yes it works without using threads. I’m on mobile and I won’t be near my laptop for a couple days but I will give your suggestion a try soon. That might be the problem!

view this post on Zulip Jameson Nash (Jul 27 2021 at 14:03):

Okay, it looked like your threads loop was using column indexes to slice the rows and the opposite

view this post on Zulip Dale Black (Aug 12 2021 at 02:30):

#helpdesk > ✔ How to rewrite for loops for GPU solved the benchmarking problem!

view this post on Zulip Notification Bot (Aug 12 2021 at 02:30):

Dale Black has marked this topic as resolved.


Last updated: Nov 22 2024 at 04:41 UTC