I have been working on a parallelized function that uses Threads.@threads
. The function works perfectly fine on its own, but when I try to benchmark the function I get this error. Is there anything I need to take into account when using BenchmarkTools.jl for threaded functions? I also implement a @views
macro which might be important to mention?
Normal:
sedt(x, dt, v, z, nthreads)
5×4 Matrix{Float32}:
1.0 1.0 5.0 10.0
0.0 0.0 2.0 5.0
1.0 1.0 1.0 2.0
1.0 4.0 2.0 1.0
1.0 9.0 5.0 2.0
Timing:
@benchmark sedt($x, $dt, $v, $z, $nthreads)
TaskFailedException
nested task error: BoundsError: attempt to access 4-element view(::Matrix{Int64}, 5, :) with eltype Int64 at index [5]
Stacktrace:
[1] throw_boundserror(A::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, I::Tuple{Int64})
@ Base ./abstractarray.jl:651
[2] checkbounds
@ ./abstractarray.jl:616 [inlined]
[3] getindex
@ ./subarray.jl:302 [inlined]
[4] squared_euclidean_distance_transform(f::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, dt::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, v::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, z::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true})
@ DistanceTransforms ~/.julia/packages/DistanceTransforms/1wk55/src/squared_euclidean_distance_transform.jl:36
[5] macro expansion
@ ./In[121]:7 [inlined]
[6] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})(onethread::Bool)
@ Main ./threadingconstructs.jl:81
[7] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})()
@ Main ./threadingconstructs.jl:48
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] threading_run(func::Function)
@ Base.Threads ./threadingconstructs.jl:34
[3] macro expansion
@ ./threadingconstructs.jl:93 [inlined]
[4] sedt(img::Matrix{Int64}, dt::Matrix{Float32}, v::Matrix{Int64}, z::Matrix{Float32}, nthreads::Int64)
@ Main ./In[121]:6
[5] var"##core#576"(x#571::Matrix{Int64}, dt#572::Matrix{Float32}, v#573::Matrix{Int64}, z#574::Matrix{Float32}, nthreads#575::Int64)
@ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:479
[6] var"##sample#577"(__params::BenchmarkTools.Parameters)
@ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:485
[7] _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:98
[8] #invokelatest#2
@ ./essentials.jl:710 [inlined]
[9] #run_result#45
@ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:33 [inlined]
[10] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:116
[11] #warmup#54
@ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168 [inlined]
[12] warmup(item::BenchmarkTools.Benchmark)
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168
[13] top-level scope
@ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:387
[14] eval
@ ./boot.jl:360 [inlined]
[15] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base ./loading.jl:1094
Do you have a minimal example?
function squared_euclidean_distance_transform(f::AbstractArray{T,1}, dt, v, z) where {T}
n = length(f)
k = 1
z[1] = -1.0f12
z[2] = 1.0f12
# Lower envelope operation
for q in 2:n
while true
s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
if s ≤ z[k]
k -= 1
else
k += 1
v[k] = q
z[k] = s
z[k + 1] = 1.0f12
break
end
end
end
# Distance transform operation
k = 1
for q in 1:n
while z[k + 1] < q
k = k + 1
end
dt[q] = (q - v[k])^2 + f[v[k]]
end
return dt
end
function sedt(img::AbstractArray{T,2}, dt, v, z, nthreads) where {T}
if nthreads ≤ 1
squared_euclidean_distance_transform(img, dt, v, z)
else
rows, columns = size(img)
Threads.@threads for x in 1:rows
@views squared_euclidean_distance_transform(img[x, :], dt[x, :], v[x, :], z[x, :])
end
Threads.@threads for y in 1:columns
@views squared_euclidean_distance_transform(img[:, y], dt[:, y], v[:, y], z[:, y])
end
return dt
end
end
x = Array([
1 0 1 1
0 0 1 1
1 1 1 1
1 0 0 1
1 1 1 1
])
dt = Array{Float32}(undef, size(x))
v = ones(Int64, size(x))
z = zeros(Float32, size(x) .+ 1)
nthreads = Threads.nthreads();
sedt(x, dt, v, z, nthreads)
# Returns
5×4 Matrix{Float32}:
1.0 1.0 5.0 10.0
0.0 0.0 2.0 5.0
1.0 1.0 1.0 2.0
1.0 4.0 2.0 1.0
1.0 9.0 5.0 2.0
@benchmark sedt($x, $dt, $v, $z, $nthreads)
# Returns
TaskFailedException
nested task error: BoundsError: attempt to access 4-element view(::Matrix{Int64}, 5, :) with eltype Int64 at index [5]
Stacktrace:
[1] throw_boundserror(A::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, I::Tuple{Int64})
@ Base ./abstractarray.jl:651
[2] checkbounds
@ ./abstractarray.jl:616 [inlined]
[3] getindex
@ ./subarray.jl:302 [inlined]
[4] squared_euclidean_distance_transform(f::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, dt::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, v::SubArray{Int64, 1, Matrix{Int64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}, z::SubArray{Float32, 1, Matrix{Float32}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true})
@ DistanceTransforms ~/.julia/packages/DistanceTransforms/1wk55/src/squared_euclidean_distance_transform.jl:36
[5] macro expansion
@ ./In[121]:7 [inlined]
[6] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})(onethread::Bool)
@ Main ./threadingconstructs.jl:81
[7] (::var"#1138#threadsfor_fun#47"{Matrix{Int64}, Matrix{Float32}, Matrix{Int64}, Matrix{Float32}, UnitRange{Int64}})()
@ Main ./threadingconstructs.jl:48
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] threading_run(func::Function)
@ Base.Threads ./threadingconstructs.jl:34
[3] macro expansion
@ ./threadingconstructs.jl:93 [inlined]
[4] sedt(img::Matrix{Int64}, dt::Matrix{Float32}, v::Matrix{Int64}, z::Matrix{Float32}, nthreads::Int64)
@ Main ./In[121]:6
[5] var"##core#576"(x#571::Matrix{Int64}, dt#572::Matrix{Float32}, v#573::Matrix{Int64}, z#574::Matrix{Float32}, nthreads#575::Int64)
@ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:479
[6] var"##sample#577"(__params::BenchmarkTools.Parameters)
@ Main ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:485
[7] _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:98
[8] #invokelatest#2
@ ./essentials.jl:710 [inlined]
[9] #run_result#45
@ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:33 [inlined]
[10] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Iterators.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:116
[11] #warmup#54
@ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168 [inlined]
[12] warmup(item::BenchmarkTools.Benchmark)
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:168
[13] top-level scope
@ ~/.julia/packages/BenchmarkTools/tGTCy/src/execution.jl:387
[14] eval
@ ./boot.jl:360 [inlined]
[15] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base ./loading.jl:1094
You probably have race conditions on z
At least it should be reset between calculations, shouldn't it?
Ah, maybe not.
Did you swap rows and columns?
Jameson Nash said:
Did you swap rows and columns?
Can you explain that a little more? The squared_euclidean_distance_transform
function that is 1D works on either rows or columns and then the 2D threaded sedt
calls to the 1D function on both the rows and the columns of a given 2D dt
array. Is that at all what you are referring to or am I missing something?
Is it specific to Threads.@threads
? Did you try it without Threads.@threads
?
Also, since you have the initializations
v = ones(Int64, size(x))
z = zeros(Float32, size(x) .+ 1)
I think you might need it to do before each call to sedt
?
@benchmark sedt($x, $dt, fill!($v, 1), fill!($z, 0), $nthreads)
Yes it works without using threads. I’m on mobile and I won’t be near my laptop for a couple days but I will give your suggestion a try soon. That might be the problem!
Okay, it looked like your threads loop was using column indexes to slice the rows and the opposite
#helpdesk > ✔ How to rewrite for loops for GPU solved the benchmarking problem!
Dale Black has marked this topic as resolved.
Last updated: Dec 28 2024 at 04:38 UTC