Stream: helpdesk (published)

Topic: OhMyThreads tmap! mutating the input array


view this post on Zulip Moorits Muru (Feb 14 2025 at 13:32):

I am using the OhMyThreads.jl function tmap!. I would like to mutate the input array A, but I am not sure if it is safe, as I don't have much experience with parallel computation. Basically, I am running tmap!(f, A, A). Each calculation of f(A[i]) is independent of the other elements of A. There shouldn't be any races happening, right? Each thread works on its own elements?
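
(For reference, the pattern in question looks roughly like this; f and A below are placeholder stand-ins, not the actual code.)

using OhMyThreads

f(x) = x^2 + 1      # purely elementwise: f(A[i]) never reads other entries of A
A = rand(10_000)
tmap!(f, A, A)      # in place: each task writes only the slots it read from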

view this post on Zulip Mason Protter (Feb 14 2025 at 13:37):

Yes, that should be fine.

view this post on Zulip Moorits Muru (Feb 14 2025 at 13:37):

Thanks!

view this post on Zulip jar (Feb 14 2025 at 19:14):

It might not work right on A::BitArray, or other array types where distinct elements share storage, though?

view this post on Zulip jar (Feb 14 2025 at 19:14):

See also https://github.com/JuliaLang/julia/issues/53140
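
To see why a BitArray is different, here is a hedged sketch (assumptions are noted in the comments, not taken from the issue): 64 Bool entries are packed into a single UInt64 chunk, so writes to "different" indices still read-modify-write the same underlying word.

using OhMyThreads

A = trues(64)        # all 64 entries live in one UInt64 chunk of the BitVector
tmap!(!, A, A)       # same tmap!(f, A, A) pattern, but the element writes share storage
# If two tasks rewrite the shared chunk concurrently, some flips can be lost:
any(A) && @warn "some updates were lost to the chunk-level data race"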

view this post on Zulip Alec (Feb 21 2025 at 04:35):

I have a related question. I am trying to run a simulation across independent objects and then aggregate time-varying results into some storage arrays. Based on reading the OhMyThreads documentation, I think the following code is susceptible to data races, yet after running the simulation for thousands of iterations I have never tripped the assertion I added to check for data races in this dummy case. Of course, in the non-MWE the inner loop does a lot more work.

My questions are:

  1. Am I wrong that this first code block is susceptible to data races, even though I haven't encountered one in practice?
  2. What is the OhMyThreads.jl way of 'saving' the simulation results to a top-level container? From what I read in the documentation, the only pattern that fits is the buffered channel, but it's very clunky (see the second code example).
function inner_loop!(output_vecs, n)
    tid = Threads.threadid()   # index this thread's own column (the per-threadid pattern in question)
    for i in 1:n
        output_vecs.x[i, tid] += 1
        output_vecs.y[i, tid] += 2
    end
end
function simulation()
    n = 10^5
    proj_length = 1200
    nt = Threads.nthreads()
    output = (
        x = zeros(proj_length,nt),
        y = zeros(proj_length,nt),
    )

    Threads.@threads for i in 1:n
        inner_loop!(output,500)
    end

    output = map(x -> vec(reduce(+, x, dims=2)), output)  # collapse the per-thread columns into one vector
    @assert sum(output.x) == n * 500
    @assert sum(output.y) == n * 500 *2
    output
end

simulation()

Buffered channel with OhMyThreads.jl

import OhMyThreads

function inner_loop_buffer!(output_vecs, n)
    for i in 1:n
        output_vecs.x[i] += 1
        output_vecs.y[i] += 2
    end
end
function simulation_buffer()
    n = 10^5
    proj_length = 1200
    nt = Threads.nthreads()
    output = (
        x = zeros(proj_length),
        y = zeros(proj_length),
    )
    # pre-fill a buffered channel with nt reusable buffer sets
    chnl = Channel{typeof(output)}(nt)
    foreach(1:nt) do _
        put!(chnl, (
            x = zeros(proj_length),
            y = zeros(proj_length),
        ))
    end

    OhMyThreads.tmap(1:n) do i
        C = take!(chnl)
        inner_loop_buffer!(C,500)
        put!(chnl,C)
    end

    close(chnl)  # close so the drain loop below terminates once the remaining buffers are taken

    for M in chnl
        output.x .+= M.x
        output.y .+= M.y
    end

    @assert sum(output.x) == n * 500
    @assert sum(output.y) == n * 500 *2
    output
end

simulation_buffer()
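
For comparison, here is a hedged reduction-based sketch (the name simulation_reduce, its keyword defaults, and the manual chunking are illustrative, not taken from the code above): each chunk of iterations accumulates into its own freshly allocated buffers and tmapreduce merges them, so no threadid() indexing and no Channel bookkeeping is needed.

using OhMyThreads: tmapreduce
using Base.Iterators: partition

function simulation_reduce(; n = 10^5, proj_length = 1200, inner_n = 500)
    # one contiguous range of iterations per thread; each range gets its own buffers
    ranges = collect(partition(1:n, cld(n, Threads.nthreads())))
    out = tmapreduce((a, b) -> (x = a.x .+ b.x, y = a.y .+ b.y), ranges) do r
        x = zeros(proj_length)
        y = zeros(proj_length)
        for _ in r, i in 1:inner_n
            x[i] += 1
            y[i] += 2
        end
        (x = x, y = y)      # per-chunk partial result, merged by the reduction operator
    end
    @assert sum(out.x) == n * inner_n
    @assert sum(out.y) == n * inner_n * 2
    return out
end

simulation_reduce()

In this sketch the buffers are allocated once per chunk rather than once per iteration or cycled through a Channel.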

view this post on Zulip Alec (Feb 21 2025 at 04:42):

In addition to being more verbose, the buffered channel version is much slower (maybe just because creating the extra output arrays dominates the runtime?).

julia> @benchmark simulation()
BenchmarkTools.Trial: 1016 samples with 1 evaluation per sample.
 Range (min … max):  4.871 ms …   7.371 ms  ┊ GC (min … max): 0.00% … 32.04%
 Time  (median):     4.897 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   4.925 ms ± 131.609 μs  ┊ GC (mean ± σ):  0.17% ±  1.79%

     ▂▅█▅▁
  ▁▂▇█████▆▅▅▄▄▂▂▄▄▄▄▄▃▃▂▃▃▂▂▃▂▂▃▃▃▂▂▂▂▂▃▂▃▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁ 
  4.87 ms         Histogram: frequency by time        5.03 ms <

 Memory estimate: 96.42 KiB, allocs estimate: 42.

julia> @benchmark simulation_buffer()
BenchmarkTools.Trial: 243 samples with 1 evaluation per sample.
 Range (min … max):  19.796 ms …  24.860 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     20.524 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   20.627 ms ± 559.408 μs  ┊ GC (mean ± σ):  1.08% ± 2.01%

       ▁▄█ ▆█▁▇▁▆▁▁▇▄▂ ▆▂       
  ▃▃▄▃█▇███▇███████████▇██▆█▆█▇▆█▇██▄▇▄▃▃▆▃▇▇▃▄▃▃▁▃▁▃▄▁▁▁▁▁▁▁▃ 
  19.8 ms         Histogram: frequency by time         22.2 ms <

 Memory estimate: 4.60 MiB, allocs estimate: 163.

Last updated: Mar 04 2025 at 04:41 UTC