From 5790b73f3f7f0475cef2d398ab3af57fcc537ee5 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 1 May 2023 17:24:06 -0400 Subject: [PATCH] `gpu(::DataLoader)`, take III (#2245) * simpler MLUtils gpu(::DataLoader) * docs * also move cpu/gpu docstrings to a reference section * doc fixes * less verbose code in docs * tweak words * Apply 3 suggestions --- NEWS.md | 2 + docs/src/gpu.md | 75 +++++++++++++++---------------------- docs/src/models/functors.md | 10 +++++ src/functor.jl | 56 +++++++++++++++++++++++++++ test/amd/basic.jl | 13 +++++++ test/cuda/cuda.jl | 26 +++++++++++++ 6 files changed, 138 insertions(+), 44 deletions(-) diff --git a/NEWS.md b/NEWS.md index bd39fbcf7c..e7fad6ccf0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ See also [github's page](https://github.com/FluxML/Flux.jl/releases) for a compl ## v0.13.16 * Most greek-letter keyword arguments are deprecated in favour of ascii. Thus `LayerNorm(3; ϵ=1e-4)` (not `ε`!) should become `LayerNorm(3; eps=1e-4)`. + * `DataLoader(...) |> gpu` will now produce a special iterator, moving each batch as needed, + instead of giving an error. ## v0.13.15 * Added [MultiHeadAttention](https://github.com/FluxML/Flux.jl/pull/2146) layer. diff --git a/docs/src/gpu.md b/docs/src/gpu.md index 18ed7d7d9e..8a7bb7b721 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -49,7 +49,7 @@ julia> Flux.GPU_BACKEND "CUDA" ``` -## GPU Usage +## Basic GPU Usage Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CUDA](https://github.com/JuliaGPU/CUDA.jl). Flux is agnostic to array types, so we simply need to move model weights and data to the GPU and Flux will handle it. @@ -122,61 +122,48 @@ julia> x |> cpu 0.7766742 ``` -```@docs -cpu -gpu -``` - -## Common GPU Workflows - -Some of the common workflows involving the use of GPUs are presented below. - -### Transferring Training Data +## Transferring Training Data -In order to train the model using the GPU both model and the training data have to be transferred to GPU memory. This process can be done with the `gpu` function in two different ways: +In order to train the model using the GPU both model and the training data have to be transferred to GPU memory. Moving the data can be done in two different ways: -1. Iterating over the batches in a [DataLoader](@ref) object transferring each one of the training batches at a time to the GPU. +1. Iterating over the batches in a [`DataLoader`](@ref) object transferring each one of the training batches at a time to the GPU. This is recommended for large datasets. Done by hand, it might look like this: ```julia - train_loader = Flux.DataLoader((xtrain, ytrain), batchsize = 64, shuffle = true) - # ... model, optimiser and loss definitions - for epoch in 1:nepochs - for (xtrain_batch, ytrain_batch) in train_loader - x, y = gpu(xtrain_batch), gpu(ytrain_batch) - gradients = gradient(() -> loss(x, y), parameters) - Flux.Optimise.update!(optimiser, parameters, gradients) + train_loader = Flux.DataLoader((X, Y), batchsize=64, shuffle=true) + # ... model definition, optimiser setup + for epoch in 1:epochs + for (x_cpu, y_cpu) in train_loader + x = gpu(x_cpu) + y = gpu(y_cpu) + grads = gradient(m -> loss(m, x, y), model) + Flux.update!(opt_state, model, grads[1]) end end ``` - -2. Transferring all training data to the GPU at once before creating the [DataLoader](@ref) object. 
This is usually performed for smaller datasets which are sure to fit in the available GPU memory. Some possibilities are: - ```julia - gpu_train_loader = Flux.DataLoader((xtrain |> gpu, ytrain |> gpu), batchsize = 32) - ``` - ```julia - gpu_train_loader = Flux.DataLoader((xtrain, ytrain) |> gpu, batchsize = 32) - ``` - Note that both `gpu` and `cpu` are smart enough to recurse through tuples and namedtuples. Another possibility is to use [`MLUtils.mapsobs`](https://juliaml.github.io/MLUtils.jl/dev/api/#MLUtils.mapobs) to push the data movement invocation into the background thread: - ```julia - using MLUtils: mapobs - # ... - gpu_train_loader = Flux.DataLoader(mapobs(gpu, (xtrain, ytrain)), batchsize = 16) - ``` - -3. Wrapping the `DataLoader` in [`CUDA.CuIterator`](https://cuda.juliagpu.org/stable/usage/memory/#Batching-iterator) to efficiently move data to GPU on demand: + Rather than write this out every time, you can just call `gpu(::DataLoader)`: ```julia - using CUDA: CuIterator - train_loader = Flux.DataLoader((xtrain, ytrain), batchsize = 64, shuffle = true) - # ... model, optimiser and loss definitions - for epoch in 1:nepochs - for (xtrain_batch, ytrain_batch) in CuIterator(train_loader) - # ... + gpu_train_loader = Flux.DataLoader((X, Y), batchsize=64, shuffle=true) |> gpu + # ... model definition, optimiser setup + for epoch in 1:epochs + for (x, y) in gpu_train_loader + grads = gradient(m -> loss(m, x, y), model) + Flux.update!(opt_state, model, grads[1]) end end ``` + This is equivalent to `DataLoader(MLUtils.mapobs(gpu, (X, Y)); keywords...)`. + Something similar can also be done with [`CUDA.CuIterator`](https://cuda.juliagpu.org/stable/usage/memory/#Batching-iterator), `gpu_train_loader = CUDA.CuIterator(train_loader)`. However, this only works with a limited number of data types: `first(train_loader)` should be a tuple (or `NamedTuple`) of arrays. - Note that this works with a limited number of data types. If `iterate(train_loader)` returns anything other than arrays, approach 1 or 2 is preferred. +2. Transferring all training data to the GPU at once before creating the `DataLoader`. This is usually performed for smaller datasets which are sure to fit in the available GPU memory. + ```julia + gpu_train_loader = Flux.DataLoader((X, Y) |> gpu, batchsize = 32) + # ... + for epoch in 1:epochs + for (x, y) in gpu_train_loader + # ... + ``` + Here `(X, Y) |> gpu` applies [`gpu`](@ref) to both arrays, as it recurses into structures. -### Saving GPU-Trained Models +## Saving GPU-Trained Models After the training process is done, one must always transfer the trained model back to the `cpu` memory scope before serializing or saving to disk. This can be done, as described in the previous section, with: ```julia diff --git a/docs/src/models/functors.md b/docs/src/models/functors.md index 252841c0c2..ab0883c95e 100644 --- a/docs/src/models/functors.md +++ b/docs/src/models/functors.md @@ -15,3 +15,13 @@ Functors.fcollect Functors.functor Functors.fmapstructure ``` + +## Moving models, or data, to the GPU + +Flux provides some convenience functions based on `fmap`. Some ([`f16`](@ref Flux.f16), [`f32`](@ref Flux.f32), [`f64`](@ref Flux.f64)) change the precision of all arrays in a model. 
Others are used for moving a model to or from GPU memory:
+
+```@docs
+cpu
+gpu(::Any)
+gpu(::Flux.DataLoader)
+```
diff --git a/src/functor.jl b/src/functor.jl
index 55d188287b..8e5e7ebc48 100644
--- a/src/functor.jl
+++ b/src/functor.jl
@@ -391,3 +391,59 @@ function gpu(::FluxAMDAdaptor, x)
 end
 
 function _amd end
+
+
+"""
+    gpu(data::DataLoader)
+
+Transforms a given `DataLoader` to apply `gpu` to each batch of data,
+when iterated over. (If no GPU is available, this does nothing.)
+
+# Example
+
+```julia-repl
+julia> dl = Flux.DataLoader((x = ones(2,10), y='a':'j'), batchsize=3)
+4-element DataLoader(::NamedTuple{(:x, :y), Tuple{Matrix{Float64}, StepRange{Char, Int64}}}, batchsize=3)
+  with first element:
+  (; x = 2×3 Matrix{Float64}, y = 3-element StepRange{Char, Int64})
+
+julia> first(dl)
+(x = [1.0 1.0 1.0; 1.0 1.0 1.0], y = 'a':1:'c')
+
+julia> c_dl = gpu(dl)
+4-element DataLoader(::MLUtils.MappedData{:auto, typeof(gpu), NamedTuple{(:x, :y), Tuple{Matrix{Float64}, StepRange{Char, Int64}}}}, batchsize=3)
+  with first element:
+  (; x = 2×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, y = 3-element StepRange{Char, Int64})
+
+julia> first(c_dl).x
+2×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
+ 1.0  1.0  1.0
+ 1.0  1.0  1.0
+```
+
+For large datasets, this is preferred over moving all the data to
+the GPU before creating the `DataLoader`, like this:
+
+```julia-repl
+julia> Flux.DataLoader((x = ones(2,10), y=2:11) |> gpu, batchsize=3)
+4-element DataLoader(::NamedTuple{(:x, :y), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, UnitRange{Int64}}}, batchsize=3)
+  with first element:
+  (; x = 2×3 CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, y = 3-element UnitRange{Int64})
+```
+
+!!! warning
+    This only works if `gpu` is applied directly to the `DataLoader`.
+    While `gpu` acts recursively on Flux models and many basic Julia structs,
+    it will not work on (say) a tuple of `DataLoader`s.
+""" +function gpu(d::MLUtils.DataLoader) + MLUtils.DataLoader(MLUtils.mapobs(gpu, d.data), + d.batchsize, + d.buffer, + d.partial, + d.shuffle, + d.parallel, + d.collate, + d.rng, + ) +end diff --git a/test/amd/basic.jl b/test/amd/basic.jl index d9b8db104a..fde8103bbb 100644 --- a/test/amd/basic.jl +++ b/test/amd/basic.jl @@ -101,3 +101,16 @@ end gpu_autodiff_test(bn, x; atol=1f-3, allow_nothing=true) end end + +@testset "gpu(::DataLoader)" begin + X = randn(Float64, 3, 33) + pre1 = Flux.DataLoader(X |> Flux.gpu; batchsize=13, shuffle=false) + post1 = Flux.DataLoader(X; batchsize=13, shuffle=false) |> Flux.gpu + for epoch in 1:2 + for (p, q) in zip(pre1, post1) + @test p isa ROCArray{Float32} + @test q isa ROCArray{Float32} + @test p ≈ q + end + end +end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index e5e28d428b..c42baa7076 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -178,3 +178,29 @@ end @test cpu(xgpu) isa Vector{A2116} @test cpu(gpu([CartesianIndex(1)])) isa Vector{CartesianIndex{1}} end + +@testset "gpu(::DataLoader)" begin + X = randn(Float64, 3, 33) + pre1 = Flux.DataLoader(X |> gpu; batchsize=13, shuffle=false) + post1 = Flux.DataLoader(X; batchsize=13, shuffle=false) |> gpu + for epoch in 1:2 + for (p, q) in zip(pre1, post1) + @test p isa CuArray{Float32} + @test q isa CuArray{Float32} + @test p ≈ q + end + end + + Y = Flux.onehotbatch(rand(0:2, 33), 0:2) + pre2 = Flux.DataLoader((x=X, y=Y) |> gpu; batchsize=7, shuffle=false) + post2 = Flux.DataLoader((x=X, y=Y); batchsize=7, shuffle=false) |> gpu + for (p, q) in zip(pre2, post2) + @test p.x == q.x + @test_skip p.y == q.y # https://github.com/FluxML/OneHotArrays.jl/issues/28 -- MethodError: getindex(::OneHotArrays.OneHotMatrix{UInt32, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}}, ::Int64, ::Int64) is ambiguous + end + + @test collect(pre2) isa Vector{<:NamedTuple{(:x, :y)}} + @test collect(post2) isa Vector{<:NamedTuple{(:x, :y)}} # collect makes no sense, but check eltype? + + @test_throws Exception gpu(((x = Flux.DataLoader(X), y = Y),)) +end \ No newline at end of file
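
For anyone reviewing or trying out this patch, here is a minimal end-to-end sketch of how the new `gpu(::DataLoader)` method is meant to be used. It is not part of the diff above: the model, loss, optimiser, hyperparameters, and data shapes are placeholder choices for illustration, while `Flux.DataLoader`, `Flux.setup`, `Flux.update!`, `Adam`, and `Flux.mse` are existing Flux API. When no functional GPU backend is loaded, `gpu` is a no-op and everything stays on the CPU.

```julia
# Illustrative sketch only (not part of this patch); assumes Flux >= 0.13.16,
# so that `DataLoader |> gpu` wraps the loader with MLUtils.mapobs(gpu, ...).
using Flux

X = rand(Float32, 10, 64)   # 64 samples with 10 features each (placeholder data)
Y = rand(Float32, 1, 64)    # matching regression targets

model = Chain(Dense(10 => 32, relu), Dense(32 => 1)) |> gpu
opt_state = Flux.setup(Adam(1e-3), model)

loss(m, x, y) = Flux.mse(m(x), y)

# The new method: call gpu on the DataLoader itself, so each batch is moved
# to GPU memory lazily, as it is iterated over.
gpu_train_loader = Flux.DataLoader((X, Y); batchsize=16, shuffle=true) |> gpu

for epoch in 1:5
    for (x, y) in gpu_train_loader      # x and y are GPU arrays here
        grads = gradient(m -> loss(m, x, y), model)
        Flux.update!(opt_state, model, grads[1])
    end
end

model = model |> cpu   # bring the trained model back to CPU memory before saving
```

Only one batch lives in GPU memory at a time, which is the point of routing the transfer through `MLUtils.mapobs(gpu, ...)` rather than calling `gpu` on the whole dataset up front.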