Merge pull request #74 from avik-pal/ap/tests
Add tests for utility functions
avik-pal authored Jun 29, 2022
2 parents 077d67f + 94fe305 commit 183f1c7
Showing 28 changed files with 328 additions and 202 deletions.
62 changes: 62 additions & 0 deletions .github/workflows/Downstream.yml
@@ -0,0 +1,62 @@
name: Downstream
on:
pull_request:
branches:
- main
push:
branches:
- main
concurrency:
# Skip intermediate builds: always.
# Cancel intermediate builds: only if it is a pull request build.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
jobs:
test:
name: ${{ matrix.package.repo }}/${{ matrix.package.group }}
runs-on: ${{ matrix.os }}
env:
GROUP: ${{ matrix.package.group }}
strategy:
fail-fast: false
matrix:
julia-version: [1.7]
os: [ubuntu-latest]
package:
- { user: SciML, repo: DiffEqFlux.jl, group: BasicNeuralDE }
- { user: SciML, repo: DiffEqFlux.jl, group: AdvancedNeuralDE }
- { user: SciML, repo: DeepEquilibriumNetworks.jl, group: All }
if: contains(github.event.pull_request.labels.*.name, 'run downstream test')
steps:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.julia-version }}
arch: x64
- uses: julia-actions/julia-buildpkg@latest
- name: Clone Downstream
uses: actions/checkout@v2
with:
repository: ${{ matrix.package.user }}/${{ matrix.package.repo }}
path: downstream
- name: Load this and run the downstream tests
shell: julia --code-coverage=user --color=yes --project=downstream {0}
run: |
using Pkg
try
# force it to use this PR's version of the package
Pkg.develop(PackageSpec(path=".")) # resolver may fail with main deps
Pkg.update()
Pkg.test() # resolver may fail with test time deps
catch err
err isa Pkg.Resolve.ResolverError || rethrow()
# If we can't resolve, this release is incompatible by SemVer and that's fine;
# it means we marked this as a breaking change, so we don't need to worry about
# mistakenly introducing a breaking change, as we have intentionally made one.
@info "Not compatible with this release. No problem." exception=err
exit(0) # Exit immediately, as a success
end
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v2
with:
files: lcov.info
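For contributors who want to reproduce this check without CI, the run block above can be mirrored in a local Julia session. This is only a sketch: it assumes a clone of the downstream package at `downstream/` (the same path the workflow uses) and that the current directory is the root of the Lux checkout.

```julia
using Pkg

# Activate the downstream package's environment (e.g. a clone of DiffEqFlux.jl at ./downstream)
Pkg.activate("downstream")

try
    # Force the downstream environment to use this checkout of Lux
    Pkg.develop(PackageSpec(path="."))
    Pkg.update()
    Pkg.test()
catch err
    # A resolver error means the downstream package caps Lux below this version,
    # i.e. the release is intentionally breaking, so treat it as a pass
    err isa Pkg.Resolve.ResolverError || rethrow()
    @info "Not compatible with this release. No problem." exception=err
end
```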
4 changes: 3 additions & 1 deletion .gitignore
@@ -6,4 +6,6 @@ wip
model_weights

docs/docs
docs/site
docs/site

scripts
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,13 @@
# v0.4

## v0.4.7

- Manual detailing the Lux interface
- Fixes a bug with ComponentArray + Optimisers
  (https://github.com/FluxML/Optimisers.jl/issues/91)
- `Dropout` layers cache `1 / (1 - p)` for a minor forward-pass speedup
- `dropout` has a custom rrule, which significantly improves performance for smaller arrays

## v0.4.6

- Documentation revamped
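For context, the Optimisers.jl fix listed above concerns the common pattern of flattening Lux parameters into a `ComponentArray` before handing them to Optimisers.jl. A minimal sketch of that pattern, assuming Zygote as the AD backend (the model, learning rate, and input sizes are arbitrary placeholders):

```julia
using Lux, Random, Optimisers, ComponentArrays, Zygote

rng = Random.default_rng()
model = Dense(2, 2)
ps, st = Lux.setup(rng, model)
ps = ComponentArray(ps)          # flat parameter vector with named axes

opt_state = Optimisers.setup(Optimisers.Adam(0.01f0), ps)

x = randn(rng, Float32, 2, 8)
# model(x, ps, st) returns (output, updated_state); differentiate a scalar of the output
grads = Zygote.gradient(p -> sum(first(model(x, p, st))), ps)[1]
opt_state, ps = Optimisers.update(opt_state, ps, grads)
```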
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "0.4.7-DEV"
version = "0.4.7"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
2 changes: 1 addition & 1 deletion docs/mkdocs.yml
@@ -32,7 +32,7 @@ theme:
twitter_name: "@avikpal1410"
twitter_url: "https://twitter.com/avikpal1410"

# TODO: Setup mkdocs for showing documentation versions
# TODO(@avik-pal): Setup mkdocs for showing documentation versions
# extra:
# version:
# provider: mike
7 changes: 2 additions & 5 deletions docs/src/api/utilities.md
@@ -24,12 +24,9 @@ Lux.zeros32
Lux.applyactivation
Lux.elementwise_add
Lux.elementwise_mul
```

## RNN Utilities

```@docs
Lux.istraining
Lux.multigate
Lux.replicate
```

## Index
2 changes: 1 addition & 1 deletion docs/src/devdocs/style_guide.md
@@ -43,7 +43,7 @@ We do have automatic formatter, which opens PR after fixing common style issues,
in src should have a complementary file in the test folder, containing tests relevant to
that file's contents.

* Add generic utilities for testing in `test/utils.jl` and include them in the relevant
* Add generic utilities for testing in `test/test_utils.jl` and include them in the relevant
files.

* Use [JET.jl](https://aviatesk.github.io/JET.jl/dev/) to test for dynamic dispatch in the
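For reference, a dispatch test in that spirit might look like the sketch below. It assumes JET's `@test_opt` and `@test_call` test macros and an arbitrary small `Dense` layer; the project's actual helpers in `test/test_utils.jl` may differ.

```julia
using Lux, JET, Random, Test

rng = Random.default_rng()
model = Dense(2, 4)
ps, st = Lux.setup(rng, model)
x = randn(rng, Float32, 2, 8)

# Fail if JET finds dynamic dispatch originating from Lux in the forward pass
@test_opt target_modules=(Lux,) model(x, ps, st)
# Fail if JET finds potential runtime errors in the forward pass
@test_call model(x, ps, st)
```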
5 changes: 0 additions & 5 deletions docs/src/manual/migrate_from_flux.md
@@ -158,11 +158,6 @@ model or running inference. This is the default mode for `Flux.BatchNorm`, `Flux
do exactly what the user wants), hence our default mode is `training`. This can be changed
using [`Lux.testmode`](@ref).
### Group Normalization
`Flux.GroupNorm` sets `track_stats=true` by default. We set it to `false` since we found
little to no reference for tracking statistics in Group Normalization.
## Can't access functions like `relu`, `sigmoid`, etc?
Unlike Flux we don't reexport functionality from `NNlib`, all you need to do to fix this is
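To illustrate the default-mode behaviour discussed in this file, here is a short sketch of flipping a model's states into test mode with `Lux.testmode` (layer sizes and input shapes are arbitrary):

```julia
using Lux, Random

rng = Random.default_rng()
model = Chain(Dense(2, 4), BatchNorm(4))
ps, st = Lux.setup(rng, model)
x = randn(rng, Float32, 2, 8)

y_train, st_train = model(x, ps, st)   # states default to training mode
st_test = Lux.testmode(st)             # switch the `training` flag in the states
y_test, _ = model(x, ps, st_test)      # BatchNorm now uses its tracked statistics
```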
2 changes: 1 addition & 1 deletion lib/Boltz/src/Boltz.jl
@@ -9,7 +9,7 @@ using Statistics
using Artifacts, LazyArtifacts
using JLD2

# TODO: We want to have generic Lux implementaions for Metalhead models
# TODO(@avik-pal): We want to have generic Lux implementaions for Metalhead models
# We can automatically convert several Metalhead.jl models to Lux
using Metalhead

2 changes: 1 addition & 1 deletion src/adapt.jl
@@ -21,7 +21,7 @@ function adapt_storage(to::LuxCPUAdaptor, x::ComponentArray)
return ComponentArray(adapt_storage(to, getdata(x)), getaxes(x))
end
adapt_storage(::LuxCPUAdaptor, rng::AbstractRNG) = rng
# TODO: SparseArrays
# TODO(@avik-pal): SparseArrays
function adapt_storage(::LuxCPUAdaptor,
x::CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix)
return adapt(Array, x)
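As background, these `adapt_storage` methods back the device-transfer helpers. A sketch of the typical user-facing flow, assuming the `Lux.gpu`/`Lux.cpu` helpers from this release (with `Lux.gpu` expected to fall back to a no-op when no functional CUDA device is present):

```julia
using Lux, Random

rng = Random.default_rng()
model = Dense(2, 4)
ps, st = Lux.setup(rng, model)

ps_gpu = Lux.gpu(ps)       # moves parameters to CUDA arrays when a GPU is available
ps_cpu = Lux.cpu(ps_gpu)   # brings them back through the LuxCPUAdaptor methods above
```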
24 changes: 15 additions & 9 deletions src/autodiff.jl
@@ -2,7 +2,7 @@
ChainRulesCore.@non_differentiable replicate(::Any)
ChainRulesCore.@non_differentiable update_statistics(::Any, ::Any, ::Any, ::Any, ::Any,
::Any, ::Any)
ChainRulesCore.@non_differentiable generate_dropout_mask(::Any, ::Any, ::Any)
ChainRulesCore.@non_differentiable generate_dropout_mask(::Any, ::Any, ::Any, ::Any)
ChainRulesCore.@non_differentiable compute_adaptive_pooling_dims(::Any, ::Any)
ChainRulesCore.@non_differentiable glorot_normal(::Any...)
ChainRulesCore.@non_differentiable glorot_uniform(::Any...)
@@ -18,14 +18,9 @@ function ChainRulesCore.rrule(::typeof(Base.broadcasted), ::typeof(identity), x)
end

# NNlib Functions
function ChainRulesCore.rrule(::typeof(batchnorm),
g::CuArray{T},
b::CuArray{T},
x::Union{CuArray{T, 4}, CuArray{T, 5}},
running_mean,
running_var,
momentum;
kwargs...) where {T <: CUDNNFloat}
function ChainRulesCore.rrule(::typeof(batchnorm), g::CuArray{T}, b::CuArray{T},
x::Union{CuArray{T, 4}, CuArray{T, 5}}, running_mean,
running_var, momentum; kwargs...) where {T <: CUDNNFloat}
y = batchnorm(g, b, x, running_mean, running_var, momentum; kwargs...)
function batchnorm_pullback(dy)
dg, db, dx = ∇batchnorm(g, b, x, dy, running_mean, running_var, momentum; kwargs...)
@@ -34,6 +29,17 @@ function ChainRulesCore.rrule(::typeof(batchnorm),
return y, batchnorm_pullback
end

function ChainRulesCore.rrule(::typeof(dropout), rng::AbstractRNG, x::AbstractArray{T, N},
p::T, q::T, dims, t::Val{training}) where {T, N, training}
y, mask, rng = dropout(rng, x, p, q, dims, t)
function dropout_pullback((dy, dmask, drng))
return (NoTangent(), NoTangent(), elementwise_mul(dy, mask), NoTangent(),
NoTangent(),
NoTangent(), NoTangent())
end
return (y, mask, rng), dropout_pullback
end

# Activation Rrules
function ChainRulesCore.rrule(::typeof(applyactivation), f::cudnnValidActivationTypes,
x::CuArray{T}) where {T <: CUDNNFloat}
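For orientation, the new `dropout` rrule above is what an AD backend picks up when differentiating the functional form directly. A minimal sketch, assuming Zygote and the internal `Lux.dropout(rng, x, p, q, dims, ::Val{training})` method matching the signature above, with `q = 1 / (1 - p)`:

```julia
using Lux, Random, Zygote

rng = Random.default_rng()
x = randn(rng, Float32, 4, 3)
p = 0.5f0
q = 1 / (1 - p)   # cached inverse keep-probability, as stored by the Dropout layer

# The custom pullback returns `dy .* mask` for `x` and NoTangent() for everything else
grads = Zygote.gradient(x) do x
    y, _mask, _rng = Lux.dropout(rng, x, p, q, Colon(), Val(true))
    sum(y)
end
```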
2 changes: 1 addition & 1 deletion src/layers/basic.jl
@@ -473,7 +473,7 @@ keyword argument `disable_optimizations`.
- All sublayers are recursively optimized.
- If a function `f` is passed as a layer and it doesn't take 3 inputs, it is converted to
a WrappedFunction(`f`) which takes only one input.
a [`WrappedFunction`](@ref)(`f`) which takes only one input.
- If the layer is a Chain, it is flattened.
- [`NoOpLayer`](@ref)s are removed.
- If there is only 1 layer (left after optimizations), then it is returned without the
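To illustrate the `WrappedFunction` conversion mentioned above, a sketch of a `Chain` containing a plain one-argument function (layer sizes are arbitrary):

```julia
using Lux, Random

# The anonymous function takes a single input, so Chain wraps it in a WrappedFunction
model = Chain(Dense(2, 4), x -> 2 .* x, Dense(4, 1))

rng = Random.default_rng()
ps, st = Lux.setup(rng, model)
y, st = model(randn(rng, Float32, 2, 8), ps, st)
```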
16 changes: 8 additions & 8 deletions src/layers/dropout.jl
@@ -33,23 +33,23 @@ See also [`VariationalHiddenDropout`](@ref)
"""
struct Dropout{T, D} <: AbstractExplicitLayer
p::T
q::T
dims::D
end

function initialstates(rng::AbstractRNG, ::Dropout)
# FIXME: Take PRNGs seriously
randn(rng, 1)
randn(rng)
return (rng=replicate(rng), training=Val(true))
end

function Dropout(p; dims=:)
@assert 0 p 1
iszero(p) && return NoOpLayer()
return Dropout(p, dims)
return Dropout(p, 1 / (1 - p), dims)
end

function (d::Dropout{T})(x::AbstractArray{T}, ps, st::NamedTuple) where {T}
y, _, rng = dropout(st.rng, x, d.p, d.dims, st.training)
y, _, rng = dropout(st.rng, x, d.p, d.q, d.dims, st.training)
return y, merge(st, (rng=rng,))
end

@@ -98,24 +98,24 @@ See also [`Dropout`](@ref)
"""
struct VariationalHiddenDropout{T, D} <: AbstractExplicitLayer
p::T
q::T
dims::D
end

function initialstates(rng::AbstractRNG, ::VariationalHiddenDropout)
# FIXME: Take PRNGs seriously
randn(rng, 1)
randn(rng)
return (rng=replicate(rng), training=Val(true), update_mask=Val(true),
mask=nothing)
end

function VariationalHiddenDropout(p; dims=:)
@assert 0 p 1
iszero(p) && return NoOpLayer()
return VariationalHiddenDropout(p, dims)
return VariationalHiddenDropout(p, 1 / (1 - p), dims)
end

function (d::VariationalHiddenDropout{T})(x::AbstractArray{T}, ps, st::NamedTuple) where {T}
y, mask, rng, update_mask = dropout(st.rng, x, st.mask, d.p, d.dims, st.training,
y, mask, rng, update_mask = dropout(st.rng, x, st.mask, d.p, d.q, d.dims, st.training,
st.update_mask)
return y, merge(st, (mask=mask, rng=rng, update_mask=update_mask))
end
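The net effect of the change above is that both dropout layers now carry the precomputed scale `q = 1 / (1 - p)` alongside `p`. A usage sketch (probability and shapes are arbitrary):

```julia
using Lux, Random

rng = Random.default_rng()
d = Dropout(0.5f0)           # stores p = 0.5f0 and the cached q = 1 / (1 - p) = 2.0f0
ps, st = Lux.setup(rng, d)   # state holds a replicated rng and training = Val(true)

x = randn(rng, Float32, 4, 3)
y, st = d(x, ps, st)         # kept entries are scaled by q; the advanced rng comes back in st

Dropout(0.0) isa NoOpLayer   # true: a zero probability short-circuits to NoOpLayer
```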
25 changes: 6 additions & 19 deletions src/layers/normalize.jl
@@ -81,13 +81,8 @@ struct BatchNorm{affine, track_stats, F1, F2, F3, N} <:
init_scale::F3
end

function BatchNorm(chs::Int,
activation=identity;
init_bias=zeros32,
init_scale=ones32,
affine::Bool=true,
track_stats::Bool=true,
epsilon=1.0f-5,
function BatchNorm(chs::Int, activation=identity; init_bias=zeros32, init_scale=ones32,
affine::Bool=true, track_stats::Bool=true, epsilon=1.0f-5,
momentum=0.1f0)
activation = NNlib.fast_act(activation)
return BatchNorm{affine, track_stats, typeof(activation), typeof(init_bias),
@@ -182,7 +177,7 @@ end

"""
GroupNorm(chs::Integer, groups::Integer, activation=identity; init_bias=zeros32,
init_scale=ones32, affine=true, track_stats=false, epsilon=1f-5,
init_scale=ones32, affine=true, track_stats=true, epsilon=1f-5,
momentum=0.1f0)
[Group Normalization](https://arxiv.org/abs/1803.08494) layer.
@@ -265,14 +260,8 @@ struct GroupNorm{affine, track_stats, F1, F2, F3, N} <:
groups::Int
end

function GroupNorm(chs::Int,
groups::Int,
activation=identity;
init_bias=zeros32,
init_scale=ones32,
affine::Bool=true,
track_stats::Bool=true,
epsilon=1.0f-5,
function GroupNorm(chs::Integer, groups::Integer, activation=identity; init_bias=zeros32,
init_scale=ones32, affine=true, track_stats=true, epsilon=1.0f-5,
momentum=0.1f0)
@assert chs % groups==0 "The number of groups ($(groups)) must divide the number of channels ($chs)"
activation = NNlib.fast_act(activation)
@@ -410,9 +399,7 @@ function (wn::WeightNorm)(x, ps, s::NamedTuple)
end

@inbounds @generated function get_normalized_parameters(::WeightNorm{Val{which_params}},
dims::T,
ps::Union{ComponentArray, NamedTuple
}) where {T, which_params}
dims::T, ps) where {T, which_params}
parameter_names = string.(which_params)
v_parameter_names = Symbol.(parameter_names .* "_v")
g_parameter_names = Symbol.(parameter_names .* "_g")
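For reference, the reorganized constructors above keep the same keyword defaults; a construction sketch (channel and group counts are arbitrary):

```julia
using Lux, Random

rng = Random.default_rng()

bn = BatchNorm(4; affine=true, track_stats=true, epsilon=1.0f-5, momentum=0.1f0)
gn = GroupNorm(4, 2)   # the 2 groups must divide the 4 channels, per the @assert above

ps, st = Lux.setup(rng, bn)
x = randn(rng, Float32, 4, 8)   # channels along the second-to-last dimension
y, st = bn(x, ps, st)
```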
6 changes: 3 additions & 3 deletions src/layers/recurrent.jl
@@ -67,7 +67,7 @@ function initialparameters(rng::AbstractRNG, rnn::RNNCell{bias}) where {bias}
end

function initialstates(rng::AbstractRNG, ::RNNCell)
# FIXME: Take PRNGs seriously
# FIXME(@avik-pal): Take PRNGs seriously
randn(rng, 1)
return (rng=replicate(rng),)
end
@@ -202,7 +202,7 @@ function initialparameters(rng::AbstractRNG, lstm::LSTMCell)
end

function initialstates(rng::AbstractRNG, ::LSTMCell)
# FIXME: Take PRNGs seriously
# FIXME(@avik-pal): Take PRNGs seriously
randn(rng, 1)
return (rng=replicate(rng),)
end
@@ -310,7 +310,7 @@ function initialparameters(rng::AbstractRNG, gru::GRUCell)
end

function initialstates(rng::AbstractRNG, ::GRUCell)
# FIXME: Take PRNGs seriously
# FIXME(@avik-pal): Take PRNGs seriously
randn(rng, 1)
return (rng=replicate(rng),)
end