RealNVP network

For the definition of this network and the concepts of normalizing flows, please refer to this RealNVP blog post: https://lilianweng.github.io/lil-log/2018/10/13/flow-based-deep-generative-models.html and the PyTorch notebook: https://github.com/GiggleLiu/marburg/blob/master/solutions/realnvp.ipynb

using NiLang, NiLang.AD
using LinearAlgebra
using DelimitedFiles
using Plots

Include the optimizer; you can find it in the Adam.jl file under the examples/ folder.

include("Adam.jl")

Model definition

First, we define the single-layer transformation and its behavior under GVar, the gradient wrapper.

struct RealNVPLayer{T}
    # transform network
    W1::Matrix{T}
    b1::Vector{T}
    W2::Matrix{T}
    b2::Vector{T}
    y1::Vector{T}   # hidden-layer workspace (pre-activation), uncomputed after use
    y1a::Vector{T}  # hidden-layer workspace (after ReLU), uncomputed after use

    # scaling network
    sW1::Matrix{T}
    sb1::Vector{T}
    sW2::Matrix{T}
    sb2::Vector{T}
    sy1::Vector{T}  # scale-network workspace (pre-activation), uncomputed after use
    sy1a::Vector{T} # scale-network workspace (after ReLU), uncomputed after use
end

"""collect parameters in the `layer` into a vector `out`."""
function collect_params!(out, layer::RealNVPLayer)
    k=0
    for field in [:W1, :b1, :W2, :b2, :sW1, :sb1, :sW2, :sb2]
        v = getfield(layer, field)
        nv = length(v)
        out[k+1:k+nv] .= vec(v)
        k += nv
    end
    return out
end

"""dispatch vectorized parameters `out` into the `layer`."""
function dispatch_params!(layer::RealNVPLayer, out)
    k=0
    for field in [:W1, :b1, :W2, :b2, :sW1, :sb1, :sW2, :sb2]
        v = getfield(layer, field)
        nv = length(v)
        vec(v) .= out[k+1:k+nv]
        k += nv
    end
    return out
end

function nparameters(n::RealNVPLayer)
    sum(x->length(getfield(n, x)), [:W1, :b1, :W2, :b2, :sW1, :sb1, :sW2, :sb2])
end

Then we define the network type and how to access its parameters.

const RealNVP{T} = Vector{RealNVPLayer{T}}

nparameters(n::RealNVP) = sum(nparameters, n)

function collect_params(n::RealNVP{T}) where T
    out = zeros(T, nparameters(n))
    k = 0
    for layer in n
        np = nparameters(layer)
        collect_params!(view(out, k+1:k+np), layer)
        k += np
    end
    return out
end

function dispatch_params!(network::RealNVP, out)
    k = 0
    for layer in network
        np = nparameters(layer)
        dispatch_params!(layer, view(out, k+1:k+np))
        k += np
    end
    return network
end

function random_realnvp(nparams::Int, nhidden::Int, nhidden_s::Int, nlayer::Int; scale=0.1)
    random_realnvp(Float64, nparams, nhidden, nhidden_s, nlayer; scale=scale)
end

function random_realnvp(::Type{T}, nparams::Int, nhidden::Int, nhidden_s::Int, nlayer::Int; scale=0.1) where T
    nin = nparams÷2
    scale = T(scale)
    y1 = zeros(T, nhidden)
    sy1 = zeros(T, nhidden_s)
    RealNVPLayer{T}[RealNVPLayer(
            randn(T, nhidden, nin)*scale, randn(T, nhidden)*scale,
            randn(T, nin, nhidden)*scale, randn(T, nin)*scale, y1, zero(y1),
            randn(T, nhidden_s, nin)*scale, randn(T, nhidden_s)*scale,
            randn(T, nin, nhidden_s)*scale, randn(T, nin)*scale, sy1, zero(sy1),
            ) for _ = 1:nlayer]
end
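
As a quick sanity check (a minimal sketch, not part of the original pipeline), we can build a small model and verify that collecting and re-dispatching parameters is a round trip:

m = random_realnvp(Float64, 2, 10, 10, 4; scale=0.1)
p = collect_params(m)          # flatten all layer parameters into one vector
dispatch_params!(m, 0.5 .* p)  # write scaled parameters back into the model
collect_params(m) ≈ 0.5 .* p   # true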

Loss function

Good! We still need to define some utilities, such as the affine! transformation.

@i function affine!(y!::AbstractVector{T}, W::AbstractMatrix{T}, b::AbstractVector{T}, x::AbstractVector{T}) where T
    @safe @assert size(W) == (length(y!), length(x)) && length(b) == length(y!)
    @invcheckoff for j=1:size(W, 2)
        for i=1:size(W, 1)
            @inbounds y![i] += W[i,j]*x[j]
        end
    end
    @invcheckoff for i=1:size(W, 1)
        @inbounds y![i] += identity(b[i])
    end
end
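
A small usage sketch (the names W, b, x and y here are only for illustration): affine! adds W*x + b into y!, and calling its inverse uncomputes it.

W, b, x = randn(3, 2), randn(3), randn(2)
y = zeros(3)
affine!(y, W, b, x)      # now y ≈ W*x .+ b
(~affine!)(y, W, b, x)   # uncompute: y is restored to zeros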

In each layer, we use the information in x to update y!: the coupling transformation is y! ← y! * exp(s(x)) + t(x), where s is computed by the scale network (with a tanh applied in the last layer) and t by the transform network. During the computation we use two vector ancillas, y1 and y1a (plus their scale-network counterparts), both of which can be uncomputed at the end of the function.

@i function onelayer!(x::AbstractVector{T}, layer::RealNVPLayer{T},
                y!::AbstractVector{T}, logjacobian!::T; islast) where T
    @routine @invcheckoff begin
        # scale network
        scale ← zero(y!)
        ytemp2 ← zero(y!)
        affine!(layer.sy1, layer.sW1, layer.sb1, x)
        @inbounds for i=1:length(layer.sy1)
            if (layer.sy1[i] > 0, ~)
                layer.sy1a[i] += identity(layer.sy1[i])
            end
        end
        affine!(scale, layer.sW2, layer.sb2, layer.sy1a)

        # transform network
        affine!(layer.y1, layer.W1, layer.b1, x)
        # relu
        @inbounds for i=1:length(layer.y1)
            if (layer.y1[i] > 0, ~)
                layer.y1a[i] += identity(layer.y1[i])
            end
        end
    end
    # inplace multiply exp of scale! -- dangerous
    @inbounds @invcheckoff for i=1:length(scale)
        @routine begin
            expscale ← zero(T)
            tanhscale ← zero(T)
            if (islast, ~)
                tanhscale += tanh(scale[i])
            else
                tanhscale += identity(scale[i])
            end
            expscale += exp(tanhscale)
        end
        logjacobian! += identity(tanhscale)
        # inplace multiply!!!
        temp ← zero(T)
        temp += y![i] * expscale
        SWAP(temp, y![i])
        temp -= y![i] / expscale
        temp → zero(T)
        ~@routine
    end

    # affine the transform layer
    affine!(y!, layer.W2, layer.b2, layer.y1a)
    ~@routine
    # clean up accumulated rounding error, since this memory is reused.
    @safe layer.y1 .= zero(T)
    @safe layer.sy1 .= zero(T)
end

A RealNVP network always transforms its inputs reversibly. We update one half of x! at a time, so that the input and output memory spaces do not clash.

@i function realnvp!(x!::AbstractVector{T}, network::RealNVP{T}, logjacobian!) where T
    @invcheckoff for i=1:length(network)
        np ← length(x!)
        if (i%2==0, ~)
            @inbounds onelayer!(view(x!,np÷2+1:np), network[i], view(x!,1:np÷2), logjacobian!; islast=i==length(network))
        else
            @inbounds onelayer!(view(x!,1:np÷2), network[i], view(x!,np÷2+1:np), logjacobian!; islast=i==length(network))
        end
    end
end
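
To see this reversibility in action, here is a sketch with a small, randomly initialized model: pushing a latent sample through the flow and then running the inverse program recovers the sample.

m = random_realnvp(Float64, 2, 10, 10, 4)
z = randn(2)
x, _, logj = realnvp!(copy(z), m, 0.0)    # forward: latent sample -> data space
z2, _, _ = (~realnvp!)(copy(x), m, logj)  # inverse: recovers z and zeros the log-Jacobian
z2 ≈ z                                    # true up to floating-point error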

How to obtain the log-probability of a data sample: we run the network in reverse to map the sample back to latent space, accumulating the log-Jacobian of this inverse map into out!, and then add the log-density of the standard Gaussian prior (dropping the normalization constant).

@i function logp!(out!::T, x!::AbstractVector{T}, network::RealNVP{T}) where T
    (~realnvp!)(x!, network, out!)
    @invcheckoff for i = 1:length(x!)
        @routine begin
            xsq ← zero(T)
            @inbounds xsq += x![i]^2
        end
        out! -= 0.5 * xsq
        ~@routine
    end
end
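
For example (a sketch with a randomly initialized model), the log-probability of a single sample can be evaluated as below. Note that the constant -log(2π)/2 per dimension of the Gaussian prior is dropped, which does not affect training.

m = random_realnvp(Float64, 2, 10, 10, 4)
x = randn(2)
lp, _, _ = logp!(0.0, copy(x), m)   # unnormalized log-probability of x under the model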

The negative log-likelihood loss, averaged over a batch (the columns of xs!).

@i function nll_loss!(out!::T, cum!::T, xs!::Matrix{T}, network::RealNVP{T}) where T
    @invcheckoff for i=1:size(xs!, 2)
        @inbounds logp!(cum!, view(xs!,:,i), network)
    end
    out! -= cum!/size(xs!, 2)
end
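
Evaluating the loss on a batch could look like this (a sketch using a freshly initialized model and random data, purely for illustration):

m = random_realnvp(Float64, 2, 10, 10, 4)
xs = randn(2, 100)
loss, _, _, _ = nll_loss!(0.0, 0.0, copy(xs), m)   # mean negative log-likelihood over 100 samples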

Training

function train(x_data, model; num_epochs = 800)
    params = collect_params(model)
    optimizer = Adam(; lr=0.01)
    for epoch = 1:num_epochs
        # forward pass: returns the loss together with the (mutated) arguments
        loss, a, b, c = nll_loss!(0.0, 0.0, copy(x_data), model)
        if epoch % 50 == 1
            println("epoch = $epoch, loss = $loss")
            display(showmodel(x_data, model))
        end
        # backward pass: run the inverse program on GVar-wrapped outputs, seeding
        # the gradient of the loss with 1.0; the returned model carries the
        # parameter gradients, which we read out with grad.
        _, _, _, gmodel = (~nll_loss!)(GVar(loss, 1.0), GVar(a), GVar(b), GVar(c))
        g = grad.(collect_params(gmodel))
        update!(params, g, optimizer)
        dispatch_params!(model, params)
    end
    return model
end

function showmodel(x_data, model; nsamples=2000)
    scatter(x_data[1,1:nsamples], x_data[2,1:nsamples]; xlims=(-5,5), ylims=(-5,5))
    # sample from the model: draw latent Gaussian samples and push them through the flow
    zs = randn(2, nsamples)
    for i=1:nsamples
        realnvp!(view(zs, :, i), model, 0.0)
    end
    scatter!(zs[1,:], zs[2,:])
end

You can find the training data (train.dat) in the examples/ folder.

x_data = Matrix(readdlm(joinpath(@__DIR__, "train.dat"))')

import Random; Random.seed!(22)
model = random_realnvp(Float64, size(x_data, 1), 10, 10, 4; scale=0.1)

Before training, the model distribution looks like this:

model = train(x_data, model; num_epochs=800)

After training, the model distribution looks like this:


This page was generated using Literate.jl.