Add NNPACK support #67

Merged 62 commits on Apr 30, 2019
Changes shown from 14 commits.

Commits (62):
1aed8d9
Add NNPACK support
avik-pal Sep 19, 2018
7399ef7
Add dependencies
avik-pal Sep 19, 2018
ffc6001
simplify
avik-pal Sep 19, 2018
6cd43f6
Fix conv
avik-pal Sep 20, 2018
9d23d6e
Add tests
avik-pal Sep 20, 2018
605f83e
Minor fixes for Metalhead
avik-pal Sep 20, 2018
721b7cf
Mistype
avik-pal Sep 20, 2018
3a6a7be
Mistype
avik-pal Sep 20, 2018
8ff6931
Remove dilation kw
avik-pal Sep 20, 2018
644ab7c
Check Hardware support
avik-pal Oct 5, 2018
264e79c
Add proper dimension checks
avik-pal Oct 5, 2018
d3f0ed4
Minor Fixes
avik-pal Oct 5, 2018
b255f67
Create shared threadpool and make suggested changes
avik-pal Oct 15, 2018
af05ad7
Remove version
avik-pal Oct 15, 2018
18f978d
Merge branch 'master' of https://github.com/FluxML/NNlib.jl into NNPACK
avik-pal Oct 15, 2018
9b6b7b0
Add BinaryProvider to Project and Manifest and switch to Sys.CPU_THRE…
avik-pal Oct 15, 2018
1ef5b74
Minor Changes
avik-pal Oct 15, 2018
7dbdf06
Wrong function call fix
avik-pal Oct 15, 2018
1c96773
Remove argument to change nthreads
avik-pal Oct 29, 2018
16ede9a
Make the API consistent with master
avik-pal Oct 29, 2018
8f92d35
Default to NNPACK for Float64 as well
avik-pal Oct 29, 2018
3628b5f
Minor patches
avik-pal Oct 29, 2018
da90b86
Convert to Float64 while returning
avik-pal Oct 29, 2018
137c735
Add crosscor functions
avik-pal Oct 29, 2018
524f35e
Minor Patch
avik-pal Oct 29, 2018
fd2c602
Flip weights while returning
avik-pal Oct 29, 2018
e5e3964
Fixes for proper type
avik-pal Nov 1, 2018
2efbb74
Incorrect force push
avik-pal Nov 1, 2018
da3df3c
Remove short-form
avik-pal Nov 1, 2018
8f8081a
Remove common code
avik-pal Nov 19, 2018
cf02a05
Support maxpool fallback
avik-pal Nov 19, 2018
fba6d4a
conv fallbacks
avik-pal Nov 19, 2018
2495c1b
Tests pass
avik-pal Nov 19, 2018
69b9950
travis test on julia1.0
avik-pal Nov 22, 2018
25dcd3c
threadpool ref
MikeInnes Nov 28, 2018
8b91c9b
rm threadpool arg
MikeInnes Nov 28, 2018
ac3c880
rm float64 methods
MikeInnes Nov 28, 2018
1fa88c0
rm softmax method
MikeInnes Nov 28, 2018
efb32ce
pull out flipweight
MikeInnes Nov 28, 2018
77254d8
float64 fixup
MikeInnes Nov 28, 2018
f283696
grad filter fixup
MikeInnes Nov 28, 2018
838e88a
Remove AbstractArray
avik-pal Dec 1, 2018
08d514f
Modify libnnpack signatures
avik-pal Dec 1, 2018
ef4f22e
Typo
avik-pal Dec 1, 2018
6b50e92
Rebase PR
avik-pal Feb 20, 2019
97f11a8
Add option to disable NNPACK
avik-pal Feb 20, 2019
3c796b7
Comment out
avik-pal Feb 20, 2019
6025166
Typo fix
avik-pal Feb 20, 2019
e2947d8
Clean the code
avik-pal Feb 20, 2019
220c442
Update NNPACK interface to conform to the new NNlib
avik-pal Apr 6, 2019
43f759a
Add performance tests
avik-pal Apr 6, 2019
2591d21
Minor changes as per review
avik-pal Apr 6, 2019
8ced0c0
Lay down the structure for runtime performance check
avik-pal Apr 11, 2019
bc19012
Fix merge conflicts with master
avik-pal Apr 11, 2019
5b35148
Some heuristics to choose threadpool for pooling
avik-pal Apr 11, 2019
47560f1
Add some splatting to fix tests
staticfloat Apr 13, 2019
a30c7a7
Expose usage of NNPACK conv and maxpool operations
avik-pal Apr 13, 2019
018915c
Merge branch 'NNPACK' of github.com:avik-pal/NNlib.jl into NNPACK
avik-pal Apr 13, 2019
c4e8573
NNPACK convolution does not support stride
avik-pal Apr 13, 2019
94d1e00
Add basic heuristics
avik-pal Apr 30, 2019
e4232d7
Support builds for Mac and few other systems
avik-pal Apr 30, 2019
ee86fbb
Fix numerical errors in the tests
avik-pal Apr 30, 2019
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@
*.dll
*~
\#*
deps/usr
deps.jl
*.log
1 change: 1 addition & 0 deletions REQUIRE
@@ -1,3 +1,4 @@
julia 0.7-
Requires
MacroTools
BinaryProvider
36 changes: 36 additions & 0 deletions deps/build.jl
@@ -0,0 +1,36 @@
using BinaryProvider

# Parse some basic command-line arguments
const verbose = "--verbose" in ARGS
const prefix = Prefix(get([a for a in ARGS if a != "--verbose"], 1, joinpath(@__DIR__, "usr")))
products = [
LibraryProduct(prefix, "libnnpack", :libnnpack)
]

# Download binaries from hosted location
bin_prefix = "https://github.com/avik-pal/NNPACKBuilder/releases/download/v0.1"

# Listing of files generated by BinaryBuilder:
download_info = Dict(
Linux(:aarch64, :glibc) => ("$bin_prefix/NNPACK.v2018.6.22.aarch64-linux-gnu.tar.gz", "27037ad8384dc8d9dff79c6dae9d852f8b4aa60dcc84fb605f1856042d125e63"),
Linux(:i686, :glibc) => ("$bin_prefix/NNPACK.v2018.6.22.i686-linux-gnu.tar.gz", "383264bbba19980a097662e4f3364e97ec6ab7e7e792663f8192595cf86a0ebe"),
Linux(:x86_64, :glibc) => ("$bin_prefix/NNPACK.v2018.6.22.x86_64-linux-gnu.tar.gz", "fe1870a89b8d80ffd37d20681c22cdb9b86f18b4539127455d9025a0e6720bba"),
)

# Install unsatisfied or updated dependencies:
unsatisfied = any(!satisfied(p; verbose=verbose) for p in products)
if haskey(download_info, platform_key())
url, tarball_hash = download_info[platform_key()]
if unsatisfied || !isinstalled(url, tarball_hash; prefix=prefix)
# Download and install binaries
install(url, tarball_hash; prefix=prefix, force=true, verbose=verbose)
end

# Write out a deps.jl file that will contain mappings for our products
write_deps_file(joinpath(@__DIR__, "deps.jl"), products)
elseif unsatisfied
# If we don't have a BinaryProvider-compatible .tar.gz to download, complain.
# Alternatively, you could attempt to install from a separate provider,
# build from source or something even more ambitious here.
@warn "Your platform $(triplet(platform_key())) is not supported by this NNPACK build, so NNlib will be built without NNPACK support"
end
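For orientation, a minimal sketch of how this build script is normally driven and what the generated file provides (assuming a supported Linux platform; this is the standard BinaryProvider flow, not part of the diff itself):

using Pkg
Pkg.build("NNlib")   # runs deps/build.jl above and, on success, writes deps/deps.jl

# deps/deps.jl, generated by write_deps_file, defines for each product listed above:
#   libnnpack    -- the path to the downloaded shared library
#   check_deps() -- re-verifies at load time that the products are still satisfied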
4 changes: 4 additions & 0 deletions src/NNlib.jl
@@ -13,4 +13,8 @@ include("linalg.jl")
include("conv.jl")
include("cubroadcast.jl")

if Sys.islinux()
include("nnpack/NNPACK.jl")
end

end # module
27 changes: 27 additions & 0 deletions src/nnpack/NNPACK.jl
@@ -0,0 +1,27 @@
include("libnnpack_types.jl")
include("error.jl")
include("libnnpack.jl")

const depsjl_path = joinpath(dirname(@__FILE__), "..", "..", "deps", "deps.jl")
if !isfile(depsjl_path)
error("NNPACK not installed properly, run Pkg.build(\"NNlib\"), restart Julia and try again")
end
include(depsjl_path)

const nnlib_interface_path = joinpath(dirname(@__FILE__), "nnlib.jl")
@init begin
check_deps()
status = nnp_initialize()
if status == nnp_status_unsupported_hardware
@warn "Hardware is unsupported by NNPACK; falling back to the default NNlib implementations"
else
include(nnlib_interface_path)
end
try
global NNPACK_CPU_THREADS = parse(UInt64, ENV["JULIA_NUM_THREADS"])
catch
@warn "`JULIA_NUM_THREADS` not set; using the NNPACK default of 4 threads"
global NNPACK_CPU_THREADS = UInt64(4)
end
global shared_threadpool = pthreadpool_create(NNPACK_CPU_THREADS)
end
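A hedged usage sketch of what the initialization above enables (hypothetical REPL-style snippet; it assumes nnp_initialize succeeded so the NNlib interface was included, and uses the relu wrapper defined in libnnpack.jl):

x = rand(Float32, 100, 10)
y = similar(x)

# Reuse the shared threadpool created by the @init block instead of
# creating a fresh pthreadpool on every call.
NNlib.nnp_relu_output(x, y; threadpool = NNlib.shared_threadpool)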
83 changes: 83 additions & 0 deletions src/nnpack/error.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
struct NNPACKError <: Exception
code::nnp_status
msg::AbstractString
end

Base.show(io::IO, err::NNPACKError) = print(io, "NNPACKError(code $(err.code), $(err.msg))")

function NNPACKError(status::nnp_status)
msg = "NNPACK STATUS SUCCESS"
if status == nnp_status_invalid_batch_size
msg = "NNPACK STATUS INVALID BATCH SIZE"
elseif status == nnp_status_invalid_channels
msg = "NNPACK STATUS INVALID CHANNELS"
elseif status == nnp_status_invalid_input_channels
msg = "NNPACK STATUS INVALID INPUT CHANNELS"
elseif status == nnp_status_invalid_output_channels
msg = "NNPACK STATUS INVALID OUTPUT CHANNELS"
elseif status == nnp_status_invalid_input_size
msg = "NNPACK STATUS INVALID INPUT SIZE"
elseif status == nnp_status_invalid_input_stride
msg = "NNPACK STATUS INVALID INPUT STRIDE"
elseif status == nnp_status_invalid_input_padding
msg = "NNPACK STATUS INVALID INPUT PADDING"
elseif status == nnp_status_invalid_kernel_size
msg = "NNPACK STATUS INVALID KERNEL SIZE"
elseif status == nnp_status_invalid_pooling_size
msg = "NNPACK STATUS INVALID POOLING SIZE"
elseif status == nnp_status_invalid_pooling_stride
msg = "NNPACK STATUS INVALID POOLING STRIDE"
elseif status == nnp_status_invalid_algorithm
msg = "NNPACK STATUS INVALID ALGORITHM"
elseif status == nnp_status_invalid_transform_strategy
msg = "NNPACK STATUS INVALID TRANSFORM STRATEGY"
elseif status == nnp_status_invalid_output_subsampling
msg = "NNPACK STATUS INVALID OUTPUT SUBSAMPLING"
elseif status == nnp_status_invalid_activation
msg = "NNPACK STATUS INVALID ACTIVATION"
elseif status == nnp_status_invalid_activation_parameters
msg = "NNPACK STATUS INVALID ACTIVATION PARAMETERS"
elseif status == nnp_status_unsupported_input_size
msg = "NNPACK STATUS UNSUPPORTED INPUT SIZE"
elseif status == nnp_status_unsupported_input_stride
msg = "NNPACK STATUS UNSUPPORTED INPUT STRIDE"
elseif status == nnp_status_unsupported_input_padding
msg = "NNPACK STATUS UNSUPPORTED INPUT PADDING"
elseif status == nnp_status_unsupported_kernel_size
msg = "NNPACK STATUS UNSUPPORTED KERNEL SIZE"
elseif status == nnp_status_unsupported_pooling_size
msg = "NNPACK STATUS UNSUPPORTED POOLING SIZE"
elseif status == nnp_status_unsupported_pooling_stride
msg = "NNPACK STATUS UNSUPPORTED POOLING STRIDE"
elseif status == nnp_status_unsupported_algorithm
msg = "NNPACK STATUS UNSUPPORTED ALGORITHM"
elseif status == nnp_status_unsupported_transform_strategy
msg = "NNPACK STATUS UNSUPPORTED TRANSFORM STRATEGY"
elseif status == nnp_status_unsupported_activation
msg = "NNPACK STATUS UNSUPPORTED ACTIVATION"
elseif status == nnp_status_unsupported_activation_parameters
msg = "NNPACK STATUS UNSUPPORTED ACTIVATION PARAMETERS"
elseif status == nnp_status_uninitialized
msg = "NNPACK STATUS UNINITIALIZED"
elseif status == nnp_status_unsupported_hardware
msg = "NNPACK STATUS UNSUPPORTED HARDWARE"
elseif status == nnp_status_out_of_memory
msg = "NNPACK STATUS OUT OF MEMORY"
elseif status == nnp_status_insufficient_buffer
msg = "NNPACK STATUS INSUFFICIENT BUFFER"
elseif status == nnp_status_misaligned_buffer
msg = "NNPACK STATUS MISALIGNED BUFFER"
end
NNPACKError(status, msg)
end

macro check(nnp_func)
quote
local err::nnp_status
err = $(esc(nnp_func))
if err != nnp_status_success
throw(NNPACKError(err))
end
err
end
end
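For illustration, a minimal sketch of how `@check` is meant to be used around the raw ccalls (the wrapper name here is hypothetical; the real wrappers follow in src/nnpack/libnnpack.jl):

function nnp_deinitialize_checked()
    # Throws an NNPACKError carrying the status code if the call does not return nnp_status_success.
    @check ccall((:nnp_deinitialize, libnnpack), nnp_status, ())
end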
145 changes: 145 additions & 0 deletions src/nnpack/libnnpack.jl
@@ -0,0 +1,145 @@
#NOTE: We do the error handling of nnp_initialize while loading NNPACK
function nnp_initialize()
ccall((:nnp_initialize, libnnpack), nnp_status, (),)
end

function nnp_deinitialize()
@check ccall((:nnp_deinitialize, libnnpack), nnp_status, (),)
end

function pthreadpool_create(n = 0)
ccall((:pthreadpool_create, libnnpack), Ptr{Cvoid}, (Csize_t,), n)
end

function nnp_relu_output(batch_size, channels, input, output, negative_slope, threadpool)
@check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool)
end

function nnp_relu_output(x::AbstractArray{Float32,N}, y::AbstractArray{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = nothing) where {N}
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
# Investigate why the channel and batch dims need to be specified like this
nnp_relu_output(prod(size(x)[N-1:N]), prod(size(x)[1:N-2]), x, y, negative_slope, threadpool)
y
end

function nnp_relu_input_gradient(batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
@check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
end

function nnp_relu_input_gradient(x::AbstractArray{Float32,N}, dy::AbstractArray{Float32,N}, dx::AbstractArray{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = nothing) where {N}
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
# Investigate why the channel and batch dims need to be specified like this
nnp_relu_input_gradient(Csize_t(prod(size(x)[N-1:N])), prod(size(x)[1:N-2]), dy, x, dx, negative_slope, threadpool)
dx
end

function nnp_softmax_output(batch_size, channels, input, output, threadpool)
@check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool)
end

function nnp_softmax_output(x::AbstractVecOrMat{Float32}, y::AbstractVecOrMat{Float32}; threadpool = nothing)
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
nnp_softmax_output(ndims(x) == 2 ? size(x, 2) : 1, size(x, 1), x, y, threadpool)
y
end

#FIXME: Output of fully connected not consistent with `kernel * input`
Contributor: Is this still the case? If not, please delete these comments.

Member Author: It is not done. I was focusing on getting the convolutions right first before fixing it.

#NOTE: This is most likely due to NNPACK being row-major. Investigate this.

function nnp_fully_connected_output(batch_size, input_channels, output_channels, input, kernel, output, threadpool, profile)
@check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL)
end

function nnp_fully_connected_output(x::AbstractArray{Float32,2}, w::AbstractArray{Float32,2}, y::AbstractArray{Float32,2}; profile = nothing, threadpool = nothing)
profile = profile == nothing ? nnp_profile() : profile
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
nnp_fully_connected_output(size(x, 2), size(x, 1), size(w, 1), x, w, y, threadpool, profile)
y
end

function nnp_fully_connected_inference_f16f32(input_channels, output_channels, input, kernel, output, threadpool)
@check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
end

nnp_fully_connected_inference_f16f32(x::AbstractVector{Float32}, w::AbstractArray{Float16,2}, y::AbstractVector{Float32}; threadpool = nothing) =
nnp_fully_connected_inference_f16f32(reshape(x, length(x), 1), w, reshape(y, length(y), 1), threadpool = threadpool)

function nnp_fully_connected_inference_f16f32(x::AbstractMatrix{Float32}, w::AbstractArray{Float16,2}, y::AbstractMatrix{Float32}; threadpool = nothing)
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
nnp_fully_connected_inference_f16f32(size(x, 1), size(y, 1), x, w, y, threadpool)
y
end

function nnp_fully_connected_inference(input_channels, output_channels, input, kernel, output, threadpool)
@check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
end

nnp_fully_connected_inference(x::AbstractVector{Float32}, w::AbstractArray{Float32,2}; threadpool = nothing) =
nnp_fully_connected_inference(reshape(x, length(x), 1), w, threadpool = threadpool)

function nnp_fully_connected_inference(x::AbstractMatrix{Float32}, w::AbstractMatrix{Float32}, y::AbstractMatrix{Float32}; threadpool = nothing)
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool)
y
end

function nnp_max_pooling_output(batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
@check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
end

function nnp_max_pooling_output(x::AbstractArray{Float32,4}, y::AbstractArray{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = nothing)
input_size = nnp_size(Csize_t.((size(x, 1), size(x, 2)))...)
pooling_size = nnp_size(Csize_t.(kernel)...)
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
pooling_stride = nnp_size(Csize_t.(padtuple(x, stride))...)
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
nnp_max_pooling_output(size(x, 4), size(x, 3), input_size, input_padding, pooling_size, pooling_stride, x, y, threadpool)
y
end

#TODO: Add wrapper for convolution inference
Contributor: If this is done, please remove this comment.

Member Author: It is not done.


function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
@check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
end

function nnp_convolution_input_gradient(dx::AbstractArray{Float32,4}, x::AbstractArray{Float32,4}, dy::AbstractArray{Float32,4}, w::AbstractArray{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = nothing, profile = nothing)
input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
profile = profile == nothing ? nnp_profile() : profile
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
nnp_convolution_input_gradient(UInt32(algo), size(x,4), size(x,3), size(w,4), input_size, input_padding, kernel_size, dy, w, dx, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
dx
end

function nnp_convolution_kernel_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
@check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
end

function nnp_convolution_kernel_gradient(dw::AbstractArray{Float32,4}, x::AbstractArray{Float32,4}, dy::AbstractArray{Float32,4}, w::AbstractArray{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = nothing, profile = nothing)
input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
profile = profile == nothing ? nnp_profile() : profile
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
nnp_convolution_kernel_gradient(UInt32(algo), size(x,4), size(x,3), size(w,4), input_size, input_padding, kernel_size, x, dy, dw, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
dw
end

function nnp_convolution_output(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
@check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
end

function nnp_convolution_output(y::AbstractArray{Float32,4}, x::AbstractArray{Float32,4}, w::AbstractArray{Float32,4}, b::AbstractArray{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = nothing, profile = nothing)
input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
profile = profile == nothing ? nnp_profile() : profile
threadpool = threadpool === nothing ? pthreadpool_create() : threadpool
workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer
nnp_convolution_output(UInt32(algo), size(x,4), size(x,3), size(w,4), input_size, input_padding, kernel_size, x, w, b, y, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile)
y
end
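Finally, a short usage sketch for the pooling wrapper above (assumes NNPACK loaded successfully; the sizes are illustrative, and the caller is responsible for allocating an output array of the right shape since the wrapper does not allocate):

x = rand(Float32, 12, 12, 3, 2)   # W x H x C x N, NNlib's usual memory layout
y = zeros(Float32, 6, 6, 3, 2)    # output of a 2x2 window with stride 2
nnp_max_pooling_output(x, y, (2, 2); padding = (0, 0), stride = (2, 2))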