diff --git a/Project.toml b/Project.toml
index 270353467..a7894dcd4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.12.163"
+version = "0.12.164"
 
 
 [deps]
diff --git a/benchmark/looptests.jl b/benchmark/looptests.jl
index a5f417afb..feb9a09ce 100644
--- a/benchmark/looptests.jl
+++ b/benchmark/looptests.jl
@@ -76,12 +76,12 @@ function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
   end
 end
 gemmavx!(𝐂, 𝐀, 𝐁) = @turbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2)
-    𝐂ₘₙ = zero(eltype(𝐂))
-    for k ∈ indices((𝐀, 𝐁), (2, 1))
-      𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n]
-    end
-    𝐂[m, n] = 𝐂ₘₙ
+  𝐂ₘₙ = zero(eltype(𝐂))
+  for k ∈ indices((𝐀, 𝐁), (2, 1))
+    𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n]
   end
+  𝐂[m, n] = 𝐂ₘₙ
+end
 function gemmavx!(
   Cc::AbstractMatrix{Complex{T}},
   Ac::AbstractMatrix{Complex{T}},
@@ -102,12 +102,12 @@ function gemmavx!(
   end
 end
 gemmavxt!(𝐂, 𝐀, 𝐁) = @tturbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2)
-    𝐂ₘₙ = zero(eltype(𝐂))
-    for k ∈ indices((𝐀, 𝐁), (2, 1))
-      𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n]
-    end
-    𝐂[m, n] = 𝐂ₘₙ
+  𝐂ₘₙ = zero(eltype(𝐂))
+  for k ∈ indices((𝐀, 𝐁), (2, 1))
+    𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n]
   end
+  𝐂[m, n] = 𝐂ₘₙ
+end
 function gemmavxt!(
   Cc::AbstractMatrix{Complex{T}},
   Ac::AbstractMatrix{Complex{T}},
@@ -204,11 +204,11 @@ function jdot3avx(x, A, y)
   s
 end
 jvexp!(b, a) = @inbounds for i โˆˆ eachindex(a)
-    b[i] = exp(a[i])
-  end
+  b[i] = exp(a[i])
+end
 jvexpavx!(b, a) = @turbo for i โˆˆ eachindex(a)
-    b[i] = exp(a[i])
-  end
+  b[i] = exp(a[i])
+end
 function jsvexp(a)
   s = zero(eltype(a))
   @inbounds for i โˆˆ eachindex(a)
@@ -242,12 +242,12 @@ function jgemv!(𝐲, 𝐀ᵀ::Adjoint, 𝐱)
   end
 end
 jgemvavx!(𝐲, 𝐀, 𝐱) = @turbo for i ∈ eachindex(𝐲)
-    𝐲ᵢ = zero(eltype(𝐲))
-    for j ∈ eachindex(𝐱)
-      𝐲ᵢ += 𝐀[i, j] * 𝐱[j]
-    end
-    𝐲[i] = 𝐲ᵢ
+  𝐲ᵢ = zero(eltype(𝐲))
+  for j ∈ eachindex(𝐱)
+    𝐲ᵢ += 𝐀[i, j] * 𝐱[j]
   end
+  𝐲[i] = 𝐲ᵢ
+end
 function jvar!(𝐬², 𝐀, x̄)
   @. s² = zero(eltype(𝐬²))
   @inbounds @fastmath for i ∈ 1:size(𝐀, 2)
@@ -258,14 +258,14 @@ function jvar!(𝐬², 𝐀, x̄)
   end
 end
 jvaravx!(𝐬², 𝐀, x̄) = @turbo for j ∈ eachindex(𝐬²)
-    𝐬²ⱼ = zero(eltype(𝐬²))
-    x̄ⱼ = x̄[j]
-    for i ∈ 1:size(𝐀, 2)
-      δ = 𝐀[j, i] - x̄ⱼ
-      𝐬²ⱼ += δ * δ
-    end
-    𝐬²[j] = 𝐬²ⱼ
+  𝐬²ⱼ = zero(eltype(𝐬²))
+  x̄ⱼ = x̄[j]
+  for i ∈ 1:size(𝐀, 2)
+    δ = 𝐀[j, i] - x̄ⱼ
+    𝐬²ⱼ += δ * δ
   end
+  𝐬²[j] = 𝐬²ⱼ
+end
 japlucBc!(D, a, B, c) = @. D = a + B * c';
 japlucBcavx!(D, a, B, c) = @turbo @. D = a + B * c';
 
diff --git a/benchmark/plotbenchmarks.jl b/benchmark/plotbenchmarks.jl
index e9f984505..125f435cc 100644
--- a/benchmark/plotbenchmarks.jl
+++ b/benchmark/plotbenchmarks.jl
@@ -29,7 +29,8 @@ else
   # const COLOR_MAP = Dict{String,RGB{Float64}}()
   # const COLOR_MAP = Dict{String,RGB{Colors.N0f8}}()
   const COLOR_MAP64 = Dict{String,RGB{Float64}}()
-  getcolor(s::String) = get!(COLOR_MAP64, s) do
+  getcolor(s::String) =
+    get!(COLOR_MAP64, s) do
       COLORS[length(COLOR_MAP64)+1]
     end
   replace_and(str) = replace(str, '&' => "with")
diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl
index 00dfc5565..26227f694 100644
--- a/ext/ForwardDiffExt.jl
+++ b/ext/ForwardDiffExt.jl
@@ -157,8 +157,8 @@ end
   end
 end
 
-@generated function ifelse(
-  m::AbstractMask,
+@generated function _ifelse(
+  m::Union{AbstractMask,VecUnroll{<:Any,<:Any,Bit,<:AbstractMask}},
   x::ForwardDiff.Dual{TAG,V,P},
   y::ForwardDiff.Dual{TAG,V,P}
 ) where {TAG,V,P}
@@ -171,8 +171,8 @@ end
     ForwardDiff.Dual{$TAG}(z, ForwardDiff.Partials(p))
   end
 end
-@generated function ifelse(
-  m::AbstractMask,
+@generated function _ifelse(
+  m::Union{AbstractMask,VecUnroll{<:Any,<:Any,Bit,<:AbstractMask}},
   x::Number,
   y::ForwardDiff.Dual{TAG,V,P}
 ) where {TAG,V,P}
@@ -184,8 +184,8 @@ end
     ForwardDiff.Dual{$TAG}(z, ForwardDiff.Partials(p))
   end
 end
-@generated function ifelse(
-  m::AbstractMask,
+@generated function _ifelse(
+  m::Union{AbstractMask,VecUnroll{<:Any,<:Any,Bit,<:AbstractMask}},
   x::ForwardDiff.Dual{TAG,V,P},
   y::Number
 ) where {TAG,V,P}
@@ -197,6 +197,29 @@ end
     ForwardDiff.Dual{$TAG}(z, ForwardDiff.Partials(p))
   end
 end
+@inline ifelse(m::AbstractMask, x::ForwardDiff.Dual, y::Number) =
+  _ifelse(m, x, y)
+@inline ifelse(m::AbstractMask, x::ForwardDiff.Dual, y::ForwardDiff.Dual) =
+  _ifelse(m, x, y)
+@inline ifelse(m::AbstractMask, y::Number, x::ForwardDiff.Dual) =
+  _ifelse(m, y, x)
+
+@inline ifelse(
+  m::VecUnroll{<:Any,<:Any,Bit,<:AbstractMask},
+  x::ForwardDiff.Dual,
+  y::Number
+) = _ifelse(m, x, y)
+@inline ifelse(
+  m::VecUnroll{<:Any,<:Any,Bit,<:AbstractMask},
+  x::ForwardDiff.Dual,
+  y::ForwardDiff.Dual
+) = _ifelse(m, x, y)
+@inline ifelse(
+  m::VecUnroll{<:Any,<:Any,Bit,<:AbstractMask},
+  y::Number,
+  x::ForwardDiff.Dual
+) = _ifelse(m, y, x)
+
 @inline function SLEEFPirates.softplus(x::ForwardDiff.Dual{TAG}) where {TAG}
   val = ForwardDiff.value(x)
   expx = exp(val)
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 7856fcc6c..d8033d307 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -108,7 +108,8 @@ using VectorizationBase:
   contract_or,
   collapse_or,
   max_mask,
-  maybestaticsize,zero_mask
+  maybestaticsize,
+  zero_mask
 
 using HostCPUFeatures:
   pick_vector_width,
diff --git a/src/codegen/split_loops.jl b/src/codegen/split_loops.jl
index af1eafba8..95978a0f5 100644
--- a/src/codegen/split_loops.jl
+++ b/src/codegen/split_loops.jl
@@ -76,7 +76,8 @@ function add_operation!(
   opnew
 end
 
-append_if_included!(vnew, vold, included) = for (i, v) ∈ vold
+append_if_included!(vnew, vold, included) =
+  for (i, v) ∈ vold
     id = included[i]
     iszero(id) || push!(vnew, (id, v))
   end
diff --git a/src/modeling/costs.jl b/src/modeling/costs.jl
index 8c2c2bbb5..0d2b40771 100644
--- a/src/modeling/costs.jl
+++ b/src/modeling/costs.jl
@@ -13,7 +13,8 @@ struct Instruction
 end
 # lower(instr::Instruction) = Expr(:(.), instr.mod, QuoteNode(instr.instr))
 # Base.convert(::Type{Expr}, instr::Instruction) = Expr(:(.), instr.mod, QuoteNode(instr.instr))
-callexpr(instr::Instruction) = if instr.mod === :LoopVectorization
+callexpr(instr::Instruction) =
+  if instr.mod === :LoopVectorization
     Expr(:call, lv(instr.instr))
   else#if instr.mod === :Main
     Expr(:call, instr.instr)
@@ -563,7 +564,8 @@ function reduction_to_single_vector(x::Float64)
     throw("Reduction not found.")
   end
 end
-reduce_to_onevecunroll(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
+reduce_to_onevecunroll(x::Float64) =
+  if x == ADDITIVE_IN_REDUCTIONS
     :+
   elseif x == MULTIPLICATIVE_IN_REDUCTIONS
     :*
@@ -578,7 +580,8 @@ reduce_to_onevecunroll(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
   else
     throw("Reduction not found.")
   end
-reduce_number_of_vectors(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
+reduce_number_of_vectors(x::Float64) =
+  if x == ADDITIVE_IN_REDUCTIONS
     :contract_add
   elseif x == MULTIPLICATIVE_IN_REDUCTIONS
     :contract_mul
@@ -593,7 +596,8 @@ reduce_number_of_vectors(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
   else
     throw("Reduction not found.")
   end
-reduction_to_scalar(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
+reduction_to_scalar(x::Float64) =
+  if x == ADDITIVE_IN_REDUCTIONS
     :vsum
   elseif x == MULTIPLICATIVE_IN_REDUCTIONS
     :vprod
diff --git a/src/predicates.jl b/src/predicates.jl
index 69af70c8b..48291974d 100644
--- a/src/predicates.jl
+++ b/src/predicates.jl
@@ -11,7 +11,7 @@ isscopedname(:(Base.Checked.checked_add), (:Base, :Checked), :checked_add)
 function isscopedname(ex, modpath, name::Symbol)
   isexpr(ex, :(.), 2) &&
     (a = ex.args[2]; isa(a, QuoteNode) && a.value === name) &&
-            hasscope(ex.args[1], modpath)
+    hasscope(ex.args[1], modpath)
 end
 hasscope(modex, mod::Symbol) = modex === mod
 hasscope(modex, mod::Tuple{Symbol}) = hasscope(modex, mod[1])
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
index 000ffc208..d044fbeb3 100644
--- a/src/reconstruct_loopset.jl
+++ b/src/reconstruct_loopset.jl
@@ -27,7 +27,7 @@ Base.promote_rule(
   ::Type{UpperBoundedInteger{N,T}},
   ::Type{T}
 ) where {N,T<:Base.BitInteger} = T
-Base.convert(::Type{T}, i::UpperBoundedInteger) where {T<:Number} =
+Base.convert(::Type{T}, i::UpperBoundedInteger) where {T<:Integer} =
   convert(T, i.i)
 Base.convert(
   ::Type{UpperBoundedInteger{N,T}},
diff --git a/src/simdfunctionals/mapreduce.jl b/src/simdfunctionals/mapreduce.jl
index 01b93014e..47393abcc 100644
--- a/src/simdfunctionals/mapreduce.jl
+++ b/src/simdfunctionals/mapreduce.jl
@@ -115,7 +115,8 @@ end
 Vectorized version of `sum`. Providing a function as the first argument
 will apply the function to each element of `A` before summing.
 """
-@inline vsum(f::F, A::AbstractArray{T}) where {F,T<:NativeTypes} = vmapreduce(f, +, A)
+@inline vsum(f::F, A::AbstractArray{T}) where {F,T<:NativeTypes} =
+  vmapreduce(f, +, A)
 @inline vsum(A::AbstractArray{T}) where {T<:NativeTypes} = vsum(identity, A)
 
 length_one_axis(::Base.OneTo) = Base.OneTo(1)
diff --git a/test/Project.toml b/test/Project.toml
index e57e06bb3..dabb95f63 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,8 +1,10 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -12,4 +14,5 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
diff --git a/test/forwarddiffext.jl b/test/forwarddiffext.jl
new file mode 100644
index 000000000..90b167e33
--- /dev/null
+++ b/test/forwarddiffext.jl
@@ -0,0 +1,37 @@
+
+using NNlib, LoopVectorization, VectorizationBase, ForwardDiff, Test
+randnvec() = Vec(ntuple(_ -> randn(), pick_vector_width(Float64))...)
+
+tovec(x::Vec{W,T}) where {W,T} = T[Tuple(x)...]
+tovec(x::VecUnroll) = reduce(vcat, map(tovec, VectorizationBase.data(x)))
+function tovec(x::ForwardDiff.Dual{T,V,N}) where {T,V,N}
+  v = tovec(ForwardDiff.value(x))
+  dv = map(tovec, Tuple(ForwardDiff.partials(x)))
+  D = ForwardDiff.Dual{T,eltype(v),N}
+  ret = Vector{D}(undef, length(v))
+  for i in eachindex(v)
+    ret[i] = ForwardDiff.Dual(v[i], map(Base.Fix2(Base.getindex, i), dv)...)
+  end
+  return ret
+end
+
+
+vx0 = randnvec()
+vx1 = randnvec()
+vx2 = randnvec()
+vx3 = randnvec()
+vx4 = randnvec()
+vx5 = randnvec()
+
+vd0 = ForwardDiff.Dual(vx0, vx1, vx2, vx3, vx4, vx5)
+
+vu0 = VecUnroll((vx0, vx1))
+vu1 = VecUnroll((vx2, vx3))
+vu2 = VecUnroll((vx4, vx5))
+
+vud = ForwardDiff.Dual(vu0, vu1, vu2)
+
+@test reinterpret(Float64, tovec(NNlib.leakyrelu(vd0))) ≈
+      reinterpret(Float64, NNlib.leakyrelu.(tovec(vd0)))
+@test reinterpret(Float64, tovec(NNlib.leakyrelu(vud))) ≈
+      reinterpret(Float64, NNlib.leakyrelu.(tovec(vud)))
diff --git a/test/grouptests.jl b/test/grouptests.jl
index e97a629b2..7e7c78908 100644
--- a/test/grouptests.jl
+++ b/test/grouptests.jl
@@ -116,6 +116,7 @@ const START_TIME = time()
     Pkg.activate(joinpath(precompiledir, "LVUser"))
     @time include(joinpath(precompiledir, "precompile.jl"))
     Pkg.activate(cproj)
+    @time include("forwarddiffext.jl")
   end
 
 end