diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl
index b9bbb284..eabb414f 100644
--- a/src/matrix_multiply.jl
+++ b/src/matrix_multiply.jl
@@ -16,10 +16,24 @@ import LinearAlgebra: BlasFloat, matprod, mul!
 
 # Implementations
 
+function matrix_vector_quote(sa)
+    q = Expr(:block)
+    exprs = [Symbol(:x_, k) for k ∈ 1:sa[1]]
+    for j ∈ 1:sa[2]
+        for k ∈ 1:sa[1]
+            call = isone(j) ? :(a[$(LinearIndices(sa)[k, j])]*b[$j]) : :(muladd(a[$(LinearIndices(sa)[k, j])], b[$j], $(exprs[k])))
+            push!(q.args, :($(exprs[k]) = $call))
+        end
+    end
+    q, exprs
+end
+
 @generated function _mul(::Size{sa}, a::StaticMatrix{<:Any, <:Any, Ta}, b::AbstractVector{Tb}) where {sa, Ta, Tb}
     if sa[2] != 0
-        exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
+        # [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
+        q, exprs = matrix_vector_quote(sa)
     else
+        q = nothing
         exprs = [:(zero(T)) for k = 1:sa[1]]
     end
 
@@ -29,6 +43,7 @@ import LinearAlgebra: BlasFloat, matprod, mul!
             throw(DimensionMismatch("Tried to multiply arrays of size $sa and $(size(b))"))
         end
         T = promote_op(matprod,Ta,Tb)
+        $q
         @inbounds return similar_type(b, T, Size(sa[1]))(tuple($(exprs...)))
     end
 end
@@ -39,14 +54,17 @@ end
     end
 
     if sa[2] != 0
-        exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
+        q, exprs = matrix_vector_quote(sa)
+        # exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
     else
+        q = nothing
         exprs = [:(zero(T)) for k = 1:sa[1]]
     end
 
     return quote
         @_inline_meta
         T = promote_op(matprod,Ta,Tb)
+        $q
         @inbounds return similar_type(b, T, Size(sa[1]))(tuple($(exprs...)))
     end
 end
@@ -71,7 +89,7 @@ end
             @_inline_meta
             return mul_unrolled(Sa, Sb, a, b)
         end
-    elseif sa[1] <= 14 && sa[2] <= 14 && sb[2] <= 14
+    elseif sa[1] <= 12 && sa[2] <= 12 && sb[2] <= 12
         return quote
             @_inline_meta
             return mul_unrolled_chunks(Sa, Sb, a, b)
@@ -104,7 +122,7 @@ end
             @_inline_meta
             return mul_unrolled(Sa, Sb, a, b)
         end
-    elseif sa[1] <= 14 && sa[2] <= 14 && sb[2] <= 14
+    elseif sa[1] <= 12 && sa[2] <= 12 && sb[2] <= 12
         return quote
             @_inline_meta
             return similar_type(a, T, $S)(mul_unrolled_chunks(Sa, Sb, a, b))
@@ -125,14 +143,30 @@ end
     S = Size(sa[1], sb[2])
 
     if sa[2] != 0
-        exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k1, j])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[2]]) for k1 = 1:sa[1], k2 = 1:sb[2]]
+        # exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k1, j])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[2]]) for k1 = 1:sa[1], k2 = 1:sb[2]]
+        exprs = [Symbol(:C_,k1,:_,k2) for k1 = 1:sa[1], k2 = 1:sb[2]]
+        q = Expr(:block)
+        for k2 in 1:sb[2]
+            for k1 in 1:sa[1]
+                push!(q.args, :($(exprs[k1,k2]) = a[$(LinearIndices(sa)[k1, 1])]*b[$(LinearIndices(sb)[1, k2])]))
+            end
+        end
+        for j in 2:sb[1]
+            for k2 in 1:sb[2]
+                for k1 in 1:sa[1]
+                    push!(q.args, :($(exprs[k1,k2]) = muladd(a[$(LinearIndices(sa)[k1, j])], b[$(LinearIndices(sb)[j, k2])], $(exprs[k1,k2]))))
+                end
+            end
+        end
     else
+        q = nothing
         exprs = [:(zero(T)) for k1 = 1:sa[1], k2 = 1:sb[2]]
     end
 
     return quote
         @_inline_meta
         T = promote_op(matprod,Ta,Tb)
+        $q
         @inbounds return similar_type(a, T, $S)(tuple($(exprs...)))
     end
 end
@@ -145,18 +179,44 @@ end
 
     S = Size(sa[1], sb[2])
 
-    tmps = [Symbol("tmp_$(k1)_$(k2)") for k1 = 1:sa[1], k2 = 1:sb[2]]
-    exprs_init = [:($(tmps[k1,k2])  = a[$k1] * b[1 + $((k2-1) * sb[1])]) for k1 = 1:sa[1], k2 = 1:sb[2]]
-    exprs_loop = [:($(tmps[k1,k2]) += a[$(k1-sa[1]) + $(sa[1])*j] * b[j + $((k2-1) * sb[1])]) for k1 = 1:sa[1], k2 = 1:sb[2]]
-
+    # optimal for AVX2 with `Float64
+    # AVX512 would want something more like 16x14 or 24x9 with `Float64`
+    M_r, N_r = 8, 6
+    n = 0
+    M, K = sa
+    N = sb[2]
+    q = Expr(:block)
+    atemps = [Symbol(:a_, k1) for k1 = 1:M]
+    tmps = [Symbol("tmp_$(k1)_$(k2)") for k1 = 1:M, k2 = 1:N]
+    while n < N
+        nu = min(N, n + N_r)
+        nrange = n+1:nu
+        m = 0
+        while m < M
+            mu = min(M, m + M_r)
+            mrange = m+1:mu
+            
+            atemps_init = [:($(atemps[k1]) = a[$k1]) for k1 = mrange]
+            exprs_init = [:($(tmps[k1,k2])  = $(atemps[k1]) * b[$(1 + (k2-1) * sb[1])]) for k1 = mrange, k2 = nrange]
+            atemps_loop_init = [:($(atemps[k1]) = a[$(k1-sa[1]) + $(sa[1])*j]) for k1 = mrange]
+            exprs_loop = [:($(tmps[k1,k2]) = muladd($(atemps[k1]), b[j + $((k2-1) * sb[1])], $(tmps[k1,k2]))) for k1 = mrange, k2 = nrange]
+            qblock = quote
+                @inbounds $(Expr(:block, atemps_init...))
+                @inbounds $(Expr(:block, exprs_init...))
+                for j = 2:$(sa[2])
+                    @inbounds $(Expr(:block, atemps_loop_init...))
+                    @inbounds $(Expr(:block, exprs_loop...))
+                end
+            end
+            push!(q.args, qblock)
+            m = mu
+        end
+        n = nu
+    end
     return quote
         @_inline_meta
         T = promote_op(matprod,Ta,Tb)
-
-        @inbounds $(Expr(:block, exprs_init...))
-        for j = 2:$(sa[2])
-            @inbounds $(Expr(:block, exprs_loop...))
-        end
+        $q
         @inbounds return similar_type(a, T, $S)(tuple($(tmps...)))
     end
 end
@@ -170,22 +230,43 @@ end
 
     S = Size(sa[1], sb[2])
 
-    # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than (possibly) a mutable type. Avoids allocation == faster
-    tmp_type_in = :(SVector{$(sb[1]), T})
-    tmp_type_out = :(SVector{$(sa[1]), T})
-    vect_exprs = [:($(Symbol("tmp_$k2"))::$tmp_type_out = partly_unrolled_multiply(TSize(a), TSize($(sb[1])), a,
-                    $(Expr(:call, tmp_type_in, [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))::$tmp_type_out)
-                  for k2 = 1:sb[2]]
+    # optimal for AVX2 with `Float64
+    # AVX512 would want something more like 16x14 or 24x9 with `Float64`
+    M_r, N_r = 8, 6
+    n = 0
+    M, K = sa
+    N = sb[2]
+    q = Expr(:block)
+    atemps = [Symbol(:a_, k1) for k1 = 1:M]
+    tmps = [Symbol("tmp_$(k1)_$(k2)") for k1 = 1:M, k2 = 1:N]
+    while n < N
+        nu = min(N, n + N_r)
+        nrange = n+1:nu
+        m = 0
+        while m < M
+            mu = min(M, m + M_r)
+            mrange = m+1:mu
 
-    exprs = [:($(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]]
+            atemps_init = [:($(atemps[k1]) = a[$k1]) for k1 = mrange]
+            exprs_init = [:($(tmps[k1,k2])  = $(atemps[k1]) * b[$(1 + (k2-1) * sb[1])]) for k1 = mrange, k2 = nrange]
+            push!(q.args, :(@inbounds $(Expr(:block, atemps_init...))))
+            push!(q.args, :(@inbounds $(Expr(:block, exprs_init...))))
 
+            for j in 2:K
+                atemps_loop_init = [:($(atemps[k1]) = a[$(LinearIndices(sa)[k1,j])]) for k1 = mrange]
+                exprs_loop = [:($(tmps[k1,k2]) = muladd($(atemps[k1]), b[$(LinearIndices(sb)[j,k2])], $(tmps[k1,k2]))) for k1 = mrange, k2 = nrange]
+                push!(q.args, :(@inbounds $(Expr(:block, atemps_loop_init...))))
+                push!(q.args, :(@inbounds $(Expr(:block, exprs_loop...))))
+            end
+            m = mu
+        end
+        n = nu
+    end
     return quote
         @_inline_meta
         T = promote_op(matprod,Ta,Tb)
-        $(Expr(:block,
-            vect_exprs...,
-            :(@inbounds return similar_type(a, T, $S)(tuple($(exprs...))))
-        ))
+        $q
+        @inbounds return similar_type(a, T, $S)(tuple($(tmps...)))
     end
 end