/*******************************************************************************
* Copyright 2019 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "common_f32.hpp"
#include "jit_generator.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

jit_sse41_kernel_sgemm_kern::jit_sse41_kernel_sgemm_kern() :
    jit_generator(nullptr, F32_COMPUTE_KERNEL_CODE_SIZE) {

// Symbolic register names for the generated kernel. Each macro expands to a
// register identifier that is passed to the emitter calls below. Two layouts
// are provided, selected by _WIN32; the scratch-register assignments are the
// same in both, only the incoming-argument registers and stack slots differ.
// NOTE(review): the incoming-register choices presumably follow the System V
// AMD64 and Microsoft x64 calling conventions respectively -- confirm against
// the kernel's declared signature.
#ifndef _WIN32

// M, N and K hold addresses on entry: the kernel replaces each register with
// the qword loaded through it (mov(M, qword[M]) etc.).
#define M   rdi
#define N   rsi
#define K   rdx
// A and B arrive in registers and are biased by +128 at kernel start
// (sub(A, -128) / sub(B, -128)), which is why the loads that follow use
// negative displacements such as [A-0x80] and [BO-0x80].
#define A   r8
#define B   r9
// C and LDC are not read from their registers on entry: both are loaded from
// the stack slots OLD_C / OLD_LDC right after the preamble, so whatever rcx
// held at entry is overwritten before being read in the visible code.
// LDC is then shifted left by 2 (multiplied by 4) before use as a stride.
// NOTE(review): the factor 4 presumably converts an element count to bytes
// for 32-bit floats -- confirm.
#define C   rcx
#define LDC r10

// AA: set to K*0x20 + A (K*0x10 + A in the 4-row path) and advanced by 8 per
// unrolled iteration; in the visible code it is used only as a prefetcht0
// address.
#define AA  r15
// I: initialised from N and counted down in steps of 4; bits 0x2 and 0x1 are
// tested afterwards for the remainder.
#define I   r11
// J: initialised from M and counted down in steps of 8; bit 0x4 is tested
// afterwards for the remainder.
#define J   r12
// H: inner loop counter derived from K (K >> 2 for the unrolled loops,
// K & 3 for the remainder loops).
#define H   rax
// AO / BO: running pointers, (re)initialised from A and B respectively and
// advanced inside the inner loops.
#define AO  rbx
#define BO  rbp
// CO1 / CO2: output pointers. CO1 starts from C; CO2 is set to CO1 + LDC*2.
#define CO1 r13
#define CO2 r14

// Address expressions (for use inside ptr[...]) of the stack-passed
// arguments. They reference the local variable `stacksize`, which is assigned
// from get_size_of_abi_save_regs() right after preamble(), so they may only
// be used after that point.
// NOTE(review): the leading constant presumably skips the return address --
// confirm against preamble()'s stack layout.
#define OLD_C       8+stacksize+rsp
#define OLD_LDC     16+stacksize+rsp

#else

// Windows layout. M, N and K hold addresses on entry and are replaced by the
// qword loaded through them, exactly as in the non-Windows layout.
#define M   rcx
#define N   rdx
#define K   r8
// A and B do not arrive in registers here: rdi / rsi are loaded from the
// stack slots OLD_A / OLD_B right after the preamble, then biased by +128.
#define A   rdi
#define B   rsi
// C and LDC are loaded from OLD_C / OLD_LDC; whatever r9 held at entry is
// overwritten before being read in the visible code. LDC is then shifted
// left by 2 before use as a stride.
#define C   r9
#define LDC r10
// Scratch registers: same roles as in the non-Windows layout (AA prefetch
// address, I / J outer counters from N / M, H inner counter from K, AO / BO
// running input pointers, CO1 / CO2 output pointers).
#define AA  r15
#define I   r11
#define J   r12
#define H   rax
#define AO  rbx
#define BO  rbp
#define CO1 r13
#define CO2 r14

// Address expressions (for use inside ptr[...]) of the stack-passed
// arguments; they depend on the local `stacksize` like their non-Windows
// counterparts.
// NOTE(review): the base offset 40 presumably covers the return address plus
// the 32-byte shadow space of the Microsoft x64 convention -- confirm.
#define OLD_A       40+stacksize+rsp
#define OLD_B       48+stacksize+rsp
#define OLD_C       56+stacksize+rsp
#define OLD_LDC     64+stacksize+rsp

#endif

inLocalLabel();
{

Xbyak::Label l101c;
Xbyak::Label l1114;
Xbyak::Label l1130;
Xbyak::Label l1228;
Xbyak::Label l1234;
Xbyak::Label l1280;
Xbyak::Label l130c;
Xbyak::Label l136c;
Xbyak::Label l1468;
Xbyak::Label l1478;
Xbyak::Label l1574;
Xbyak::Label l1580;
Xbyak::Label l15d0;
Xbyak::Label l1624;
Xbyak::Label l1684;
Xbyak::Label l1780;
Xbyak::Label l178c;
Xbyak::Label l1888;
Xbyak::Label l1894;
Xbyak::Label l18e4;
Xbyak::Label l1928;
Xbyak::Label l192c;
Xbyak::Label l195c;
Xbyak::Label l19b0;
Xbyak::Label l1aac;
Xbyak::Label l1ac8;
Xbyak::Label l1bc4;
Xbyak::Label l1bd0;
Xbyak::Label l1c20;
Xbyak::Label l1cb0;
Xbyak::Label l1d14;
Xbyak::Label l1e14;
Xbyak::Label l1e24;
Xbyak::Label l1f24;
Xbyak::Label l1f30;
Xbyak::Label l1f80;
Xbyak::Label l1fd4;
Xbyak::Label l2038;
Xbyak::Label l2138;
Xbyak::Label l2144;
Xbyak::Label l2244;
Xbyak::Label l2250;
Xbyak::Label l22a0;
Xbyak::Label l22e4;
Xbyak::Label l22e8;
Xbyak::Label l2318;
Xbyak::Label l236c;
Xbyak::Label l2468;
Xbyak::Label l2480;
Xbyak::Label l257c;
Xbyak::Label l2588;
Xbyak::Label l25d8;
Xbyak::Label l266c;
Xbyak::Label l26d0;
Xbyak::Label l27c;
Xbyak::Label l27d0;
Xbyak::Label l27e0;
Xbyak::Label l28e0;
Xbyak::Label l28ec;
Xbyak::Label l293c;
Xbyak::Label l298;
Xbyak::Label l2994;
Xbyak::Label l29f8;
Xbyak::Label l2af8;
Xbyak::Label l2b04;
Xbyak::Label l2c04;
Xbyak::Label l2c10;
Xbyak::Label l2c60;
Xbyak::Label l2ca4;
Xbyak::Label l2ca8;
Xbyak::Label l444;
Xbyak::Label l450;
Xbyak::Label l4bc;
Xbyak::Label l50;
Xbyak::Label l5c0;
Xbyak::Label l62c;
Xbyak::Label l74;
Xbyak::Label l7dc;
Xbyak::Label l7ec;
Xbyak::Label l99c;
Xbyak::Label l9a8;
Xbyak::Label la18;
Xbyak::Label lab4;
Xbyak::Label lb20;
Xbyak::Label lcd0;
Xbyak::Label lcdc;
Xbyak::Label ld0;
Xbyak::Label le8c;
Xbyak::Label le98;
Xbyak::Label lf08;
Xbyak::Label lf84;
Xbyak::Label lf98;
Xbyak::Label lfc8;

    preamble();
    auto stacksize = get_size_of_abi_save_regs();
#ifdef _WIN32
    mov(A, ptr[OLD_A]);
    mov(B, ptr[OLD_B]);
#endif
    mov(C, ptr[OLD_C]);
    mov(LDC, ptr[OLD_LDC]);

    mov(M, qword[M]);
    mov(N, qword[N]);
    mov(K, qword[K]);
    shl(LDC, 0x2);
    sub(A, -128);
    sub(B, -128);
    mov(J, M);
    cmp(J, 0x8);
    jl(lf98, T_NEAR);
    align(4);

L(l50);
    mov(AA, K);
    imul(AA, AA, 0x20);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x20);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l5c0, T_NEAR);
    align(4);

L(l74);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    movups(xmm1, xword[A-0x70]);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x60]);
    xorps(xmm10, xmm10);
    movups(xmm3, xword[A-0x50]);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l444, T_NEAR);
    sub(H, 0x1e);
    jle(l27c, T_NEAR);
    align(4);

L(ld0);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -64);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(ld0, T_NEAR);
    align(4);

L(l27c);
    prefetcht0(byte[CO1+0x1c]);
    prefetcht0(byte[CO1+LDC*1+0x1c]);
    prefetcht0(byte[CO2+0x1c]);
    prefetcht0(byte[CO2+LDC*1+0x1c]);
    add(H, 0x1e);
    align(4);

L(l298);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -64);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l298, T_NEAR);
    align(4);

L(l444);
    mov(H, K);
    and_(H, 0x3);
    je(l4bc, T_NEAR);
    align(4);

L(l450);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x50]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    sub(AO, -32);
    sub(BO, -16);
    dec(H);
    jg(l450, T_NEAR);
    align(4);

L(l4bc);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    movaps(xmm0, xmm12);
    unpcklpd(xmm12, xmm13);
    unpckhpd(xmm0, xmm13);
    movaps(xmm1, xmm14);
    unpckhpd(xmm14, xmm15);
    unpcklpd(xmm1, xmm15);
    movaps(xmm13, xmm12);
    shufps(xmm12, xmm14, 0xcc);
    shufps(xmm13, xmm14, 0x66);
    movaps(xmm14, xmm1);
    movaps(xmm15, xmm1);
    shufps(xmm14, xmm0, 0xcc);
    shufps(xmm15, xmm0, 0x66);
    movups(xmm0, xword[CO1+0x0]);
    addps(xmm8, xmm0);
    movups(xword[CO1+0x0], xmm8);
    movups(xmm1, xword[CO1+0x10]);
    addps(xmm12, xmm1);
    movups(xword[CO1+0x10], xmm12);
    movups(xmm0, xword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    movups(xmm1, xword[CO1+LDC*1+0x10]);
    addps(xmm13, xmm1);
    movups(xword[CO1+LDC*1+0x10], xmm13);
    movups(xmm0, xword[CO2]);
    addps(xmm10, xmm0);
    movups(xword[CO2], xmm10);
    movups(xmm1, xword[CO2+0x10]);
    addps(xmm14, xmm1);
    movups(xword[CO2+0x10], xmm14);
    movups(xmm0, xword[CO2+LDC*1]);
    addps(xmm11, xmm0);
    movups(xword[CO2+LDC*1], xmm11);
    movups(xmm1, xword[CO2+LDC*1+0x10]);
    addps(xmm15, xmm1);
    movups(xword[CO2+LDC*1+0x10], xmm15);
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(l74, T_NEAR);
    align(4);

L(l5c0);
    test(I, 0x2);
    jle(lab4, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    movups(xmm1, xword[A-0x70]);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x60]);
    xorps(xmm10, xmm10);
    movups(xmm3, xword[A-0x50]);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l99c, T_NEAR);
    sub(H, 0x1e);
    jle(l7dc, T_NEAR);
    align(4);

L(l62c);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -32);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l62c, T_NEAR);
    align(4);

L(l7dc);
    prefetcht0(byte[CO1+0x1c]);
    prefetcht0(byte[CO1+LDC*1+0x1c]);
    add(H, 0x1e);
    align(4);

L(l7ec);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -32);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l7ec, T_NEAR);
    align(4);

L(l99c);
    mov(H, K);
    and_(H, 0x3);
    je(la18, T_NEAR);
    align(4);

L(l9a8);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x78]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x50]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    sub(AO, -32);
    sub(BO, -8);
    dec(H);
    jg(l9a8, T_NEAR);
    align(4);

L(la18);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm0, xmm12);
    unpcklpd(xmm12, xmm13);
    unpckhpd(xmm0, xmm13);
    movaps(xmm1, xmm14);
    unpckhpd(xmm14, xmm15);
    unpcklpd(xmm1, xmm15);
    movaps(xmm13, xmm12);
    shufps(xmm12, xmm14, 0xcc);
    shufps(xmm13, xmm14, 0x66);
    movups(xmm0, xword[CO1+0x0]);
    addps(xmm8, xmm0);
    movups(xword[CO1+0x0], xmm8);
    movups(xmm1, xword[CO1+0x10]);
    addps(xmm12, xmm1);
    movups(xword[CO1+0x10], xmm12);
    movups(xmm0, xword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    movups(xmm1, xword[CO1+LDC*1+0x10]);
    addps(xmm13, xmm1);
    movups(xword[CO1+LDC*1+0x10], xmm13);
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

L(lab4);
    test(I, 0x1);
    jle(lf84, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    movups(xmm1, xword[A-0x70]);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x60]);
    xorps(xmm10, xmm10);
    movups(xmm3, xword[A-0x50]);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(le8c, T_NEAR);
    sub(H, 0x1e);
    jle(lcd0, T_NEAR);
    align(4);

L(lb20);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -16);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(lb20, T_NEAR);
    align(4);

L(lcd0);
    prefetcht0(byte[CO1+0x1c]);
    add(H, 0x1e);
    align(4);

L(lcdc);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -16);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(lcdc, T_NEAR);
    align(4);

L(le8c);
    mov(H, K);
    and_(H, 0x3);
    je(lf08, T_NEAR);
    align(4);

L(le98);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x7c]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x50]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    sub(AO, -32);
    sub(BO, -4);
    dec(H);
    jg(le98, T_NEAR);
    align(4);

L(lf08);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm0, xmm12);
    unpcklpd(xmm12, xmm13);
    unpckhpd(xmm0, xmm13);
    movaps(xmm1, xmm14);
    unpckhpd(xmm14, xmm15);
    unpcklpd(xmm1, xmm15);
    movaps(xmm13, xmm12);
    shufps(xmm12, xmm14, 0xcc);
    shufps(xmm13, xmm14, 0x66);
    movups(xmm0, xword[CO1+0x0]);
    addps(xmm8, xmm0);
    movups(xword[CO1+0x0], xmm8);
    movups(xmm1, xword[CO1+0x10]);
    addps(xmm12, xmm1);
    movups(xword[CO1+0x10], xmm12);
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

L(lf84);
    mov(A, AO);
    sub(J, 0x8);
    cmp(J, 0x8);
    jge(l50, T_NEAR);
    align(4);

L(lf98);
    test(J, 0x4);
    jle(l192c, T_NEAR);
    mov(AA, K);
    imul(AA, AA, 0x10);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x10);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l130c, T_NEAR);
    align(4);

L(lfc8);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x70]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l1228, T_NEAR);
    sub(H, 0x1e);
    jle(l1114, T_NEAR);
    align(4);

L(l101c);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l101c, T_NEAR);
    align(4);

L(l1114);
    prefetcht0(byte[CO1+0xc]);
    prefetcht0(byte[CO1+LDC*1+0xc]);
    prefetcht0(byte[CO2+0xc]);
    prefetcht0(byte[CO2+LDC*1+0xc]);
    add(H, 0x1e);
    align(4);

L(l1130);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1130, T_NEAR);
    align(4);

L(l1228);
    mov(H, K);
    and_(H, 0x3);
    je(l1280, T_NEAR);
    align(4);

L(l1234);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x70]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x70]);
    addps(xmm11, xmm6);
    sub(AO, -16);
    sub(BO, -16);
    dec(H);
    jg(l1234, T_NEAR);
    align(4);

L(l1280);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    movups(xmm0, xword[CO1+0x0]);
    addps(xmm8, xmm0);
    movups(xword[CO1+0x0], xmm8);
    movups(xmm0, xword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    movups(xmm0, xword[CO2]);
    addps(xmm10, xmm0);
    movups(xword[CO2], xmm10);
    movups(xmm0, xword[CO2+LDC*1]);
    addps(xmm11, xmm0);
    movups(xword[CO2+LDC*1], xmm11);
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(lfc8, T_NEAR);
    align(4);

L(l130c);
    test(I, 0x2);
    jle(l1624, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x70]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l1574, T_NEAR);
    sub(H, 0x1e);
    jle(l1468, T_NEAR);
    align(4);

L(l136c);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l136c, T_NEAR);
    align(4);

L(l1468);
    prefetcht0(byte[CO1+0xc]);
    prefetcht0(byte[CO1+LDC*1+0xc]);
    add(H, 0x1e);
    align(4);

L(l1478);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1478, T_NEAR);
    align(4);

L(l1574);
    mov(H, K);
    and_(H, 0x3);
    je(l15d0, T_NEAR);
    align(4);

L(l1580);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x78]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x70]);
    addps(xmm11, xmm6);
    sub(AO, -16);
    sub(BO, -8);
    dec(H);
    jg(l1580, T_NEAR);
    align(4);

L(l15d0);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movups(xmm0, xword[CO1+0x0]);
    addps(xmm8, xmm0);
    movups(xword[CO1+0x0], xmm8);
    movups(xmm0, xword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

L(l1624);
    test(I, 0x1);
    jle(l1928, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x70]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l1888, T_NEAR);
    sub(H, 0x1e);
    jle(l1780, T_NEAR);
    align(4);

L(l1684);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1684, T_NEAR);
    align(4);

L(l1780);
    prefetcht0(byte[CO1+0xc]);
    add(H, 0x1e);
    align(4);

L(l178c);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l178c, T_NEAR);
    align(4);

L(l1888);
    mov(H, K);
    and_(H, 0x3);
    je(l18e4, T_NEAR);
    align(4);

L(l1894);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x7c]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x70]);
    addps(xmm11, xmm6);
    sub(AO, -16);
    sub(BO, -4);
    dec(H);
    jg(l1894, T_NEAR);
    align(4);

L(l18e4);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movups(xmm0, xword[CO1+0x0]);
    addps(xmm8, xmm0);
    movups(xword[CO1+0x0], xmm8);
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

L(l1928);
    mov(A, AO);
    align(4);

L(l192c);
    test(J, 0x2);
    jle(l22e8, T_NEAR);
    mov(AA, K);
    imul(AA, AA, 0x8);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x8);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l1cb0, T_NEAR);
    align(4);

L(l195c);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movsd(xmm0, qword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movsd(xmm2, qword[A-0x78]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l1bc4, T_NEAR);
    sub(H, 0x1e);
    jle(l1aac, T_NEAR);
    align(4);

L(l19b0);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l19b0, T_NEAR);
    align(4);

L(l1aac);
    prefetcht0(byte[CO1+0x4]);
    prefetcht0(byte[CO1+LDC*1+0x4]);
    prefetcht0(byte[CO2+0x4]);
    prefetcht0(byte[CO2+LDC*1+0x4]);
    add(H, 0x1e);
    align(4);

L(l1ac8);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1ac8, T_NEAR);
    align(4);

L(l1bc4);
    mov(H, K);
    and_(H, 0x3);
    je(l1c20, T_NEAR);
    align(4);

L(l1bd0);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x70]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x78]);
    addps(xmm11, xmm6);
    sub(AO, -8);
    sub(BO, -16);
    dec(H);
    jg(l1bd0, T_NEAR);
    align(4);

L(l1c20);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    movsd(xmm0, qword[CO1+0x0]);
    addps(xmm8, xmm0);
    movlps(qword[CO1+0x0], xmm8);
    movsd(xmm0, qword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movlps(qword[CO1+LDC*1+0x0], xmm9);
    movsd(xmm0, qword[CO2]);
    addps(xmm10, xmm0);
    movlps(qword[CO2], xmm10);
    movsd(xmm0, qword[CO2+LDC*1]);
    addps(xmm11, xmm0);
    movlps(qword[CO2+LDC*1], xmm11);
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(l195c, T_NEAR);
    align(4);

L(l1cb0);
    test(I, 0x2);
    jle(l1fd4, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movsd(xmm0, qword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movsd(xmm2, qword[A-0x78]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l1f24, T_NEAR);
    sub(H, 0x1e);
    jle(l1e14, T_NEAR);
    align(4);

L(l1d14);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1d14, T_NEAR);
    align(4);

L(l1e14);
    prefetcht0(byte[CO1+0x4]);
    prefetcht0(byte[CO1+LDC*1+0x4]);
    add(H, 0x1e);
    align(4);

L(l1e24);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1e24, T_NEAR);
    align(4);

L(l1f24);
    mov(H, K);
    and_(H, 0x3);
    je(l1f80, T_NEAR);
    align(4);

L(l1f30);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x78]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x78]);
    addps(xmm11, xmm6);
    sub(AO, -8);
    sub(BO, -8);
    dec(H);
    jg(l1f30, T_NEAR);
    align(4);

L(l1f80);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movsd(xmm0, qword[CO1+0x0]);
    addps(xmm8, xmm0);
    movlps(qword[CO1+0x0], xmm8);
    movsd(xmm0, qword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movlps(qword[CO1+LDC*1+0x0], xmm9);
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

L(l1fd4);
    test(I, 0x1);
    jle(l22e4, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movsd(xmm0, qword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movsd(xmm2, qword[A-0x78]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l2244, T_NEAR);
    sub(H, 0x1e);
    jle(l2138, T_NEAR);
    align(4);

L(l2038);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l2038, T_NEAR);
    align(4);

L(l2138);
    prefetcht0(byte[CO1+0x4]);
    add(H, 0x1e);
    align(4);

L(l2144);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l2144, T_NEAR);
    align(4);

L(l2244);
    mov(H, K);
    and_(H, 0x3);
    je(l22a0, T_NEAR);
    align(4);

L(l2250);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x7c]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x78]);
    addps(xmm11, xmm6);
    sub(AO, -8);
    sub(BO, -4);
    dec(H);
    jg(l2250, T_NEAR);
    align(4);

L(l22a0);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movsd(xmm0, qword[CO1+0x0]);
    addps(xmm8, xmm0);
    movlps(qword[CO1+0x0], xmm8);
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

L(l22e4);
    mov(A, AO);
    align(4);

L(l22e8);
    test(J, 0x1);
    jle(l2ca8, T_NEAR);
    mov(AA, K);
    imul(AA, AA, 0x4);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x4);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l266c, T_NEAR);
    align(4);

L(l2318);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movss(xmm0, dword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movss(xmm2, dword[A-0x7c]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l257c, T_NEAR);
    sub(H, 0x1e);
    jle(l2468, T_NEAR);
    align(4);

L(l236c);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l236c, T_NEAR);
    align(4);

L(l2468);
    // Entered by fall-through when the `jg` that closes the loop above is not
    // taken (any other entry points are outside this excerpt).
    // Prefetch the four C addresses that the store sequence at l25d8 later
    // reads and writes: CO1, CO1+LDC, CO2 and CO2+LDC.
    prefetcht0(byte[CO1+0x0]);
    prefetcht0(byte[CO1+LDC*1+0x0]);
    prefetcht0(byte[CO2]);
    prefetcht0(byte[CO2+LDC*1]);
    // Give H another 0x1e iterations for the loop below.
    // NOTE(review): presumably this undoes a `sub(H, 0x1e)` issued before the
    // loop above (the later sections show that exact pairing) - confirm.
    add(H, 0x1e);
    align(4);

L(l2480);
    // K loop, unrolled four steps per iteration; H is the iteration counter.
    // Each step takes one A scalar (reloaded with movss, which zero-fills the
    // upper three lanes) and one 16-byte B vector, and accumulates
    //   xmm8  += B                   * a
    //   xmm9  += B with pairs swapped (pshufd 0xb1) * a
    //   xmm10 += B rotated by two lanes             * a
    //   xmm11 += B fully reversed                   * a
    // so lane 0 of xmm8..xmm11 collects a*B[0], a*B[1], a*B[2], a*B[3].
    // xmm4/xmm0 and xmm5/xmm2 alternate as the (B, A) pair; each step reloads
    // its own pair with the operands needed two steps later.
    // Per iteration: AO += 16 bytes, BO += 64 bytes, AA += 8 bytes.
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // step 0 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    // step 1 (xmm5, xmm2)
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    // step 2 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    // step 3 (xmm5, xmm2), interleaved with the pointer updates
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    // Last B load uses the old BO; BO is bumped right after it.
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    // Last A load still uses the old AO; AO is bumped right after it.
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    // AA is only used as a prefetch address in this loop.
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l2480, T_NEAR);
    align(4);

L(l257c);
    // K remainder: the unrolled loops handle K in groups of four, so K & 3
    // single steps may be left. Skip straight to the reduction when none are.
    mov(H, K);
    and_(H, 0x3);
    je(l25d8, T_NEAR);
    align(4);

L(l2588);
    // One K step per iteration, same permute/multiply/accumulate pattern as
    // the unrolled loop but using only the xmm4 (B) / xmm0 (A) pair.
    // Advances AO by 4 bytes and BO by 16 bytes per step.
    // NOTE(review): the movaps copies into xmm7 are never read anywhere in
    // the visible code; they look like dead moves but are kept as emitted.
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    // Reload B and A for the next step before the pointers are bumped.
    movaps(xmm4, xword[BO-0x70]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x7c]);
    addps(xmm11, xmm6);
    sub(AO, -4);
    sub(BO, -16);
    dec(H);
    jg(l2588, T_NEAR);
    align(4);

L(l25d8);
    // Regroup lanes across the four accumulators. After the unpck/shufps
    // sequence, lane 0 of xmm8, xmm9, xmm10 and xmm11 still holds lane 0 of
    // the corresponding accumulator; that is the only lane stored below.
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    // C update: load one float from C, add it to the accumulator, store lane 0
    // back. Done for four addresses: CO1, CO1+LDC, CO2, CO2+LDC.
    movss(xmm0, dword[CO1+0x0]);
    addps(xmm8, xmm0);
    movss(dword[CO1+0x0], xmm8);
    movss(xmm0, dword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movss(dword[CO1+LDC*1+0x0], xmm9);
    movss(xmm0, dword[CO2]);
    addps(xmm10, xmm0);
    movss(dword[CO2], xmm10);
    movss(xmm0, dword[CO2+LDC*1]);
    addps(xmm11, xmm0);
    movss(dword[CO2+LDC*1], xmm11);
    // Step both C pointers past the four LDC-strided elements just written.
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    // Four units of I consumed; go round again while at least four remain.
    // (l2318 is defined above this excerpt.)
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(l2318, T_NEAR);
    align(4);

L(l266c);
    // Section that produces two outputs (stored at CO1 and CO1+LDC).
    // `test` clears OF, and the result here is 0 or 2 so SF is clear too:
    // the jle is therefore taken exactly when bit 1 of I is zero.
    test(I, 0x2);
    jle(l2994, T_NEAR);
    // CO2 is recomputed here, but this section only dereferences CO1.
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    // Preload the first two A scalars (at A-0x80 and A-0x7c) and the first two
    // B pairs (8 bytes each, duplicated into both halves by movddup), and zero
    // the accumulators. xmm12..xmm15 are cleared as well although nothing in
    // the visible code below reads them.
    movss(xmm0, dword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movss(xmm2, dword[A-0x7c]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    // AO restarts from A; BO is not reloaded and continues from its current
    // position.
    mov(AO, A);
    // H = K / 4 iterations of the 4x-unrolled loop; none -> remainder only.
    mov(H, K);
    sar(H, 0x2);
    jle(l28e0, T_NEAR);
    // Hold back the last 0x1e iterations for the loop at l27e0, which runs
    // after the C prefetch. If there are no more than 0x1e, go there directly.
    sub(H, 0x1e);
    jle(l27d0, T_NEAR);
    align(4);

L(l26d0);
    // Bulk K loop for the two-output section: four K steps per iteration, run
    // for all but the last 0x1e iterations. Each step multiplies one A scalar
    // (movss, upper lanes zero) by a duplicated B pair and accumulates into
    // xmm8..xmm11; lane 0 of xmm8 and xmm9 collect a*B[0] and a*B[1].
    // Per iteration: AO += 16 bytes, BO += 32 bytes, AA += 8 bytes.
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // step 0 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    // step 1 (xmm5, xmm2)
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    // step 2 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    // step 3 (xmm5, xmm2), interleaved with the pointer updates
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    // Still addressed through the old AO; AO is bumped on the next line.
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l26d0, T_NEAR);
    align(4);

L(l27d0);
    // Prefetch the two C addresses written at l293c, then restore the 0x1e
    // iterations that were held back by the `sub(H, 0x1e)` in the setup.
    prefetcht0(byte[CO1+0x0]);
    prefetcht0(byte[CO1+LDC*1+0x0]);
    add(H, 0x1e);
    align(4);

L(l27e0);
    // Final K loop for the two-output section: same body as the bulk loop at
    // l26d0 (four K steps per iteration, AO += 16, BO += 32, AA += 8), run for
    // the iterations remaining in H after the C prefetch above.
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // step 0 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    // step 1 (xmm5, xmm2)
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    // step 2 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    // step 3 (xmm5, xmm2), interleaved with the pointer updates
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l27e0, T_NEAR);
    align(4);

L(l28e0);
    // K remainder for the two-output section: K & 3 single steps, or none.
    mov(H, K);
    and_(H, 0x3);
    je(l293c, T_NEAR);
    align(4);

L(l28ec);
    // One K step per iteration using the xmm4 (B pair) / xmm0 (A scalar)
    // registers; advances AO by 4 bytes and BO by 8 bytes.
    // NOTE(review): xmm7 is written four times per step but never read in the
    // visible code; the copies look dead and are kept as emitted.
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    // Reload B and A for the next step before the pointers are bumped.
    movddup(xmm4, qword[BO-0x78]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x7c]);
    addps(xmm11, xmm6);
    sub(AO, -4);
    sub(BO, -8);
    dec(H);
    jg(l28ec, T_NEAR);
    align(4);

L(l293c);
    // Regroup lanes; afterwards lane 0 of xmm8 and xmm9 is still lane 0 of the
    // respective accumulator, which is what gets stored. xmm10/xmm1 are
    // shuffled too, but only xmm8 and xmm9 are written out in this section.
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    // C update for two addresses: load one float, add, store lane 0 back.
    movss(xmm0, dword[CO1+0x0]);
    addps(xmm8, xmm0);
    movss(dword[CO1+0x0], xmm8);
    movss(xmm0, dword[CO1+LDC*1+0x0]);
    addps(xmm9, xmm0);
    movss(dword[CO1+LDC*1+0x0], xmm9);
    // Step both C pointers past the two LDC-strided elements just written.
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

L(l2994);
    // Section that produces a single output (stored at CO1).
    // `test` clears OF and the result is 0 or 1, so SF is clear as well: the
    // jle is taken exactly when bit 0 of I is zero.
    test(I, 0x1);
    jle(l2ca4, T_NEAR);
    // CO2 is recomputed here, but this section only dereferences CO1.
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    // Preload the first two A scalars and the first two B scalars (all via
    // movss, so the upper three lanes are zero) and clear the accumulators.
    // xmm12..xmm15 are cleared too although nothing visible below reads them.
    movss(xmm0, dword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movss(xmm2, dword[A-0x7c]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    // AO restarts from A; BO continues from its current position.
    mov(AO, A);
    // H = K / 4 iterations of the 4x-unrolled loop; none -> remainder only.
    mov(H, K);
    sar(H, 0x2);
    jle(l2c04, T_NEAR);
    // Hold back the last 0x1e iterations for the loop at l2b04, which runs
    // after the C prefetch. If there are no more than 0x1e, go there directly.
    sub(H, 0x1e);
    jle(l2af8, T_NEAR);
    align(4);

L(l29f8);
    // Bulk K loop for the single-output section: four K steps per iteration,
    // run for all but the last 0x1e iterations. Both operands are scalars
    // loaded with movss, so only lane 0 of xmm8 accumulates a non-trivial
    // product (a*b); the other accumulators are updated but not stored later.
    // Per iteration: AO += 16 bytes, BO += 16 bytes, AA += 8 bytes.
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // step 0 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    // step 1 (xmm5, xmm2)
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    // step 2 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    // step 3 (xmm5, xmm2), interleaved with the pointer updates
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    // Still addressed through the old AO; AO is bumped on the next line.
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l29f8, T_NEAR);
    align(4);

L(l2af8);
    // Prefetch the single C address written at l2c60, then restore the 0x1e
    // iterations that were held back by the `sub(H, 0x1e)` in the setup.
    prefetcht0(byte[CO1+0x0]);
    add(H, 0x1e);
    align(4);

L(l2b04);
    // Final K loop for the single-output section: same body as the bulk loop
    // at l29f8 (four K steps per iteration, AO += 16, BO += 16, AA += 8), run
    // for the iterations remaining in H after the C prefetch above.
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // step 0 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    // step 1 (xmm5, xmm2)
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    // step 2 (xmm4, xmm0)
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    // step 3 (xmm5, xmm2), interleaved with the pointer updates
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l2b04, T_NEAR);
    align(4);

L(l2c04);
    // K remainder for the single-output section: K & 3 single steps, or none.
    mov(H, K);
    and_(H, 0x3);
    je(l2c60, T_NEAR);
    align(4);

L(l2c10);
    // One K step per iteration using the xmm4 (B scalar) / xmm0 (A scalar)
    // registers; advances both AO and BO by 4 bytes.
    // NOTE(review): xmm7 is written four times per step but never read in the
    // visible code; the copies look dead and are kept as emitted.
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    // Reload B and A for the next step before the pointers are bumped.
    movss(xmm4, dword[BO-0x7c]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x7c]);
    addps(xmm11, xmm6);
    sub(AO, -4);
    sub(BO, -4);
    dec(H);
    jg(l2c10, T_NEAR);
    align(4);

L(l2c60);
    // Same lane regrouping as the wider sections; lane 0 of xmm8 is unchanged
    // by it and is the only value written out here.
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    // C update for one address: load one float, add, store lane 0 back.
    movss(xmm0, dword[CO1+0x0]);
    addps(xmm8, xmm0);
    movss(dword[CO1+0x0], xmm8);
    // Step both C pointers by one LDC stride.
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

L(l2ca4);
    // Copy the current AO back into A. This is also the target of the
    // `jle(l2ca4)` that skips the single-output section.
    mov(A, AO);
    align(4);

L(l2ca8);
    // Nothing in the visible code branches to l2ca8; any jumps to it come from
    // earlier in the generator.

    // jit_generator epilogue.
    // NOTE(review): presumably restores the saved registers and emits the
    // return - confirm against jit_generator.hpp.
    postamble();
}
outLocalLabel();

// Drop the register and stack-slot aliases #defined at the top of this
// constructor so the short names (M, N, K, A, B, C, ...) do not leak into
// anything that follows in the translation unit.
#undef M
#undef N
#undef K
#undef A
#undef B
#undef C
#undef LDC
#undef AA
#undef I
#undef J
#undef H
#undef AO
#undef BO
#undef CO1
#undef CO2
// OLD_A and OLD_B are only defined in the _WIN32 branch of the alias block.
#ifdef _WIN32
#undef OLD_A
#undef OLD_B
#endif
#undef OLD_C
#undef OLD_LDC
}

}
}
}
