// file kernel/n/x86/sqrt_n2.S: O(n^2) square root of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005, Michel Quercia (michel.quercia@prepas.org)           |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                         Quadratic square root                         |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                             # +-----------------+
                             # |   Square root   |
                             # +-----------------+

# Internal quadratic square root, register calling convention.
#
# input:
# a = natural number of length la     esi = &a, edx = la
# b = natural number of length la/2   edi = &b
#
# constraints:
# la > 0, la even, BASE/16 <= a[la-1] < BASE/4
# a and b are distinct
#
# output:
# b <- 2*floor(sqrt(a))
# a <- a - b^2/4
#
# This first section writes eax, ebx, edx, ebp, esi and edi and saves
# none of them.

#ifdef assembly_sn_sqrt_n2
/* L(x) builds label names that are private to this routine */
#undef L
#define L(x) .Lsn_fsqrt_n2_##x

        ALIGN(32)
#ifdef debug_sqrt_n2
.Lsn_fsqrt_n2_buggy:
#else
.Lsn_fsqrt_n2:
#endif

        leal   -8(%esi,%edx,4), %esi    # esi <- &a[la-2]
        leal   -4(%edi,%edx,2),  %edi   # edi <- &b[la/2-1]
        movl   %edx,    %ebp            # ebp <- la

        # First step, on the two top digits x = a[la-2] + BASE*a[la-1]:
        # u <- floor(sqrt(x)) by Newton iteration, then
        # b[la/2-1] <- 2u and a[la-1]:a[la-2] <- x - u^2
        movl   $0x7fffffff, %ebx        # ebx <- u = BASE/2 - 1 (starting value)
        xorl   %eax,    %eax            # eax <- 0, so the first pass leaves u unchanged
        ALIGN(4)
1:
        sarl   $1,      %eax            # eax <- (q - u)/2, arithmetic shift of a value <= 0
        addl   %eax,    %ebx            # u <- u + (q - u)/2 = (u + x/u)/2
        movl   (%esi),  %eax            # edx:eax <- x
        movl   4(%esi), %edx
        divl   %ebx                     # eax <- q = floor(x/u), edx <- r = x - q*u
        subl   %ebx,    %eax            # eax <- q - u, CF set when q < u
        jb     1b                       # iterate as long as q < u
        # end of the Newton loop: ebx = u = floor(sqrt(x)),
        # eax = q - u with q = floor(x/u), edx = r = x - q*u
        # q is u, u+1 or u+2 and the wanted x - u^2 is r, r+u or r+2u
        je     2f                       # q = u (ZF still comes from the subl): keep r
        addl   %ebx,    %edx            # r += u
        decl   %eax
        je     2f                       # q = u+1: remainder is r+u
        addl   %ebx,    %edx            # r += u (case q = u+2)
2:
        movl   %edx,    (%esi)          # a[la-2] <- x - u^2
        movl   $0,     4(%esi)          # a[la-1] <- 0
        shll   $1,      %ebx
        movl   %ebx,    (%edi)          # b[la/2-1] <- 2u

        # the following digits are computed by divisions
        subl   $2,      %ebp            # la -= 2
        jne    1f
        ret                             # la was 2: nothing left to do
1:

#ifdef use_sse2

        /* SSE2/MMX variant of the digit loop.
           Register roles, as set up and used in this section:
             esi = address of the top digit of the current remainder
             edi = address of the top digit of b (never modified here)
             ebx = -lb, where lb is the length of b after the current step
             ebp = number of digits of b still to compute
             mm7 = BASE-1, mm6 = (BASE-1)^2
           a[i] and b[i] below denote the digits at (%esi,%ecx,4) and
           4(%edi,%ecx,4) for ecx running from -lb to -1. */

        shrl   $1,      %ebp            # ebp <- ebp/2 (ebp holds la-2 here): digits left to compute
        movl   $-1,     %ebx            # ebx <- -lb with lb = 1
        movd   %ebx,    %mm7            # mm7 <- BASE-1
        movq   %mm7,    %mm6
        pmuludq %mm6,   %mm6            # mm6 <- (BASE-1)^2

        # main loop: one more digit of the root per pass
        ALIGN(4)
L(div):
        decl   %ebx                     # lb++ (ebx holds -lb)

        # trial quotient, may be too large by one or two units
        movl -4(%esi),  %eax            # edx:eax <- two top digits of the remainder
        movl  (%esi),   %edx
        movd   %edx,    %mm4            # mm4 <- top digit of the remainder
        movl  (%edi),   %ecx            # ecx <- top digit of b
        cmpl   %edx,    %ecx
        jne    1f
        movl   $-1,     %eax            # top digits equal: take v = BASE-1 instead of dividing
        jmp    2f
        ALIGN(4)
1:
        divl   %ecx                     # eax <- floor(edx:eax / ecx)
2:
        movd   %eax,    %mm0            # mm0 <- v = quotient
        movl   %eax,  4(%edi,%ebx,4)    # new low digit of b <- v

        # a <- a - v*b - v^2
        # b now has v as its low digit, so the single pass a -= v*b below
        # covers both terms.
        # trick taken from GMP (pentium4/sse2/submul_1.asm):
        # BASE^2-BASE is added before the shift and BASE-1 is subtracted
        # afterwards, so that psrlq shifts a number between 0 and
        # BASE^2-1. In total: start from a carry equal to BASE-1, add
        # (BASE-1)^2 at each iteration and subtract BASE-1 at the end.

        movq   %mm7,    %mm3            # carry <- BASE-1
        movl   %ebx,    %ecx            # ecx <- -lb

        ALIGN(4)
1:
        movd   (%esi,%ecx,4), %mm1      # mm1 <- a[i]
        movd  4(%edi,%ecx,4), %mm2      # mm2 <- b[i]
        paddq   %mm1,   %mm3            # mm3 <- carry + a[i]
        pmuludq %mm0,   %mm2            # mm2 <- v*b[i]
        paddq   %mm6,   %mm3            # mm3 <- carry + a[i] + (BASE-1)^2
        psubq   %mm2,   %mm3            # mm3 <- ... - v*b[i]
        movd    %mm3,  (%esi,%ecx,4)    # store a[i]
        incl    %ecx                    # sets ZF for the jne below
        psrlq   $32,    %mm3            # mm3 <- new carry + (BASE-1)
        jne     1b
        psubq   %mm7,   %mm4            # mm4 <- top digit - (BASE-1): cancels the bias still held in mm3

2:
        shll   $1,    4(%edi,%ebx,4)    # new low digit of b <- 2v
        adcl   $0,    8(%edi,%ebx,4)    # bit shifted out goes into the next digit of b
L(corr):
        paddq  %mm3,    %mm4            # last digit: mm4 <- mm4 + carry
        pextrw $3,  %mm4, %eax          # eax <- bits 48..63 of mm4
        testl  %eax,    %eax
        jz     L(next)                  # zero: the remainder is not negative

        # remainder < 0: add b back
        subl   $1,    4(%edi,%ebx,4)    # b <- b-1
        sbbl   $0,    8(%edi,%ebx,4)
        movl   %ebx,    %ecx            # ecx <- -lb
        pxor   %mm3,    %mm3            # carry <- 0
        ALIGN(4)
1:
        movd   (%esi,%ecx,4), %mm0      # mm0 <- a[i]
        movd  4(%edi,%ecx,4), %mm1      # mm1 <- b[i]
        incl   %ecx                     # sets ZF for the jne below
        paddq  %mm0,    %mm3            # mm3 <- carry + a[i] + b[i]
        paddq  %mm1,    %mm3
        movd   %mm3, -4(%esi,%ecx,4)    # store a[i] (ecx is already incremented)
        pshufw $0xfe,   %mm3, %mm3      # low dword of mm3 <- its high dword (the carry)
        jne    1b
        decl 4(%edi,%ebx,4)             # b <- b-1
        jmp    L(corr)                  # add the carry to mm4 and test the sign again

        # next digit
        # NOTE(review): the updated top digit exists only in mm4; nothing in
        # this variant stores it back to (%esi), whereas the generic variant
        # updates the top digit in memory. Confirm that callers only read
        # the low half of a.
L(next):
        leal -4(%esi),  %esi            # a--
        decl   %ebp                     # la -= 2 (ebp counts digits of b left)
        jne    L(div)

        # done
        emms
        ret

#else /* use_sse2 */

        /* Generic (integer only) variant of the digit loop.
           Stack frame: 20 bytes reserved by the leal plus the 4 bytes
           pushed by the call that follows it, i.e. six 32-bit slots:
             _a_   address of the low digit of the current window of a
             _b_   address of the digit of b being computed
             _la_  number of digits of b still to compute
             _lb_  length of b after the current step
             _ctr_ pass counter of the unrolled loop
             _br_  entry address into the unrolled loop
           In the comments of this section a[i] and b[i] are indexed from
           _a_ and _b_. */

        # local variables
        #undef _a_
        #undef _b_
        #undef _la_
        #undef _lb_
        #undef _ctr_
        #undef _br_
        #define _a_  20(%esp)
        #define _b_  16(%esp)
        #define _la_ 12(%esp)
        #define _lb_  8(%esp)
        #define _ctr_ 4(%esp)
        #define _br_   (%esp)

        leal   -20(%esp), %esp          # reserve room for the local variables
        call   L(here)                  # pushes the address of the next label into the branch slot
L(here):
        movl   $1,      %ecx            # lb <- 1
        shrl   $1,      %ebp            # la <- la/2
        movl   %ebp,    _la_            # save la/2
        addl   $L(loop_end)-L(here)-17, _br_ # branch slot <- 17 bytes before the end of the inner loop

        # on entry here: esi = &a, edi = &b, ecx = lb
        ALIGN(4)
L(div):
        incl   %ecx                     # lb++
        leal   -8(%esi), %esi           # a -= 2
        leal   -4(%edi), %edi           # b -= 1
        movl   %esi,    _a_
        movl   %edi,    _b_
        movl   %ecx,    _lb_

        # trial quotient, may be too large by one or two units
        movl   -4(%esi,%ecx,4), %eax    # edx:eax <- a[lb]:a[lb-1]
        movl   (%esi,%ecx,4),   %edx
        movl   -4(%edi,%ecx,4), %ebx    # ebx <- b[lb-1]
        cmpl   %edx,    %ebx
        jne    1f
        movl   $-1,     %eax           # eax <- min(BASE-1,a[lb]:a[lb-1]/b[lb-1])
        jmp    2f
        ALIGN(4)
1:
        divl   %ebx
2:

        # a <- a - v*b - v^2
        movl   %eax,    %ebp            # ebp <- v
        movl   %eax,    (%edi)          # b[0] <- v

        negl   %ecx
        movl   %ecx,    %ebx
        sarl   $5,      %ebx
        movl   %ebx,    _ctr_           # counter <- floor(-lb/32)

        andl   $31,      %ecx           # ecx <- (-lb) mod 32, used to realign the pointers
        negl   %ecx
        leal -4(%esi,%ecx,4), %esi      # esi <- esi - 4 - 4*((-lb) mod 32)
        leal   (%edi,%ecx,4), %edi      # edi <- edi - 4*((-lb) mod 32)

        mull   %ebp                     # edx:eax <- v^2
        xorl   %ebx,    %ebx            # clear the carries (xorl also clears CF)
        xorl   %ecx,    %ecx
        jmp   *_br_                     # enter the unrolled loop at the stored address

        # loop body to be unrolled. code size = 17 bytes for each half of
        # the macro (one digit) with one-byte displacements.
        # The branch slot is always moved in steps of 17 bytes counted back
        # from the end of the loop, so the size of a unit must not change.
        # enter the loop with edx:eax = carry, ebx = ecx = 0, CF = 0
        # code inspired by GMP (k7/mul_basecase.asm)
#undef BODY
#define BODY(x,y) \
          adcl   %eax,    %ebx           /* ebx += low word of product     */;\
          movl   x(%edi), %eax           /* eax <- digit of b at offset x  */;\
          adcl   %edx,    %ecx           /* ecx <- high word of product    */;\
          mull   %ebp                    /* multiply by v                  */;\
          subl   %ebx,    x(%esi)        /* subtract finished word from a  */;\
          movl   $0,      %ebx           /* movl, not xorl: CF must survive */;\
          adcl   %eax,    %ecx           /* ecx += low word of product     */;\
          movl   y(%edi), %eax           /* eax <- digit of b at offset y  */;\
          adcl   %edx,    %ebx           /* ebx <- high word of product    */;\
          mull   %ebp                    /* multiply by v                  */;\
          subl   %ecx,    y(%esi)        /* subtract finished word from a  */;\
          movl   $0,      %ecx

        # inner loop, unrolled 32 times
        ALIGN(4)
L(loop):
        BODY(0,4);    BODY(8,12);    BODY(16,20);   BODY(24,28)
        BODY(32,36);  BODY(40,44);   BODY(48,52);   BODY(56,60)
        BODY(64,68);  BODY(72,76);   BODY(80,84);   BODY(88,92)
        BODY(96,100); BODY(104,108); BODY(112,116); BODY(120,124)
L(loop_end):

        leal 128(%esi), %esi            # a += 32
        leal 128(%edi), %edi            # b += 32
        incl   _ctr_                    # leal and incl leave CF as the last subl set it
        jne    L(loop)

        adcl   %eax,    %ebx            # add up the last carries
        adcl   %edx,    %ecx
        subl   %ebx,   (%esi)           # and subtract them from a
        adcl   $0,      %ecx
        movl   _b_,     %edi            # double the new digit
        shll   $1,      %ebp
        movl   %ebp,   (%edi)
        adcl   $0,    4(%edi)           # bit shifted out goes into the next digit of b
        subl   %ecx,  4(%esi)           # subtract the last carry
        jnb    L(next)                  # no borrow: the remainder is not negative

        # fix the quotient and the remainder when the remainder is < 0
        # NOTE(review): .Lsn_finc_1 is not visible in this section. The code
        # passes esi = a, ebx = b, ecx = lb and afterwards uses esi, ecx and
        # CF, so presumably it returns esi = &a[lb], ecx = 0 and CF = carry.
        # To be confirmed against its definition.
L(corr):
        movl   _a_,     %esi
        movl   %edi,    %ebx
        movl   _lb_,    %ecx
        subl   $1,     (%ebx)           # b <- b-1
        sbbl   $0,    4(%ebx)
        call   .Lsn_finc_1              # a <- a + b
        adcl   %ecx,   (%esi)           # last carry
        movl   _b_,     %edi
        decl   (%edi)                   # b <- b-1 (movl and decl keep CF)
        jnc    L(corr)                  # no carry out of the top digit: correct once more

        # next digit
L(next):
        movl   _a_,     %esi
        movl   _lb_,    %ecx
        testl  $31,     %ecx            # update the jump address
        jne    1f
        addl   $17*31,  _br_            # lb multiple of 32: move the entry 31 units forward
        jmp    2f
1:
        subl   $17,     _br_            # otherwise enter one unit earlier next time
2:
        decl   _la_
        jne    L(div)

        # done
        leal   24(%esp), %esp           # release the six local slots
        ret

#endif /* use_sse2 */
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
        

# void xn(sqrt_n2)(chiffre *a, long la, chiffre *b)
#
# input:
# a = natural number of length la
# b = natural number of length la/2
#
# constraints:
# la > 0, la even, BASE/16 <= a[la-1] < BASE/4
# a and b are distinct
#
# output:
# b <- 2*floor(sqrt(a))
# a <- a - b^2/4

/* C-callable entry point: loads the three arguments into the registers
   expected by the internal routine (esi = a, edx = la, edi = b) and calls
   it. ENTER, arg1..arg3 and RETURN_WITH_SP are macros defined outside this
   section; presumably they set up the frame, give access to the C
   arguments and restore the saved state before returning - to be
   confirmed against their definitions. When debug_sqrt_n2 is defined, the
   entry point and the routine it calls both carry the _buggy suffix. */
#ifdef debug_sqrt_n2
ENTER(sn_sqrt_n2_buggy)
#else
ENTER(sn_sqrt_n2)
#endif

        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- la
        movl   arg3,    %edi            # edi <- &b
#ifdef debug_sqrt_n2
        call   .Lsn_fsqrt_n2_buggy      # compute the square root
#else
        call   .Lsn_fsqrt_n2
#endif
        RETURN_WITH_SP
        
#endif /* assembly_sn_sqrt_n2 */

        # case where the assembly version is disabled or being debugged:
        # sn_fsqrt_n2 forwards to the C version.
        # Takes esi = &a, edx = la, edi = &b like the assembly routine and
        # passes them on the stack in the order a, la, b.

#if !defined(assembly_sn_sqrt_n2) || defined(debug_sqrt_n2)
        ALIGN(32)
.Lsn_fsqrt_n2:
        leal   -12(%esp), %esp          # room for the three stack arguments
        movl   %esi,    (%esp)          # 1st argument <- &a
        movl   %edx,   4(%esp)          # 2nd argument <- la
        movl   %edi,   8(%esp)          # 3rd argument <- &b
        call   SUBR(sn_sqrt_n2)
        leal   12(%esp), %esp           # drop the arguments
        ret

#endif /* !defined(assembly_sn_sqrt_n2) || defined(debug_sqrt_n2) */

