[BACK]Return to pre_mod_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/pre_mod_1.asm, Revision 1.1

1.1     ! ohara       1: dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
        !             2:
        !             3: dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C K6: 18.0 cycles/limb
        !            26:
        !            27:
        !            28: C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
        !            29: C                             mp_limb_t inverse);
        !            30: C
        !            31: C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
        !            32: C considered worthwhile (just).
        !            33: C
        !            34: C Future:
        !            35: C
        !            36: C In theory this code should be made available in mod_1 and mod_1c, but it
        !            37: C would take quite a while to overcome the time to calculate an inverse.
        !            38: C The threshold would probably be around 20 limbs, or around 30 for an
        !            39: C unnormalized divisor.
        !            40:
        !            41: defframe(PARAM_INVERSE,16)     C 4th prototype argument, inverse (offsets presumably from esp at entry - TODO confirm in x86-defs.m4)
        !            42: defframe(PARAM_DIVISOR,12)     C 3rd prototype argument, divisor
        !            43: defframe(PARAM_SIZE,    8)     C 2nd prototype argument, size
        !            44: defframe(PARAM_SRC,     4)     C 1st prototype argument, src
        !            45:
        !            46:        TEXT
        !            47:        ALIGN(32)
        !            48: PROLOGUE(mpn_preinv_mod_1)
        !            49: deflit(`FRAME',0)      C presumably: nothing pushed yet, each FRAME_pushl() below updates it - confirm in x86-defs.m4
        !            50:        C Returns in eax the remainder of src[0..size-1] by divisor, working from the high limb down.
        !            51:        ASSERT(ae,`cmpl $1, PARAM_SIZE')        C size >= 1 is required
        !            52:        ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')   C divisor must have its top bit set
        !            53:        C ebp, edi, esi, ebx are saved here (interleaved with argument loads) and popped at done_esi_edi.
        !            54:        movl    PARAM_SIZE, %ecx        C ecx = size
        !            55:        pushl   %ebp    FRAME_pushl()
        !            56:
        !            57:        movl    PARAM_SRC, %ebp         C ebp = src
        !            58:        pushl   %edi    FRAME_pushl()
        !            59:
        !            60:        movl    PARAM_DIVISOR, %eax     C eax = d
        !            61:        pushl   %esi    FRAME_pushl()
        !            62:
        !            63:        movl    -4(%ebp,%ecx,4), %esi   C src high limb
        !            64:        pushl   %ebx    FRAME_pushl()
        !            65:
        !            66:        movl    %edx, %edi              C first n2 to cancel: edi = incoming edx, so the first sbbl %edx,%edi leaves just 0 or -1
        !            67:        subl    %eax, %esi              C first n1 = high-divisor
        !            68:        C CF is now the borrow of high-divisor; the decl and jz below do not change CF.
        !            69:        decl    %ecx                    C ecx = size-1 limbs still to process
        !            70:        jz      L(done_sbbl)            C size was 1: only the conditional addback remains
        !            71:
        !            72: L(top):
        !            73:        C eax   scratch (n2+n1, product low limb, d)
        !            74:        C ebx   next limb, -n1 mask, nadj, n2+1
        !            75:        C ecx   counter, size-1 down to 1
        !            76:        C edx   scratch (inverse, product high limb, q1+1)
        !            77:        C esi   n2 candidate: low limb of n-(q1+1)*d, addback still pending
        !            78:        C edi   old high (previous n2), for underflow test
        !            79:        C ebp   src
        !            80:
        !            81:        sbbl    %edx, %edi          C high n-(q1+1)*d, 0 or -1
        !            82:
        !            83: L(entry):                      C no jump to this label is visible in this file
        !            84:        andl    PARAM_DIVISOR, %edi     C edi = d if the subtraction underflowed, else 0
        !            85: L(q1_ff_top):                  C the q1_ff path re-enters here with edi = d already loaded
        !            86:        movl    -4(%ebp,%ecx,4), %ebx   C next lower limb, src[ecx-1]
        !            87:
        !            88:        addl    %esi, %edi          C possible addback
        !            89:        movl    %ebx, %esi          C n10
        !            90:
        !            91:        sarl    $31, %ebx           C -n1 = 0 or -1
        !            92:        movl    %edi, %eax          C n2
        !            93:
        !            94:        movl    PARAM_INVERSE, %edx     C m, the pre-computed inverse
        !            95:        subl    %ebx, %eax          C n2+n1
        !            96:
        !            97:        mull    %edx                C m*(n2+n1)
        !            98:
        !            99:        andl    PARAM_DIVISOR, %ebx C -n1 & d
        !           100:        addl    %esi, %ebx          C nadj = n10 + (-n1&d), ignoring overflow
        !           101:
        !           102:        addl    %ebx, %eax          C low m*(n2+n1) + nadj, giving carry flag
        !           103:        leal    1(%edi), %ebx       C n2+1
        !           104:
        !           105:        adcl    %ebx, %edx          C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
        !           106:        C ZF tested by the jz below comes from this adcl (q1+1 wrapped to 0); the movl in between keeps flags.
        !           107:        movl    PARAM_DIVISOR, %eax C d
        !           108:        jz      L(q1_ff)            C q1 = 0xFFFFFFFF special case
        !           109:
        !           110:        mull    %edx                C (q1+1)*d
        !           111:
        !           112:        subl    %eax, %esi          C low  n-(q1+1)*d
        !           113:        loop    L(top)              C decrement ecx, repeat while nonzero; flags from the subl are kept
        !           114:
        !           115:        C Falls through once all limbs are consumed, CF still holding the borrow from the last subl.
        !           116:
        !           117: L(done_sbbl):
        !           118:        sbbl    %edx, %edi          C high n-(q1+1)*d, 0 or -1
        !           119:
        !           120:        andl    PARAM_DIVISOR, %edi     C final addback amount, d or 0
        !           121: L(done_esi_edi):
        !           122:        popl    %ebx
        !           123:
        !           124:        leal    (%esi,%edi), %eax       C return value = esi + addback
        !           125:        popl    %esi
        !           126:
        !           127:        popl    %edi
        !           128:        popl    %ebp
        !           129:
        !           130:        ret
        !           131:
        !           132:
        !           133: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
        !           134: C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
        !           135: C reached.
        !           136:
        !           137: L(q1_ff):
        !           138:        movl    PARAM_DIVISOR, %edi     C edi = d, so the next add of esi (n10) yields n10+d
        !           139:        loop    L(q1_ff_top)            C more limbs left: rejoin the main loop after its andl
        !           140:
        !           141:        jmp     L(done_esi_edi)         C no limbs left: return esi+edi = n10+d
        !           142:
        !           143:
        !           144: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>