[BACK]Return to gcd_finda.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/gcd_finda.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  AMD K6 mpn_gcd_finda.
                      2:
                      3: dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
                      4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
                     25: C K6: 680 cycles (approx) on average
                     26:
                     27:
                     28: dnl  Number of n1 -= n2 trial subtractions unrolled after L(entry) before the
                     29: dnl  code falls back to a full divl based division.  One unconditional
                     30: dnl  subtraction always runs first, so up to TRIAL_SUBS+1 happen per pass.
                     31: deflit(TRIAL_SUBS, 8)
                     32:
                     33:
                     34: C mp_limb_t mpn_gcd_finda (const mp_limb_t cp[2]);
                     35: C
                     36: C This code is probably not optimal, but it's already a good improvement
                     37: C over the generic C.
                     38: C
                     39:
                     40: defframe(PARAM_CP, 4)           C argument: pointer to the two-limb input cp
                     41: dnl  Slots for the incoming ebx, esi, edi and ebp values; reloaded at L(done).
                     42: defframe(SAVE_EBX,      -4)
                     43: defframe(SAVE_ESI,      -8)
                     44: defframe(SAVE_EDI,     -12)
                     45: defframe(SAVE_EBP,     -16)
                     46: dnl  Scratch slots used by the division step that follows the trial subtractions.
                     47: defframe(VAR_N2H,      -20)     C n2 high limb before normalization
                     48: defframe(VAR_N2L,      -24)     C n2 low limb before normalization
                     49: defframe(VAR_Q,        -28)     C not referenced by the code visible here
                     50: defframe(VAR_N2L_NORM, -32)     C n2 low limb after the normalizing shift
                     51: dnl  Bytes subtracted from esp in the prologue; reaches the lowest slot at -32.
                     52: deflit(STACK_SPACE, 32)
                     53:
                     54:        TEXT
                     55:        ALIGN(32)
                     56: C Runs n1 = n1 mod n2 then swap on the limb pairs cp and -cp until n2 fits one limb.
                     57: PROLOGUE(mpn_gcd_finda)
                     58: deflit(`FRAME',0)
                     59: C In: PARAM_CP points at cp[0] low and cp[1] high.  Out: eax = final n2 low limb.
                     60:        movl    PARAM_CP, %eax
                     61:        subl    $STACK_SPACE, %esp
                     62: deflit(`FRAME',STACK_SPACE)
                     63:
                     64:        movl    %ebx, SAVE_EBX
                     65:
                     66:        movl    %esi, SAVE_ESI
                     67:        movl    (%eax), %ecx            C ecx = cp[0] low limb
                     68:
                     69:        movl    %edi, SAVE_EDI
                     70:        movl    4(%eax), %edx           C edx = cp[1] high limb
                     71:
                     72:        movl    %ebp, SAVE_EBP
                     73:
                     74:        ASSERT(nz,`orl %ecx, %ecx')
                     75:        ASSERT(nz,`orl %edx, %edx')
                     76:
                     77:        movl    %ecx, %eax
                     78:        movl    %edx, %ebx
                     79:
                     80:        negl    %eax                    C eax = -cp[0]
                     81:        notl    %ebx                    C ebx:eax = -cp mod 2^64 since cp[0] != 0
                     82:
                     83:        cmpl    %ecx, %eax              C low limb of (-cp) - cp, borrow in CF
                     84:        movl    %ebx, %esi
                     85:
                     86:        sbbl    %edx, %esi              C CF set iff -cp < cp as 64-bit values
                     87: C Test CF only.  ZF after sbbl covers just the high limb and is not a 64-bit equality.
                     88:        jb      L(top)                  C cp > -cp: already n1 >= n2
                     89: C Fall through when -cp >= cp: swap roles so that n1 = -cp and n2 = cp.
                     90:        movl    %ecx, %eax
                     91:        movl    %edx, %ebx
                     92:
                     93:        negl    %ecx
                     94:        notl    %edx
                     95:
                     96:        jmp     L(top)
                     97:
                     98:
                     99:        ALIGN(8)
                    100: L(restore):
                    101:        C eax   n2 l
                    102:        C ebx   n2 h
                    103:        C ecx   n1-n2 l
                    104:        C edx   n1-n2 h
                    105:        C esi   old n1 h
                    106:        C edi
                    107:        C ebp
                    108:
                    109:        movl    %ebx, %edx              C new n1 h = n2 h
                    110:        movl    %esi, %ebx              C new n2 h = n1 h before the failed subtract
                    111:
                    112:        movl    %eax, %esi
                    113:        addl    %ecx, %eax              C new n2 l = n2 l + (n1-n2) l = old n1 l
                    114:
                    115:        movl    %esi, %ecx              C new n1 l = n2 l
                    116:
                    117:
                    118: L(top):
                    119:        C n1 >= n2
                    120:        C
                    121:        C eax   n2 l
                    122:        C ebx   n2 h
                    123:        C ecx   n1 l
                    124:        C edx   n1 h
                    125:        C esi
                    126:        C edi
                    127:        C ebp
                    128:
                    129:        orl     %ebx, %ebx              C n2 h == 0 means n2 fits in one limb
                    130:        jz      L(done)
                    131:
                    132: L(entry):
                    133:        subl    %eax, %ecx              C n1 -= n2, no borrow expected as n1 >= n2
                    134:        sbbl    %ebx, %edx
                    135:        ASSERT(nc)
                    136: C Up to TRIAL_SUBS further subtractions.  A borrow means n1 dropped below n2.
                    137: forloop(i,1,TRIAL_SUBS,`
                    138:        movl    %edx, %esi
                    139:        subl    %eax, %ecx
                    140:
                    141:        sbbl    %ebx, %edx
                    142:        jc      L(restore)
                    143: ')
                    144:
                    145: C Every trial subtraction succeeded so n1 is still >= n2: use a real division.
                    146:        C n1 >= n2
                    147:        C
                    148:        C eax   n2 l
                    149:        C ebx   n2 h
                    150:        C ecx   n1 l
                    151:        C edx   n1 h
                    152:        C esi
                    153:        C edi
                    154:        C ebp
                    155:
                    156:        movl    %eax, VAR_N2L
                    157:        movl    %ecx, %esi              C n1l
                    158:
                    159:        bsrl    %ebx, %ecx              C index of top set bit of n2h (nonzero here)
                    160:
                    161:        movl    %ebx, VAR_N2H
                    162:        notl    %ecx                    C n2h leading zeros (low 5 bits)
                    163:
                    164:        shldl(  %cl, %eax, %ebx)        C n2h normalized
                    165:
                    166:        shll    %cl, %eax               C n2l normalized
                    167:        movl    %edx, %edi              C n1h
                    168:
                    169:        movl    %eax, VAR_N2L_NORM
                    170:        xorl    %ebp, %ebp
                    171:
                    172:        shldl(  %cl, %edi, %ebp)        C n1h shifted
                    173:        shldl(  %cl, %esi, %edi)        C n1m shifted
                    174:
                    175:        shll    %cl, %esi               C n1l shifted
                    176:        movl    %ebp, %edx
                    177:
                    178:        movl    %edi, %eax
                    179:
                    180:        divl    %ebx                    C n1h:n1m / n2h
                    181:
                    182:        movl    %edx, %edi              C n1h:n1m:n1l - q*n2h
                    183:        movl    VAR_N2L_NORM, %edx
                    184:
                    185:        mull    %edx                    C q*n2l
                    186:
                    187:        subl    %eax, %esi
                    188:        movl    VAR_N2L_NORM, %ebp
                    189:
                    190:        sbbl    %edx, %edi              C n1h:n1m:n1l - q*(n2h:n2l)
                    191:
                    192:        jnc     L(div_done)
                    193:        addl    %ebp, %esi
                    194:
                    195:        adcl    %ebx, %edi              C addback n2h:n2l
                    196:
                    197:        jc      L(div_done)
                    198:        addl    %ebp, %esi
                    199:
                    200:        adcl    %ebx, %edi              C further addback n2h:n2l
                    201:        ASSERT(c)
                    202:
                    203: L(div_done):
                    204:        shrdl(  %cl, %edi, %esi)
                    205:
                    206:        shrl    %cl, %edi               C unshift n1m:n1l remainder
                    207:        movl    %esi, %eax              C new n2 l = remainder l
                    208:
                    209:        movl    VAR_N2L, %ecx           C new n1 l = old n2 l
                    210:        movl    %edi, %ebx              C new n2 h = remainder h
                    211:
                    212:        movl    VAR_N2H, %edx           C new n1 h = old n2 h
                    213:        orl     %ebx, %ebx
                    214:
                    215:        jnz     L(entry)                C keep going while n2 h != 0
                    216:
                    217: C eax = n2 l is the return value.  Reload the registers stored in the prologue.
                    218: L(done):
                    219:        movl    SAVE_EBX, %ebx
                    220:        movl    SAVE_ESI, %esi
                    221:        movl    SAVE_EDI, %edi
                    222:        movl    SAVE_EBP, %ebp
                    223:        addl    $STACK_SPACE, %esp
                    224:        ret
                    225:
                    226: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>