[BACK]Return to gcd_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/gcd_1.asm, Revision 1.1

1.1     ! ohara       1: dnl  AMD K6 mpn_mod_1 -- mpn by 1 gcd.
        !             2:
        !             3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C K6: 9.5 cycles/bit (approx)   1x1 gcd
        !            26: C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
        !            27:
        !            28:
        !            29: C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
        !            30: C
        !            31: C This code is nothing very special, but offers a speedup over what gcc 2.95
        !            32: C can do with mpn/generic/gcd_1.c.
        !            33: C
        !            34: C Future:
        !            35: C
        !            36: C Using a lookup table to count trailing zeros seems a touch quicker, but
        !            37: C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
        !            38: C it too.
        !            39:
        !            40:
        !            41: dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
        !            42: dnl  bigger than y, then a division x%y is done to reduce it.
        !            43: dnl
        !            44: dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
        !            45: dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
        !            46: dnl  what's found.
        !            47:
        !            48: deflit(DIV_THRESHOLD, 5)
        !            49:
        !            50:
        !            51: defframe(PARAM_LIMB, 12)
        !            52: defframe(PARAM_SIZE,  8)
        !            53: defframe(PARAM_SRC,   4)
        !            54:
        !            55:        TEXT
        !            56:        ALIGN(16)
        !            57:
        !            58: PROLOGUE(mpn_gcd_1)
        !            59: deflit(`FRAME',0)
        !            60:
        !            61:        ASSERT(ne, `cmpl $0, PARAM_LIMB')
        !            62:        ASSERT(ae, `cmpl $1, PARAM_SIZE')
        !            63:
        !            64:
        !            65:        movl    PARAM_SRC, %eax
        !            66:        pushl   %ebx                    FRAME_pushl()
        !            67:
        !            68:        movl    PARAM_LIMB, %edx
        !            69:        movl    $-1, %ecx
        !            70:
        !            71:        movl    (%eax), %ebx            C src low limb
        !            72:
        !            73:        movl    %ebx, %eax              C src low limb
        !            74:        orl     %edx, %ebx
        !            75:
        !            76: L(common_twos):
        !            77:        shrl    %ebx
        !            78:        incl    %ecx
        !            79:
        !            80:        jnc     L(common_twos)          C 1/4 chance on random data
        !            81:        shrl    %cl, %edx               C y
        !            82:
        !            83:        cmpl    $1, PARAM_SIZE
        !            84:        ja      L(size_two_or_more)
        !            85:
        !            86:
        !            87:        ASSERT(nz, `orl %eax, %eax')    C should have src limb != 0
        !            88:
        !            89:        shrl    %cl, %eax               C x
        !            90:
        !            91:
        !            92:        C Swap if necessary to make x>=y.  Measures a touch quicker as a
        !            93:        C jump than a branch free calculation.
        !            94:        C
        !            95:        C eax   x
        !            96:        C ebx
        !            97:        C ecx   common twos
        !            98:        C edx   y
        !            99:
        !           100:        movl    %eax, %ebx
        !           101:        cmpl    %eax, %edx
        !           102:
        !           103:        jb      L(noswap)
        !           104:        movl    %edx, %eax
        !           105:
        !           106:        movl    %ebx, %edx
        !           107:        movl    %eax, %ebx
        !           108: L(noswap):
        !           109:
        !           110:
        !           111:        C See if it's worth reducing x with a divl.
        !           112:        C
        !           113:        C eax   x
        !           114:        C ebx   x
        !           115:        C ecx   common twos
        !           116:        C edx   y
        !           117:
        !           118:        shrl    $DIV_THRESHOLD, %ebx
        !           119:
        !           120:        cmpl    %ebx, %edx
        !           121:        ja      L(nodiv)
        !           122:
        !           123:
        !           124:        C Reduce x to x%y.
        !           125:        C
        !           126:        C eax   x
        !           127:        C ebx
        !           128:        C ecx   common twos
        !           129:        C edx   y
        !           130:
        !           131:        movl    %edx, %ebx
        !           132:        xorl    %edx, %edx
        !           133:
        !           134:        divl    %ebx
        !           135:
        !           136:        orl     %edx, %edx      C y
        !           137:        nop     C code alignment
        !           138:
        !           139:        movl    %ebx, %eax      C x
        !           140:        jz      L(done_shll)
        !           141: L(nodiv):
        !           142:
        !           143:
        !           144:        C eax   x
        !           145:        C ebx
        !           146:        C ecx   common twos
        !           147:        C edx   y
        !           148:        C esi
        !           149:        C edi
        !           150:        C ebp
        !           151:
        !           152: L(strip_y):
        !           153:        shrl    %edx
        !           154:        jnc     L(strip_y)
        !           155:
        !           156:        leal    1(%edx,%edx), %edx
        !           157:        movl    %ecx, %ebx      C common twos
        !           158:
        !           159:        leal    1(%eax), %ecx
        !           160:        jmp     L(strip_x_and)
        !           161:
        !           162:
        !           163: C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
        !           164: C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
        !           165: C only 1/4.
        !           166: C
        !           167: C A second computed %cl shift was tried, but that measured a touch slower
        !           168: C than branching back.
        !           169: C
        !           170: C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
        !           171: C measured about 1 cycle/bit slower.
        !           172:
        !           173:        C eax   x
        !           174:        C ebx   common twos
        !           175:        C ecx   scratch
        !           176:        C edx   y
        !           177:
        !           178:        ALIGN(4)
        !           179: L(swap):
        !           180:        addl    %eax, %edx      C x-y+y = x
        !           181:        negl    %eax            C -(x-y) = y-x
        !           182:
        !           183: L(strip_x):
        !           184:        shrl    %eax            C odd-odd = even, so always one to strip
        !           185:        ASSERT(nz)
        !           186:
        !           187: L(strip_x_leal):
        !           188:        leal    1(%eax), %ecx
        !           189:
        !           190: L(strip_x_and):
        !           191:        andl    $1, %ecx        C (x^1)&1
        !           192:
        !           193:        shrl    %cl, %eax       C shift if x even
        !           194:
        !           195:        testb   $1, %al
        !           196:        jz      L(strip_x)
        !           197:
        !           198:        ASSERT(nz,`testl $1, %eax')     C x, y odd
        !           199:        ASSERT(nz,`testl $1, %edx')
        !           200:
        !           201:        subl    %edx, %eax
        !           202:        jb      L(swap)
        !           203:        ja      L(strip_x)
        !           204:
        !           205:
        !           206:        movl    %edx, %eax
        !           207:        movl    %ebx, %ecx
        !           208:
        !           209: L(done_shll):
        !           210:        shll    %cl, %eax
        !           211:        popl    %ebx
        !           212:
        !           213:        ret
        !           214:
        !           215:
        !           216: C -----------------------------------------------------------------------------
        !           217: C Two or more limbs.
        !           218: C
        !           219: C x={src,size} is reduced modulo y using either a plain mod_1 style
        !           220: C remainder, or a modexact_1 style exact division.
        !           221:
        !           222: deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
        !           223:
        !           224:        ALIGN(8)
        !           225: L(size_two_or_more):
        !           226:        C eax
        !           227:        C ebx
        !           228:        C ecx   common twos
        !           229:        C edx   y, without common twos
        !           230:        C esi
        !           231:        C edi
        !           232:        C ebp
        !           233:
        !           234: deflit(FRAME_TWO_OR_MORE, FRAME)
        !           235:
        !           236:        pushl   %edi            defframe_pushl(SAVE_EDI)
        !           237:        movl    PARAM_SRC, %ebx
        !           238:
        !           239: L(y_twos):
        !           240:        shrl    %edx
        !           241:        jnc     L(y_twos)
        !           242:
        !           243:        movl    %ecx, %edi              C common twos
        !           244:        movl    PARAM_SIZE, %ecx
        !           245:
        !           246:        pushl   %esi            defframe_pushl(SAVE_ESI)
        !           247:        leal    1(%edx,%edx), %esi      C y (odd)
        !           248:
        !           249:        movl    -4(%ebx,%ecx,4), %eax   C src high limb
        !           250:
        !           251:        cmpl    %edx, %eax              C carry if high<divisor
        !           252:
        !           253:        sbbl    %edx, %edx              C -1 if high<divisor
        !           254:
        !           255:        addl    %edx, %ecx              C skip one limb if high<divisor
        !           256:        andl    %eax, %edx
        !           257:
        !           258:        cmpl    $MODEXACT_THRESHOLD, %ecx
        !           259:        jae     L(modexact)
        !           260:
        !           261:
        !           262: L(divide_top):
        !           263:        C eax   scratch (quotient)
        !           264:        C ebx   src
        !           265:        C ecx   counter, size-1 to 1
        !           266:        C edx   carry (remainder)
        !           267:        C esi   divisor (odd)
        !           268:        C edi
        !           269:        C ebp
        !           270:
        !           271:        movl    -4(%ebx,%ecx,4), %eax
        !           272:        divl    %esi
        !           273:        loop    L(divide_top)
        !           274:
        !           275:
        !           276:        movl    %edx, %eax      C x
        !           277:        movl    %esi, %edx      C y (odd)
        !           278:
        !           279:        movl    %edi, %ebx      C common twos
        !           280:        popl    %esi
        !           281:
        !           282:        popl    %edi
        !           283:        leal    1(%eax), %ecx
        !           284:
        !           285:        orl     %eax, %eax
        !           286:        jnz     L(strip_x_and)
        !           287:
        !           288:
        !           289:        movl    %ebx, %ecx
        !           290:        movl    %edx, %eax
        !           291:
        !           292:        shll    %cl, %eax
        !           293:        popl    %ebx
        !           294:
        !           295:        ret
        !           296:
        !           297:
        !           298:        ALIGN(8)
        !           299: L(modexact):
        !           300:        C eax
        !           301:        C ebx   src ptr
        !           302:        C ecx   size or size-1
        !           303:        C edx
        !           304:        C esi   y odd
        !           305:        C edi   common twos
        !           306:        C ebp
        !           307:
        !           308:        movl    PARAM_SIZE, %eax
        !           309:        pushl   %esi            FRAME_pushl()
        !           310:
        !           311:        pushl   %eax            FRAME_pushl()
        !           312:
        !           313:        pushl   %ebx            FRAME_pushl()
        !           314:
        !           315: ifdef(`PIC',`
        !           316:        nop     C code alignment
        !           317:        call    L(movl_eip_ebx)
        !           318: L(here):
        !           319:        addl    $_GLOBAL_OFFSET_TABLE_, %ebx
        !           320:         call   GSYM_PREFIX`'mpn_modexact_1_odd@PLT
        !           321: ',`
        !           322:        call    GSYM_PREFIX`'mpn_modexact_1_odd
        !           323: ')
        !           324:
        !           325:        movl    %esi, %edx              C y odd
        !           326:        movl    SAVE_ESI, %esi
        !           327:
        !           328:        movl    %edi, %ebx              C common twos
        !           329:        movl    SAVE_EDI, %edi
        !           330:
        !           331:        addl    $eval(FRAME - FRAME_TWO_OR_MORE), %esp
        !           332:        orl     %eax, %eax
        !           333:
        !           334:        leal    1(%eax), %ecx
        !           335:        jnz     L(strip_x_and)
        !           336:
        !           337:
        !           338:        movl    %ebx, %ecx
        !           339:        movl    %edx, %eax
        !           340:
        !           341:        shll    %cl, %eax
        !           342:        popl    %ebx
        !           343:
        !           344:        ret
        !           345:
        !           346:
        !           347: ifdef(`PIC',`
        !           348: L(movl_eip_ebx):
        !           349:        movl    (%esp), %ebx
        !           350:        ret
        !           351: ')
        !           352:
        !           353: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>