[BACK]Return to gcd_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/gcd_1.asm, Revision 1.1

1.1     ! ohara       1: dnl  AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
        !             2:
        !             3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C K7: 6.75 cycles/bit (approx)  1x1 gcd
        !            26: C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
        !            27:
        !            28:
        !            29: dnl  Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
        !            30: dnl  where x is the larger of the two.  See tune/README for more.
        !            31: dnl
        !            32: dnl  divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
        !            33: dnl  suggests 40/7*2=11.4 but 7 seems to be about right.
        !            34:
        !            35: deflit(DIV_THRESHOLD, 7)
        !            36:
        !            37:
        !            38: C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
        !            39: C
        !            40: C This is mixed in with the code, but as per the k7 optimization manual it's
        !            41: C a full cache line and suitably aligned so it won't get swapped between
        !            42: C code and data.  Having it in TEXT rather than RODATA saves needing a GOT
        !            43: C entry when PIC.
        !            44: C
        !            45: C Actually, there doesn't seem to be a measurable difference between this in
        !            46: C its own cache line or plonked in the middle of the code.  Presumably
        !            47: C since TEXT is read-only there's no worries about coherency.
        !            48:
        !            49: deflit(MASK, 63)
        !            50: deflit(MAXSHIFT, 6)
        !            51:
        !            52:        TEXT
        !            53:        ALIGN(64)       C 1 sentinel byte + MASK entries = 64 bytes, aligned to 64
        !            54: L(table):
        !            55:        .byte   MAXSHIFT        C table[0]: index bits all zero, user shifts MAXSHIFT and looks up again
        !            56: forloop(i,1,MASK,
        !            57: `      .byte   m4_count_trailing_zeros(i)
        !            58: ')
        !            59:
        !            60:
        !            61: C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
        !            62: C
        !            63:
        !            64: defframe(PARAM_LIMB,   12)     C third argument, the single limb y
        !            65: defframe(PARAM_SIZE,    8)     C second argument, number of limbs at src
        !            66: defframe(PARAM_SRC,     4)     C first argument, pointer to the limbs of x
        !            67: C Slots used to preserve registers that the routine overwrites.
        !            68: defframe(SAVE_EBX,     -4)
        !            69: defframe(SAVE_ESI,     -8)
        !            70: defframe(SAVE_EDI,    -12)     C written and reloaded only in the PIC build
        !            71: defframe(SAVE_EBP,    -16)     C written only on the size>1 paths
        !            72: defframe(CALL_DIVISOR,-20)     C outgoing arguments for the mpn_modexact_1_odd call
        !            73: defframe(CALL_SIZE,   -24)
        !            74: defframe(CALL_SRC,    -28)
        !            75: C 28 bytes covers the seven 4-byte slots above, the lowest being at -28.
        !            76: deflit(STACK_SPACE, 28)
        !            77:
        !            78:        TEXT
        !            79:        ALIGN(16)
        !            80:
        !            81: PROLOGUE(mpn_gcd_1)
        !            82: deflit(`FRAME',0)
        !            83: C Entry: load the arguments, count the twos common to x and y, then branch on size.
        !            84:        ASSERT(ne, `cmpl $0, PARAM_LIMB')       C y!=0
        !            85:        ASSERT(ae, `cmpl $1, PARAM_SIZE')       C size>=1
        !            86:
        !            87:        movl    PARAM_SRC, %eax         C eax = src
        !            88:        movl    PARAM_LIMB, %edx        C edx = y
        !            89:        subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
        !            90:
        !            91:        movl    %esi, SAVE_ESI
        !            92:        movl    %ebx, SAVE_EBX
        !            93:
        !            94:        movl    (%eax), %esi            C src low limb
        !            95: C [PIC] edi = address of L(table), formed from the return address of the call below.
        !            96: ifdef(`PIC',`
        !            97:        movl    %edi, SAVE_EDI
        !            98:        call    L(movl_eip_to_edi)
        !            99: L(here):
        !           100:        addl    $L(table)-L(here), %edi
        !           101: ')
        !           102:
        !           103:        movl    %esi, %ebx      C ebx = x low limb
        !           104:        orl     %edx, %esi      C x|y
        !           105:        movl    $-1, %ecx       C count starts at -1 because the loop increments before testing
        !           106:
        !           107: L(twos):
        !           108:        incl    %ecx
        !           109:        shrl    %esi            C carry = next low bit of x|y
        !           110:        jnc     L(twos)         C 3/4 chance of x or y odd already
        !           111:
        !           112:        shrl    %cl, %ebx       C drop the common twos from x low limb
        !           113:        shrl    %cl, %edx       C and from y
        !           114:        movl    %ecx, %esi      C common twos
        !           115:
        !           116:        movl    PARAM_SIZE, %ecx
        !           117:        cmpl    $1, %ecx
        !           118:        ja      L(divide)       C size>1 first reduces x to a single limb
        !           119:
        !           120:
        !           121:        C eax
        !           122:        C ebx   x
        !           123:        C ecx
        !           124:        C edx   y
        !           125:        C esi   common twos
        !           126:        C edi   [PIC] L(table)
        !           127:        C ebp
        !           128: C size==1: 1x1 gcd.  Order the operands so eax holds the larger and ebx the smaller.
        !           129:        movl    %edx, %eax
        !           130:        cmpl    %ebx, %edx      C carry if y < x
        !           131:
        !           132:        cmovb(  %ebx, %eax)     C swap to make x bigger than y
        !           133:        cmovb(  %edx, %ebx)
        !           134:
        !           135:
        !           136: L(strip_y):
        !           137:        C eax   x
        !           138:        C ebx   y
        !           139:        C ecx
        !           140:        C edx
        !           141:        C esi   common twos
        !           142:        C edi   [PIC] L(table)
        !           143:        C ebp
        !           144:
        !           145:        ASSERT(nz,`orl %ebx,%ebx')
        !           146:        shrl    %ebx            C shift out low bits until a 1 falls into carry
        !           147:        jnc     L(strip_y)
        !           148:        rcll    %ebx            C rotate that 1 back in, leaving y odd
        !           149:
        !           150:
        !           151:        C eax   x
        !           152:        C ebx   y (odd)
        !           153:        C ecx
        !           154:        C edx
        !           155:        C esi   common twos
        !           156:        C edi   [PIC] L(table)
        !           157:        C ebp
        !           158:
        !           159:        movl    %eax, %ecx      C ecx = x, the copy L(strip_x_entry) expects
        !           160:        movl    %ebx, %edx      C edx = y
        !           161:        shrl    $DIV_THRESHOLD, %eax
        !           162:
        !           163:        cmpl    %eax, %ebx      C y against x>>DIV_THRESHOLD
        !           164:        movl    %ecx, %eax      C restore eax = x, flags untouched
        !           165:        ja      L(strip_x_entry)        C do x%y if x much bigger than y
        !           166:
        !           167: C Here y <= x>>DIV_THRESHOLD, so reduce x with a single division first.
        !           168:        xorl    %edx, %edx              C high half of the dividend is zero
        !           169:
        !           170:        divl    %ebx                    C eax = x/y, edx = x%y
        !           171:
        !           172:        orl     %edx, %edx              C ZF set if the remainder is zero
        !           173:        movl    %edx, %ecx              C remainder -> x, in ecx since L(strip_x) reloads eax from ecx
        !           174:        movl    %ebx, %edx              C y
        !           175:
        !           176:        jz      L(done_ebx)             C y divides x, so the odd part of the gcd is y
        !           177:        jmp     L(strip_x)
        !           178:
        !           179:
        !           180:        C Offset 0x9D here for non-PIC.  About 0.4 cycles/bit is saved by
        !           181:        C ensuring the end of the jnz at the end of this loop doesn't cross
        !           182:        C into the next cache line at 0xC0.
        !           183:        C
        !           184:        C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
        !           185:        C it crosses but doesn't suffer any measurable slowdown.
        !           186: C Main loop: make x odd, then replace (x,y) by (|x-y|, min(x,y)) until x==y.
        !           187: L(top):
        !           188:        C eax   x
        !           189:        C ebx   y-x
        !           190:        C ecx   x-y
        !           191:        C edx   y
        !           192:        C esi   twos, for use at end
        !           193:        C edi   [PIC] L(table)
        !           194:
        !           195:        cmovc(  %ebx, %ecx)             C if x-y gave carry, use x and y-x
        !           196:        cmovc(  %eax, %edx)
        !           197:
        !           198: L(strip_x):
        !           199:        movl    %ecx, %eax              C eax = value to strip, ecx keeps a copy for the table index
        !           200: L(strip_x_entry):
        !           201:        andl    $MASK, %ecx             C low bits of x select the table entry
        !           202:
        !           203:        ASSERT(nz, `orl %eax, %eax')
        !           204:
        !           205: ifdef(`PIC',`
        !           206:        movb    (%ecx,%edi), %cl
        !           207: ',`
        !           208:        movb    L(table) (%ecx), %cl
        !           209: ')
        !           210:
        !           211:        shrl    %cl, %eax               C drop the trailing zeros found, at most MAXSHIFT per pass
        !           212:        cmpb    $MAXSHIFT, %cl          C MAXSHIFT means the index bits were all zero
        !           213:
        !           214:        movl    %eax, %ecx              C ecx = x, becomes x-y below
        !           215:        movl    %edx, %ebx              C ebx = y, becomes y-x below
        !           216:        je      L(strip_x)              C more zeros may remain, so look up again
        !           217:
        !           218:        ASSERT(nz, `testl $1, %eax')    C both odd
        !           219:        ASSERT(nz, `testl $1, %edx')
        !           220:
        !           221:        subl    %eax, %ebx              C y-x
        !           222:        subl    %edx, %ecx              C x-y, carry if x<y, zero if x==y
        !           223:        jnz     L(top)                  C falls through with eax = x = y
        !           224:
        !           225:
        !           226: L(done):
        !           227:        movl    %esi, %ecx              C common twos, as the shift count
        !           228:        movl    SAVE_ESI, %esi
        !           229: ifdef(`PIC',`
        !           230:        movl    SAVE_EDI, %edi
        !           231: ')
        !           232:
        !           233:        shll    %cl, %eax               C put the common twos back into the result
        !           234:        movl    SAVE_EBX, %ebx
        !           235:        addl    $FRAME, %esp            C FRAME is STACK_SPACE here, so this undoes the entry subl
        !           236:
        !           237:        ret                             C result in eax
        !           238:
        !           239:
        !           240:
        !           241: C -----------------------------------------------------------------------------
        !           242: C two or more limbs
        !           243: C x is reduced modulo the odd part of y to one limb, then joins the 1x1 loop.
        !           244: dnl  MODEXACT_THRESHOLD is the size at which it's better to call
        !           245: dnl  mpn_modexact_1_odd than do an inline loop.
        !           246:
        !           247: deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
        !           248:
        !           249: L(divide):
        !           250:        C eax   src
        !           251:        C ebx
        !           252:        C ecx   size
        !           253:        C edx   y
        !           254:        C esi   common twos
        !           255:        C edi   [PIC] L(table)
        !           256:        C ebp
        !           257:
        !           258: L(divide_strip_y):
        !           259:        ASSERT(nz,`orl %edx,%edx')
        !           260:        shrl    %edx                            C shift until a 1 bit falls into carry
        !           261:        jnc     L(divide_strip_y)
        !           262:        leal    1(%edx,%edx), %ebx              C y now odd
        !           263:
        !           264:        movl    %ebp, SAVE_EBP
        !           265:        movl    %eax, %ebp                      C ebp = src
        !           266:        movl    -4(%eax,%ecx,4), %eax           C src high limb
        !           267:
        !           268:        cmp     $MODEXACT_THRESHOLD, %ecx
        !           269:        jae     L(modexact)                     C big enough to be worth the call
        !           270:
        !           271:        cmpl    %ebx, %eax                      C high cmp divisor
        !           272:        movl    $0, %edx                        C movl leaves the cmpl flags intact
        !           273:
        !           274:        cmovc(  %eax, %edx)                     C skip a div if high<divisor
        !           275:        sbbl    $0, %ecx                        C one limb fewer to divide in that case
        !           276:
        !           277:
        !           278: L(divide_top):
        !           279:        C eax   scratch (quotient)
        !           280:        C ebx   y
        !           281:        C ecx   counter (size to 1, inclusive)
        !           282:        C edx   carry (remainder)
        !           283:        C esi   common twos
        !           284:        C edi   [PIC] L(table)
        !           285:        C ebp   src
        !           286:
        !           287:        movl    -4(%ebp,%ecx,4), %eax           C next limb, working from high to low
        !           288:
        !           289:        divl    %ebx                            C edx:eax / y, remainder stays in edx
        !           290:
        !           291:        decl    %ecx
        !           292:        jnz     L(divide_top)
        !           293:
        !           294:
        !           295:        C eax
        !           296:        C ebx   y (odd)
        !           297:        C ecx
        !           298:        C edx   x
        !           299:        C esi   common twos
        !           300:        C edi   [PIC] L(table)
        !           301:        C ebp
        !           302:
        !           303:        orl     %edx, %edx                      C ZF set if the remainder is zero
        !           304:        movl    SAVE_EBP, %ebp
        !           305:        movl    %edx, %eax                      C eax = x
        !           306:
        !           307:        movl    %edx, %ecx                      C ecx = x, as L(strip_x_entry) expects
        !           308:        movl    %ebx, %edx                      C edx = y
        !           309:        jnz     L(strip_x_entry)
        !           310:
        !           311: C Remainder zero: the odd part of the gcd is y.
        !           312: L(done_ebx):
        !           313:        movl    %ebx, %eax
        !           314:        jmp     L(done)
        !           315:
        !           316:
        !           317:
        !           318: L(modexact):
        !           319:        C eax
        !           320:        C ebx   y
        !           321:        C ecx   size
        !           322:        C edx
        !           323:        C esi   common twos
        !           324:        C edi   [PIC] L(table)
        !           325:        C ebp   src
        !           326: C Store src, size and y in the CALL_ slots and call mpn_modexact_1_odd.  The PIC variant first moves y to ebp and builds a _GLOBAL_OFFSET_TABLE_ based value in ebx for the PLT call.
        !           327: ifdef(`PIC',`
        !           328:        movl    %ebp, CALL_SRC
        !           329:        movl    %ebx, %ebp              C y
        !           330:        movl    %edi, %ebx              C L(table)
        !           331:
        !           332:        addl    $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
        !           333:        movl    %ebp, CALL_DIVISOR
        !           334:        movl    %ecx, CALL_SIZE
        !           335:
        !           336:         call   GSYM_PREFIX`'mpn_modexact_1_odd@PLT
        !           337: ',`
        !           338: dnl non-PIC
        !           339:        movl    %ebx, CALL_DIVISOR
        !           340:        movl    %ebp, CALL_SRC
        !           341:        movl    %ecx, CALL_SIZE
        !           342:
        !           343:        call    GSYM_PREFIX`'mpn_modexact_1_odd
        !           344: ')
        !           345:
        !           346:        C eax   x
        !           347:        C ebx   [non-PIC] y
        !           348:        C ecx
        !           349:        C edx
        !           350:        C esi   common twos
        !           351:        C edi   [PIC] L(table)
        !           352:        C ebp   [PIC] y
        !           353:
        !           354:        orl     %eax, %eax              C ZF set if the returned x is zero
        !           355:        movl    ifdef(`PIC',`%ebp',`%ebx'), %edx        C edx = y
        !           356:        movl    SAVE_EBP, %ebp
        !           357:
        !           358:        movl    %eax, %ecx              C ecx = x, as L(strip_x_entry) expects
        !           359:        jnz     L(strip_x_entry)
        !           360: C Returned x is zero: the odd part of the gcd is y.
        !           361:        movl    %edx, %eax
        !           362:        jmp     L(done)
        !           363:
        !           364:
        !           365: ifdef(`PIC', `
        !           366: L(movl_eip_to_edi):
        !           367:        movl    (%esp), %edi    C edi = return address, which is L(here) at the only call site
        !           368:        ret
        !           369: ')
        !           370:
        !           371: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>