Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mod_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel P6 mpn_mod_1 -- mpn by limb remainder.
! 2:
! 3: dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C P6: 21.5 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
! 29: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 30: C mp_limb_t carry);
! 31: C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 32: C mp_limb_t inverse);
! 33: C
! 34: C The code here is in two parts, a simple divl loop and a mul-by-inverse.
! 35: C The divl is used by mod_1 and mod_1c for small sizes, until the savings in
! 36: C the mul-by-inverse can overcome the time to calculate an inverse.
! 37: C preinv_mod_1 goes straight to the mul-by-inverse.
! 38: C
! 39: C The mul-by-inverse normalizes the divisor (or for preinv_mod_1 it's
! 40: C already normalized). The calculation done is r=a%(d*2^n) followed by a
! 41: C final (r*2^n)%(d*2^n), where a is the dividend, d the divisor, and n is
! 42: C the number of leading zero bits on d. This means there's no bit shifts in
! 43: C the main loop, at the cost of an extra divide step at the end.
! 44: C
! 45: C The simple divl for mod_1 is able to skip one divide step if high<divisor.
! 46: C For mod_1c the carry parameter is the high of the first divide step, and
!            47: C no attempt is made to skip that step since carry==0 will be very rare.
! 48: C
! 49: C The mul-by-inverse always skips one divide step, but then needs an extra
! 50: C step at the end, unless the divisor was already normalized (n==0). This
! 51: C leads to different mul-by-inverse thresholds for normalized and
! 52: C unnormalized divisors, in mod_1 and mod_1c.
! 53: C
! 54: C Alternatives:
! 55: C
! 56: C If n is small then the extra divide step could be done by a few shift and
! 57: C trial subtract steps instead of a full divide. That would probably be 3
! 58: C or 4 cycles/bit, so say up to n=8 might benefit from that over a 21 cycle
! 59: C divide. However it's considered that small divisors, meaning biggish n,
! 60: C are more likely than small n, and that it's not worth the branch
! 61: C mispredicts of a loop.
! 62: C
! 63: C Past:
! 64: C
! 65: C There used to be some MMX based code for P-II and P-III, roughly following
! 66: C the K7 form, but it was slower (about 24.0 c/l) than the code here. That
! 67: C code did have an advantage that mod_1 was able to do one less divide step
! 68: C when high<divisor and the divisor unnormalized, but the speed advantage of
! 69: C the current code soon overcomes that.
! 70: C
! 71: C Future:
! 72: C
! 73: C It's not clear whether what's here is optimal. A rough count of micro-ops
! 74: C on the dependent chain would suggest a couple of cycles could be shaved,
! 75: C perhaps.
! 76:
! 77:
! 78: dnl The following thresholds are the sizes where the multiply by inverse
! 79: dnl method is used instead of plain divl's. Minimum value 2 each.
! 80: dnl
! 81: dnl MUL_NORM_THRESHOLD is for normalized divisors (high bit set),
! 82: dnl MUL_UNNORM_THRESHOLD for unnormalized divisors.
! 83: dnl
! 84: dnl With the divl loop at 39 c/l, and the inverse loop at 21.5 c/l but
! 85: dnl setups for the inverse of about 50, the threshold should be around
! 86: dnl 50/(39-21.5)==2.85. An unnormalized divisor gets an extra divide step
! 87: dnl at the end, so if that's about 25 cycles then that threshold might be
! 88: dnl around (50+25)/(39-21.5) == 4.3.
! 89:
! 90: deflit(MUL_NORM_THRESHOLD, 4)
! 91: deflit(MUL_UNNORM_THRESHOLD, 5)
! 92:
! 93: deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))
! 94:
! 95:
! 96: defframe(PARAM_INVERSE, 16) dnl mpn_preinv_mod_1
! 97: defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
! 98: defframe(PARAM_DIVISOR, 12)
! 99: defframe(PARAM_SIZE, 8)
! 100: defframe(PARAM_SRC, 4)
! 101:
! 102: defframe(SAVE_EBX, -4)
! 103: defframe(SAVE_ESI, -8)
! 104: defframe(SAVE_EDI, -12)
! 105: defframe(SAVE_EBP, -16)
! 106:
! 107: defframe(VAR_NORM, -20)
! 108: defframe(VAR_INVERSE, -24)
! 109:
! 110: deflit(STACK_SPACE, 24)
! 111:
! 112: TEXT
! 113:
! 114: ALIGN(16)
! 115: PROLOGUE(mpn_preinv_mod_1)
! 116: deflit(`FRAME',0)
! 117:
! 118: movl PARAM_SRC, %edx
! 119: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
! 120:
! 121: movl %ebx, SAVE_EBX
! 122: movl PARAM_SIZE, %ebx
! 123:
! 124: movl %ebp, SAVE_EBP
! 125: movl PARAM_DIVISOR, %ebp
! 126:
! 127: movl %esi, SAVE_ESI
! 128: movl PARAM_INVERSE, %eax
! 129:
! 130: movl %edi, SAVE_EDI
! 131: movl -4(%edx,%ebx,4), %edi C src high limb
! 132:
! 133: movl $0, VAR_NORM
! 134: leal -8(%edx,%ebx,4), %ecx C &src[size-2]
! 135:
! 136: C
! 137:
! 138: movl %edi, %esi
! 139: subl %ebp, %edi C high-divisor
! 140:
! 141: cmovc( %esi, %edi) C restore if underflow
! 142: decl %ebx
! 143: jnz L(preinv_entry)
! 144:
! 145: jmp L(done_edi)
! 146:
! 147: EPILOGUE()
! 148:
! 149:
! 150: ALIGN(16)
! 151: PROLOGUE(mpn_mod_1c)
! 152: deflit(`FRAME',0)
! 153:
! 154: movl PARAM_SIZE, %ecx
! 155: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
! 156:
! 157: movl %ebp, SAVE_EBP
! 158: movl PARAM_DIVISOR, %eax
! 159:
! 160: movl %esi, SAVE_ESI
! 161: movl PARAM_CARRY, %edx
! 162:
! 163: movl PARAM_SRC, %esi
! 164: orl %ecx, %ecx
! 165: jz L(done_edx) C result==carry if size==0
! 166:
! 167: sarl $31, %eax
! 168: movl PARAM_DIVISOR, %ebp
! 169:
! 170: andl $MUL_NORM_DELTA, %eax
! 171:
! 172: addl $MUL_UNNORM_THRESHOLD, %eax
! 173:
! 174: cmpl %eax, %ecx
! 175: jb L(divide_top)
! 176:
! 177:
! 178: C The carry parameter pretends to be the src high limb.
! 179:
! 180: movl %ebx, SAVE_EBX
! 181: leal 1(%ecx), %ebx C size+1
! 182:
! 183: movl %edx, %eax C carry
! 184: jmp L(mul_by_inverse_1c)
! 185:
! 186: EPILOGUE()
! 187:
! 188:
! 189: ALIGN(16)
! 190: PROLOGUE(mpn_mod_1)
! 191: deflit(`FRAME',0)
! 192:
! 193: movl PARAM_SIZE, %ecx
! 194: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
! 195: movl $0, %edx C initial carry (if can't skip a div)
! 196:
! 197: movl %esi, SAVE_ESI
! 198: movl PARAM_SRC, %eax
! 199:
! 200: movl %ebp, SAVE_EBP
! 201: movl PARAM_DIVISOR, %ebp
! 202:
! 203: movl PARAM_DIVISOR, %esi
! 204: orl %ecx, %ecx
! 205: jz L(done_edx)
! 206:
! 207: movl -4(%eax,%ecx,4), %eax C src high limb
! 208:
! 209: sarl $31, %ebp
! 210:
! 211: andl $MUL_NORM_DELTA, %ebp
! 212:
! 213: addl $MUL_UNNORM_THRESHOLD, %ebp
! 214: cmpl %esi, %eax C carry flag if high<divisor
! 215:
! 216: cmovc( %eax, %edx) C src high limb as initial carry
! 217: movl PARAM_SRC, %esi
! 218:
! 219: sbbl $0, %ecx C size-1 to skip one div
! 220: jz L(done_eax) C done if had size==1
! 221:
! 222: cmpl %ebp, %ecx
! 223: movl PARAM_DIVISOR, %ebp
! 224: jae L(mul_by_inverse)
! 225:
! 226:
! 227: L(divide_top):
! 228: C eax scratch (quotient)
! 229: C ebx
! 230: C ecx counter, limbs, decrementing
! 231: C edx scratch (remainder)
! 232: C esi src
! 233: C edi
! 234: C ebp divisor
! 235:
! 236: movl -4(%esi,%ecx,4), %eax
! 237:
! 238: divl %ebp
! 239:
! 240: decl %ecx
! 241: jnz L(divide_top)
! 242:
! 243:
! 244: L(done_edx):
! 245: movl %edx, %eax
! 246: L(done_eax):
! 247: movl SAVE_ESI, %esi
! 248:
! 249: movl SAVE_EBP, %ebp
! 250: addl $STACK_SPACE, %esp
! 251:
! 252: ret
! 253:
! 254:
! 255: C -----------------------------------------------------------------------------
! 256:
! 257: L(mul_by_inverse):
! 258: C eax src high limb
! 259: C ebx
! 260: C ecx
! 261: C edx
! 262: C esi src
! 263: C edi
! 264: C ebp divisor
! 265:
! 266: movl %ebx, SAVE_EBX
! 267: movl PARAM_SIZE, %ebx
! 268:
! 269: L(mul_by_inverse_1c):
! 270: bsrl %ebp, %ecx C 31-l
! 271:
! 272: movl %edi, SAVE_EDI
! 273: xorl $31, %ecx C l
! 274:
! 275: movl %ecx, VAR_NORM
! 276: shll %cl, %ebp C d normalized
! 277:
! 278: movl %eax, %edi C src high -> n2
! 279: subl %ebp, %eax
! 280:
! 281: cmovnc( %eax, %edi) C n2-divisor if no underflow
! 282:
! 283: movl $-1, %eax
! 284: movl $-1, %edx
! 285:
! 286: subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
! 287: leal -8(%esi,%ebx,4), %ecx C &src[size-2]
! 288:
! 289: divl %ebp C floor (b*(b-d)-1) / d
! 290:
! 291: L(preinv_entry):
! 292: movl %eax, VAR_INVERSE
! 293:
! 294:
! 295:
! 296: C No special scheduling of loads is necessary in this loop, out of order
! 297: C execution hides the latencies already.
! 298: C
! 299: C The way q1+1 is generated in %ebx and d is moved to %eax for the multiply
! 300: C seems fastest. The obvious change to generate q1+1 in %eax and then just
! 301: C multiply by %ebp (as per mpn/x86/pentium/mod_1.asm in fact) runs 1 cycle
! 302: C slower, for no obvious reason.
! 303:
! 304:
! 305: ALIGN(16)
! 306: L(inverse_top):
! 307: C eax n10 (then scratch)
! 308: C ebx scratch (nadj, q1)
! 309: C ecx src pointer, decrementing
! 310: C edx scratch
! 311: C esi n10
! 312: C edi n2
! 313: C ebp divisor
! 314:
! 315: movl (%ecx), %eax C next src limb
! 316: movl %eax, %esi
! 317:
! 318: sarl $31, %eax C -n1
! 319: movl %ebp, %ebx
! 320:
! 321: andl %eax, %ebx C -n1 & d
! 322: negl %eax C n1
! 323:
! 324: addl %edi, %eax C n2+n1
! 325:
! 326: mull VAR_INVERSE C m*(n2+n1)
! 327:
! 328: addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
! 329: subl $4, %ecx
! 330:
! 331: C
! 332:
! 333: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 334: leal 1(%edi), %ebx C n2+1
! 335: movl %ebp, %eax C d
! 336:
! 337: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 338: jz L(q1_ff)
! 339:
! 340: mull %ebx C (q1+1)*d
! 341:
! 342: C
! 343:
! 344: subl %eax, %esi C low n - (q1+1)*d
! 345:
! 346: sbbl %edx, %edi C high n - (q1+1)*d, 0 or -1
! 347:
! 348: andl %ebp, %edi C d if underflow
! 349:
! 350: addl %esi, %edi C remainder with addback if necessary
! 351:
! 352: cmpl PARAM_SRC, %ecx
! 353: jae L(inverse_top)
! 354:
! 355:
! 356: C -----------------------------------------------------------------------------
! 357: L(inverse_loop_done):
! 358:
! 359: C %edi is the remainder modulo d*2^n and now must be reduced to
! 360: C 0<=r<d by calculating r*2^n mod d*2^n and then right shifting by
! 361: C n. If d was already normalized on entry so that n==0 then nothing
! 362: C is needed here. The chance of n==0 is low, but it's true of say
! 363: C PP from gmp-impl.h.
! 364: C
! 365: C eax
! 366: C ebx
! 367: C ecx
! 368: C edx
! 369: C esi
! 370: C edi remainder
! 371: C ebp divisor (normalized)
! 372:
! 373: movl VAR_NORM, %ecx
! 374: movl $0, %esi
! 375:
! 376: orl %ecx, %ecx
! 377: jz L(done_edi)
! 378:
! 379:
! 380: C Here use %edi=n10 and %esi=n2, opposite to the loop above.
! 381: C
! 382: C The q1=0xFFFFFFFF case is handled with an sbbl to adjust q1+1
! 383: C back, rather than q1_ff special case code. This is simpler and
! 384: C costs only 2 uops.
! 385:
! 386: shldl( %cl, %edi, %esi)
! 387:
! 388: shll %cl, %edi
! 389:
! 390: movl %edi, %eax C n10
! 391: movl %ebp, %ebx C d
! 392:
! 393: sarl $31, %eax C -n1
! 394:
! 395: andl %eax, %ebx C -n1 & d
! 396: negl %eax C n1
! 397:
! 398: addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
! 399: addl %esi, %eax C n2+n1
! 400:
! 401: mull VAR_INVERSE C m*(n2+n1)
! 402:
! 403: C
! 404:
! 405: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 406: leal 1(%esi), %ebx C n2+1
! 407:
! 408: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 409:
! 410: sbbl $0, %ebx
! 411: movl %ebp, %eax C d
! 412:
! 413: mull %ebx C (q1+1)*d
! 414:
! 415: movl SAVE_EBX, %ebx
! 416:
! 417: C
! 418:
! 419: subl %eax, %edi C low n - (q1+1)*d is remainder
! 420:
! 421: sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1
! 422:
! 423: andl %ebp, %esi
! 424: movl SAVE_EBP, %ebp
! 425:
! 426: leal (%esi,%edi), %eax C remainder
! 427: movl SAVE_ESI, %esi
! 428:
! 429: shrl %cl, %eax C denorm remainder
! 430: movl SAVE_EDI, %edi
! 431: addl $STACK_SPACE, %esp
! 432:
! 433: ret
! 434:
! 435:
! 436: L(done_edi):
! 437: movl SAVE_EBX, %ebx
! 438: movl %edi, %eax
! 439:
! 440: movl SAVE_ESI, %esi
! 441:
! 442: movl SAVE_EDI, %edi
! 443:
! 444: movl SAVE_EBP, %ebp
! 445: addl $STACK_SPACE, %esp
! 446:
! 447: ret
! 448:
! 449:
! 450: C -----------------------------------------------------------------------------
! 451: C
! 452: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
! 453: C of q*d is simply -d and the remainder n-q*d = n10+d.
! 454: C
! 455: C This is reached only very rarely.
! 456:
! 457: L(q1_ff):
! 458: C eax (divisor)
! 459: C ebx (q1+1 == 0)
! 460: C ecx src pointer
! 461: C edx
! 462: C esi n10
! 463: C edi (n2)
! 464: C ebp divisor
! 465:
! 466: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
! 467:
! 468: cmpl PARAM_SRC, %ecx
! 469: jae L(inverse_top)
! 470:
! 471: jmp L(inverse_loop_done)
! 472:
! 473:
! 474: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>