OpenXM_contrib/gmp/mpn/x86/k6/aorsmul_1.asm - annotate

Return to aorsmul_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aorsmul_1.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
        !             2: dnl
        !             3: dnl  K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
        !             4: dnl  PIC adds about 6 cycles at the start.
        !             5:
        !             6:
        !             7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             8: dnl
        !             9: dnl  This file is part of the GNU MP Library.
        !            10: dnl
        !            11: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            12: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            13: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            14: dnl  License, or (at your option) any later version.
        !            15: dnl
        !            16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            19: dnl  Lesser General Public License for more details.
        !            20: dnl
        !            21: dnl  You should have received a copy of the GNU Lesser General Public
        !            22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            24: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            25:
        !            26:
        !            27: include(`../config.m4')
        !            28:
        !            29:
        !            30: dnl  K6:           large multipliers  small multipliers
        !            31: dnl  UNROLL_COUNT    cycles/limb       cycles/limb
        !            32: dnl        4             9.5              7.78
        !            33: dnl        8             9.0              7.78
        !            34: dnl       16             8.4              7.65
        !            35: dnl       32             8.4              8.2
        !            36: dnl
        !            37: dnl  Maximum possible unrolling with the current code is 32.
        !            38: dnl
        !            39: dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
        !            40: dnl  byte block, which might explain the good speed at that unrolling.
        !            41:
        !            42: deflit(UNROLL_COUNT, 16)        dnl  limbs handled per pass of the unrolled loop
        !            43: dnl  One of OPERATION_addmul_1 or OPERATION_submul_1 must be defined (m4_error
        !            44: dnl  otherwise; addmul is tested first).  It selects the opcode and names used below.
        !            45: ifdef(`OPERATION_addmul_1', `
        !            46:        define(M4_inst,        addl)
        !            47:        define(M4_function_1,  mpn_addmul_1)
        !            48:        define(M4_function_1c, mpn_addmul_1c)
        !            49:        define(M4_description, add it to)
        !            50:        define(M4_desc_retval, carry)
        !            51: ',`ifdef(`OPERATION_submul_1', `
        !            52:        define(M4_inst,        subl)
        !            53:        define(M4_function_1,  mpn_submul_1)
        !            54:        define(M4_function_1c, mpn_submul_1c)
        !            55:        define(M4_description, subtract it from)
        !            56:        define(M4_desc_retval, borrow)
        !            57: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
        !            58: ')')')
        !            59: dnl  NOTE(review): presumably declares the entry points this file can provide -- confirm.
        !            60: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
        !            61:
        !            62:
        !            63: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            64: C                          mp_limb_t mult);
        !            65: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            66: C                           mp_limb_t mult, mp_limb_t carry);
        !            67: C
        !            68: C Calculate src,size multiplied by mult and M4_description dst,size.
        !            69: C Return the M4_desc_retval limb from the top of the result.
        !            70: C
        !            71: C The jadcl0()s in the unrolled loop make the speed data dependent.  Small
        !            72: C multipliers (most significant few bits clear) result in few carry bits and
        !            73: C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
        !            74: C significant few bits set) make the carry bits 50/50 and lead to something
        !            75: C more like 8.4 c/l.  (With adcl's both of these would be 9.3 c/l.)
        !            76: C
        !            77: C It's important that the gains for jadcl0 on small multipliers don't come
        !            78: C at the cost of slowing down other data.  Tests on uniformly distributed
        !            79: C random data, designed to confound branch prediction, show about a 7%
        !            80: C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
        !            81: C overheads included).
        !            82: C
        !            83: C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
        !            84: C 11.0 cycles/limb), and hence isn't used.
        !            85: C
        !            86: C In the simple loop, note that running ecx from negative to zero and using
        !            87: C it as an index in the two movs wouldn't help.  It would save one
        !            88: C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
        !            89: C that would be collapsed by this.
        !            90: C
        !            91: C
        !            92: C jadcl0
        !            93: C ------
        !            94: C
        !            95: C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
        !            96: C firstly the instruction decoding and secondly the fact that there's a
        !            97: C carry bit for the jadcl0 only on average about 1/4 of the time.
        !            98: C
        !            99: C The code in the unrolled loop decodes something like the following.
        !           100: C
        !           101: C                                         decode cycles
        !           102: C              mull    %ebp                    2
        !           103: C              M4_inst %esi, disp(%edi)        1
        !           104: C              adcl    %eax, %ecx              2
        !           105: C              movl    %edx, %esi            \ 1
        !           106: C              jnc     1f                    /
        !           107: C              incl    %esi                  \ 1
        !           108: C      1:      movl    disp(%ebx), %eax      /
        !           109: C                                              ---
        !           110: C                                               7
        !           111: C
        !           112: C In a back-to-back style test this measures 7 with the jnc not taken, or 8
        !           113: C with it taken (both when correctly predicted).  This is opposite to the
        !           114: C measurements showing small multipliers running faster than large ones.
        !           115: C Watch this space for more info ...
        !           116: C
        !           117: C It's not clear how much branch misprediction might be costing.  The K6
        !           118: C doco says it will be 1 to 4 cycles, but presumably it's near the low end
        !           119: C of that range to get the measured results.
        !           120: C
        !           121: C
        !           122: C In the code the two carries are more or less the preceding mul product and
        !           123: C the calculation is roughly
        !           124: C
        !           125: C      x*y + u*b+v
        !           126: C
        !           127: C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
        !           128: C v are the two limbs it's added to (being the low of the next mul, and a
        !           129: C limb from the destination).
        !           130: C
        !           131: C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
        !           132: C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
        !           133: C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
        !           134: C and b-1, then the total probability can be summed over x and y,
        !           135: C
        !           136: C       1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
        !           137: C      --- * sum sum --- = --- * ------- * ------- = 1/4
        !           138: C       b^2   x=0 y=0 b^2   b^4      2         2
        !           139: C
        !           140: C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
        !           141: C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
        !           142:
        !           143: dnl  Sizes of UNROLL_THRESHOLD limbs or more take the unrolled loop; smaller ones use the simple loop.
        !           144: ifdef(`PIC',`
        !           145: deflit(UNROLL_THRESHOLD, 9)
        !           146: ',`
        !           147: deflit(UNROLL_THRESHOLD, 6)
        !           148: ')
        !           149: dnl  Stack offsets of the C arguments dst, src, size, mult and (1c entry only) carry.
        !           150: defframe(PARAM_CARRY,     20)
        !           151: defframe(PARAM_MULTIPLIER,16)
        !           152: defframe(PARAM_SIZE,      12)
        !           153: defframe(PARAM_SRC,       8)
        !           154: defframe(PARAM_DST,       4)
        !           155:
        !           156:        .text
        !           157:        ALIGN(32)
        !           158: C Carry-in entry point: put the caller supplied carry in esi, then join the code shared with M4_function_1.
        !           159: PROLOGUE(M4_function_1c)
        !           160:        pushl   %esi            C same push as the plain entry does before start_nc
        !           161: deflit(`FRAME',4)
        !           162:        movl    PARAM_CARRY, %esi       C initial carry
        !           163:        jmp     LF(M4_function_1,start_nc)
        !           164: EPILOGUE()
        !           165: C Plain entry point: no carry-in, so start with a zero carry and fall into the shared code.
        !           166: PROLOGUE(M4_function_1)
        !           167:        pushl   %esi
        !           168: deflit(`FRAME',4)
        !           169:        xorl    %esi, %esi      C initial carry
        !           170: C Shared by both entry points; esi holds the initial carry and has already been pushed.
        !           171: L(start_nc):
        !           172:        movl    PARAM_SIZE, %ecx
        !           173:        pushl   %ebx
        !           174: deflit(`FRAME',8)
        !           175:
        !           176:        movl    PARAM_SRC, %ebx
        !           177:        pushl   %edi
        !           178: deflit(`FRAME',12)
        !           179:
        !           180:        cmpl    $UNROLL_THRESHOLD, %ecx
        !           181:        movl    PARAM_DST, %edi
        !           182:
        !           183:        pushl   %ebp
        !           184: deflit(`FRAME',16)
        !           185:        jae     L(unroll)       C flags are still those of the cmpl; movl and pushl leave them alone
        !           186:
        !           187:
        !           188:        C simple loop
        !           189:
        !           190:        movl    PARAM_MULTIPLIER, %ebp
        !           191:
        !           192: L(simple):
        !           193:        C eax   scratch
        !           194:        C ebx   src
        !           195:        C ecx   counter
        !           196:        C edx   scratch
        !           197:        C esi   carry
        !           198:        C edi   dst
        !           199:        C ebp   multiplier
        !           200:
        !           201:        movl    (%ebx), %eax    C next src limb
        !           202:        addl    $4, %ebx
        !           203:
        !           204:        mull    %ebp            C edx:eax = limb * multiplier
        !           205:
        !           206:        addl    $4, %edi
        !           207:        addl    %esi, %eax      C add carry from the previous limb
        !           208:
        !           209:        adcl    $0, %edx
        !           210:
        !           211:        M4_inst %eax, -4(%edi)  C apply low half to the dst limb
        !           212:
        !           213:        adcl    $0, %edx        C pick up carry or borrow from the dst operation
        !           214:
        !           215:        movl    %edx, %esi      C becomes the carry for the next limb
        !           216:        loop    L(simple)
        !           217:
        !           218:
        !           219:        popl    %ebp
        !           220:        popl    %edi
        !           221:
        !           222:        popl    %ebx
        !           223:        movl    %esi, %eax      C return the final carry limb
        !           224:
        !           225:        popl    %esi
        !           226:        ret
        !           227:
        !           228:
        !           229:
        !           230: C -----------------------------------------------------------------------------
        !           231: C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
        !           232: C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
        !           233: C For the computed jump an odd size means they start one way around, an even
        !           234: C size the other.
        !           235: C
        !           236: C VAR_JUMP holds the computed jump temporarily because there's not enough
        !           237: C registers at the point of doing the mul for the initial two carry limbs.
        !           238: C
        !           239: C The add/adc for the initial carry in %esi is necessary only for the
        !           240: C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
        !           241: C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
        !           242: C idea.
        !           243:
        !           244: dnl  overlapping with parameters already fetched
        !           245: define(VAR_COUNTER, `PARAM_SIZE')
        !           246: define(VAR_JUMP,    `PARAM_DST')
        !           247:
        !           248: L(unroll):
        !           249:        C eax
        !           250:        C ebx   src
        !           251:        C ecx   size
        !           252:        C edx
        !           253:        C esi   initial carry
        !           254:        C edi   dst
        !           255:        C ebp
        !           256: C Work out skip = (1-size) & UNROLL_MASK, the limbs jumped over on the first pass, and the count of later passes.
        !           257:        movl    %ecx, %edx
        !           258:        decl    %ecx
        !           259:
        !           260:        subl    $2, %edx
        !           261:        negl    %ecx
        !           262:
        !           263:        shrl    $UNROLL_LOG2, %edx
        !           264:        andl    $UNROLL_MASK, %ecx
        !           265:
        !           266:        movl    %edx, VAR_COUNTER       C passes remaining after the first
        !           267:        movl    %ecx, %edx
        !           268:
        !           269:        shll    $4, %edx        C edx = 16*skip
        !           270:        negl    %ecx            C ecx = -skip, so edx+ecx = 15*skip
        !           271:
        !           272:        C 15 code bytes per limb
        !           273: ifdef(`PIC',`
        !           274:        call    L(pic_calc)
        !           275: L(here):
        !           276: ',`
        !           277:        leal    L(entry) (%edx,%ecx,1), %edx
        !           278: ')
        !           279:        movl    (%ebx), %eax            C src low limb
        !           280:
        !           281:        movl    PARAM_MULTIPLIER, %ebp
        !           282:        movl    %edx, VAR_JUMP          C park the jump target; mull overwrites edx
        !           283:
        !           284:        mull    %ebp
        !           285:
        !           286:        addl    %esi, %eax      C initial carry (from _1c)
        !           287:        jadcl0( %edx)
        !           288: C Bias src and dst by -skip limbs so the displacements in the entered part of the loop hit the right limbs.
        !           289:
        !           290:        leal    4(%ebx,%ecx,4), %ebx
        !           291:        movl    %edx, %esi      C high carry
        !           292:
        !           293:        movl    VAR_JUMP, %edx
        !           294:        leal    (%edi,%ecx,4), %edi
        !           295:
        !           296:        testl   $1, %ecx        C odd skip enters at the second limb of a pair
        !           297:        movl    %eax, %ecx      C low carry
        !           298:
        !           299:        jz      L(noswap)
        !           300:        movl    %esi, %ecx      C high,low carry other way around
        !           301:
        !           302:        movl    %eax, %esi
        !           303: L(noswap):
        !           304:
        !           305:        jmp     *%edx
        !           306:
        !           307:
        !           308: ifdef(`PIC',`
        !           309: L(pic_calc):
        !           310:        C See README.family about old gas bugs
        !           311:        leal    (%edx,%ecx,1), %edx
        !           312:        addl    $L(entry)-L(here), %edx
        !           313:        addl    (%esp), %edx
        !           314:        ret
        !           315: ')
        !           316:
        !           317:
        !           318: C -----------------------------------------------------------
        !           319:        ALIGN(32)
        !           320: L(top):
        !           321: deflit(`FRAME',16)
        !           322:        C eax   scratch
        !           323:        C ebx   src
        !           324:        C ecx   carry lo
        !           325:        C edx   scratch
        !           326:        C esi   carry hi
        !           327:        C edi   dst
        !           328:        C ebp   multiplier
        !           329:        C
        !           330:        C 15 code bytes per limb
        !           331:
        !           332:        leal    UNROLL_BYTES(%edi), %edi        C advance dst; not executed on the first pass
        !           333:
        !           334: L(entry):
        !           335: forloop(`i', 0, UNROLL_COUNT/2-1, `
        !           336:        deflit(`disp0', eval(2*i*4))
        !           337:        deflit(`disp1', eval(disp0 + 4))
        !           338:
        !           339: Zdisp( movl,   disp0,(%ebx), %eax)
        !           340:        mull    %ebp
        !           341: Zdisp( M4_inst,%ecx, disp0,(%edi))
        !           342:        adcl    %eax, %esi
        !           343:        movl    %edx, %ecx
        !           344:        jadcl0( %ecx)
        !           345:
        !           346:        movl    disp1(%ebx), %eax
        !           347:        mull    %ebp
        !           348:        M4_inst %esi, disp1(%edi)
        !           349:        adcl    %eax, %ecx
        !           350:        movl    %edx, %esi
        !           351:        jadcl0( %esi)
        !           352: ')
        !           353:
        !           354:        decl    VAR_COUNTER
        !           355:        leal    UNROLL_BYTES(%ebx), %ebx        C leal keeps the flags from decl for the jns
        !           356:
        !           357:        jns     L(top)
        !           358:
        !           359: C Apply the last low carry to the top dst limb; return the high carry plus the carry or borrow from that.
        !           360:        popl    %ebp
        !           361:        M4_inst %ecx, UNROLL_BYTES(%edi)        C dst was not advanced after the final pass
        !           362:
        !           363:        popl    %edi
        !           364:        movl    %esi, %eax
        !           365:
        !           366:        popl    %ebx
        !           367:        jadcl0( %eax)
        !           368:
        !           369:        popl    %esi
        !           370:        ret
        !           371:
        !           372: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>