OpenXM_contrib/gmp/mpn/x86/k6/aorsmul_1.asm - annotate

Return to aorsmul_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aorsmul_1.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
                      2: dnl
                      3: dnl  K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
                      4: dnl  PIC adds about 6 cycles at the start.
                      5:
                      6:
                      7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      8: dnl
                      9: dnl  This file is part of the GNU MP Library.
                     10: dnl
                     11: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     12: dnl  modify it under the terms of the GNU Lesser General Public License as
                     13: dnl  published by the Free Software Foundation; either version 2.1 of the
                     14: dnl  License, or (at your option) any later version.
                     15: dnl
                     16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     19: dnl  Lesser General Public License for more details.
                     20: dnl
                     21: dnl  You should have received a copy of the GNU Lesser General Public
                     22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     24: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     25:
                     26:
                     27: include(`../config.m4')
                     28:
                     29:
                     30: dnl  K6:           large multipliers  small multipliers
                     31: dnl  UNROLL_COUNT    cycles/limb       cycles/limb
                     32: dnl        4             9.5              7.78
                     33: dnl        8             9.0              7.78
                     34: dnl       16             8.4              7.65
                     35: dnl       32             8.4              8.2
                     36: dnl
                     37: dnl  Maximum possible unrolling with the current code is 32.
                     38: dnl
                     39: dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
                     40: dnl  byte block, which might explain the good speed at that unrolling.
                     41:
                     42: deflit(UNROLL_COUNT, 16)
                     43:
                     44: dnl  Select the instruction, function names and doc strings for the addmul or submul build.
                     45: ifdef(`OPERATION_addmul_1', `
                     46:        define(M4_inst,        addl)
                     47:        define(M4_function_1,  mpn_addmul_1)
                     48:        define(M4_function_1c, mpn_addmul_1c)
                     49:        define(M4_description, add it to)
                     50:        define(M4_desc_retval, carry)
                     51: ',`ifdef(`OPERATION_submul_1', `
                     52:        define(M4_inst,        subl)
                     53:        define(M4_function_1,  mpn_submul_1)
                     54:        define(M4_function_1c, mpn_submul_1c)
                     55:        define(M4_description, subtract it from)
                     56:        define(M4_desc_retval, borrow)
                     57: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
                     58: ')')')
                     59:
                     60: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
                     61:
                     62:
                     63: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     64: C                          mp_limb_t mult);
                     65: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     66: C                           mp_limb_t mult, mp_limb_t carry);
                     67: C
                     68: C Calculate src,size multiplied by mult and M4_description dst,size.
                     69: C Return the M4_desc_retval limb from the top of the result.
                     70: C
                     71: C The jadcl0()s in the unrolled loop make the speed data dependent.  Small
                     72: C multipliers (most significant few bits clear) result in few carry bits and
                     73: C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
                     74: C significant few bits set) make the carry bits 50/50 and lead to something
                     75: C more like 8.4 c/l.  (With adcl's both of these would be 9.3 c/l.)
                     76: C
                     77: C It's important that the gains for jadcl0 on small multipliers don't come
                     78: C at the cost of slowing down other data.  Tests on uniformly distributed
                     79: C random data, designed to confound branch prediction, show about a 7%
                     80: C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
                     81: C overheads included).
                     82: C
                     83: C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
                     84: C 11.0 cycles/limb), and hence isn't used.
                     85: C
                     86: C In the simple loop, note that running ecx from negative to zero and using
                     87: C it as an index in the two movs wouldn't help.  It would save one
                     88: C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
                     89: C that would be collapsed by this.
                     90: C
                     91: C
                     92: C jadcl0
                     93: C ------
                     94: C
                     95: C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
                     96: C firstly the instruction decoding and secondly the fact that there's a
                     97: C carry bit for the jadcl0 only on average about 1/4 of the time.
                     98: C
                     99: C The code in the unrolled loop decodes something like the following.
                    100: C
                    101: C                                         decode cycles
                    102: C              mull    %ebp                    2
                    103: C              M4_inst %esi, disp(%edi)        1
                    104: C              adcl    %eax, %ecx              2
                    105: C              movl    %edx, %esi            \ 1
                    106: C              jnc     1f                    /
                    107: C              incl    %esi                  \ 1
                    108: C      1:      movl    disp(%ebx), %eax      /
                    109: C                                              ---
                    110: C                                               7
                    111: C
                    112: C In a back-to-back style test this measures 7 with the jnc not taken, or 8
                    113: C with it taken (both when correctly predicted).  This is opposite to the
                    114: C measurements showing small multipliers running faster than large ones.
                    115: C Watch this space for more info ...
                    116: C
                    117: C It's not clear how much branch misprediction might be costing.  The K6
                    118: C doco says it will be 1 to 4 cycles, but presumably it's near the low end
                    119: C of that range to get the measured results.
                    120: C
                    121: C
                    122: C In the code the two carries are more or less the preceding mul product and
                    123: C the calculation is roughly
                    124: C
                    125: C      x*y + u*b+v
                    126: C
                    127: C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
                    128: C v are the two limbs it's added to (being the low of the next mul, and a
                    129: C limb from the destination).
                    130: C
                    131: C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
                    132: C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
                    133: C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
                    134: C and b-1, then the total probability can be summed over x and y,
                    135: C
                    136: C       1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
                    137: C      --- * sum sum --- = --- * ------- * ------- = 1/4
                    138: C       b^2   x=0 y=0 b^2   b^4      2         2
                    139: C
                    140: C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
                    141: C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
                    142:
                    143: dnl  Sizes of at least UNROLL_THRESHOLD limbs take the unrolled path; the PIC build uses a higher threshold.
                    144: ifdef(`PIC',`
                    145: deflit(UNROLL_THRESHOLD, 9)
                    146: ',`
                    147: deflit(UNROLL_THRESHOLD, 6)
                    148: ')
                    149: dnl  Argument stack slots (presumably byte offsets from the stack pointer at entry -- TODO confirm against defframe).  PARAM_CARRY is read only by the _1c entry point.
                    150: defframe(PARAM_CARRY,     20)
                    151: defframe(PARAM_MULTIPLIER,16)
                    152: defframe(PARAM_SIZE,      12)
                    153: defframe(PARAM_SRC,       8)
                    154: defframe(PARAM_DST,       4)
                    155:
                    156:        .text
                    157:        ALIGN(32)
                    158:
                    159: PROLOGUE(M4_function_1c)
                    160:        pushl   %esi                    C saved here, restored by the popl before each ret in M4_function_1
                    161: deflit(`FRAME',4)
                    162:        movl    PARAM_CARRY, %esi       C esi = initial carry limb supplied by the caller
                    163:        jmp     LF(M4_function_1,start_nc)      C continue in the shared code with esi already set
                    164: EPILOGUE()
                    165:
                    166: PROLOGUE(M4_function_1)
                    167:        push    %esi            C restored by the popl before each ret
                    168: deflit(`FRAME',4)
                    169:        xorl    %esi, %esi      C initial carry is zero for the plain (non _1c) entry point
                    170: C M4_function_1c jumps in at start_nc with its carry argument already in esi.
                    171: L(start_nc):
                    172:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
                    173:        pushl   %ebx
                    174: deflit(`FRAME',8)
                    175:
                    176:        movl    PARAM_SRC, %ebx         C ebx = src pointer
                    177:        pushl   %edi
                    178: deflit(`FRAME',12)
                    179:
                    180:        cmpl    $UNROLL_THRESHOLD, %ecx C flags are consumed by the jae below (movl and pushl leave them alone)
                    181:        movl    PARAM_DST, %edi         C edi = dst pointer
                    182:
                    183:        pushl   %ebp
                    184: deflit(`FRAME',16)
                    185:        jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD: use the unrolled code
                    186:
                    187:
                    188:        C simple loop, one limb per iteration
                    189: C NOTE(review): the loop instruction below assumes size >= 1 on entry -- confirm callers never pass 0.
                    190:        movl    PARAM_MULTIPLIER, %ebp
                    191:
                    192: L(simple):
                    193:        C eax   scratch
                    194:        C ebx   src
                    195:        C ecx   counter
                    196:        C edx   scratch
                    197:        C esi   carry
                    198:        C edi   dst
                    199:        C ebp   multiplier
                    200:
                    201:        movl    (%ebx), %eax            C eax = next src limb
                    202:        addl    $4, %ebx
                    203:
                    204:        mull    %ebp                    C edx:eax = src limb * multiplier
                    205:
                    206:        addl    $4, %edi
                    207:        addl    %esi, %eax              C add the carry limb into the low product limb
                    208:
                    209:        adcl    $0, %edx                C propagate that carry into the high product limb
                    210:
                    211:        M4_inst %eax, -4(%edi)          C apply the low limb to dst (edi was already advanced, hence -4)
                    212:
                    213:        adcl    $0, %edx                C fold the carry or borrow out of M4_inst into the high limb
                    214:
                    215:        movl    %edx, %esi              C high limb becomes the carry for the next limb
                    216:        loop    L(simple)               C decrement ecx and repeat while it is nonzero
                    217:
                    218:
                    219:        popl    %ebp
                    220:        popl    %edi
                    221:
                    222:        popl    %ebx
                    223:        movl    %esi, %eax              C return value = final carry limb
                    224:
                    225:        popl    %esi
                    226:        ret
                    227:
                    228:
                    229:
                    230: C -----------------------------------------------------------------------------
                    231: C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
                    232: C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
                    233: C For the computed jump an odd size means they start one way around, an even
                    234: C size the other.
                    235: C
                    236: C VAR_JUMP holds the computed jump temporarily because there's not enough
                    237: C registers at the point of doing the mul for the initial two carry limbs.
                    238: C
                    239: C The add/adc for the initial carry in %esi is necessary only for the
                    240: C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
                    241: C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
                    242: C idea.
                    243:
                    244: dnl  Locals reuse the stack slots of parameters that are already loaded into registers (size in ecx, dst in edi) before these slots are written.
                    245: define(VAR_COUNTER, `PARAM_SIZE')
                    246: define(VAR_JUMP,    `PARAM_DST')
                    247:
                    248: L(unroll):
                    249:        C eax
                    250:        C ebx   src
                    251:        C ecx   size
                    252:        C edx
                    253:        C esi   initial carry
                    254:        C edi   dst
                    255:        C ebp
                    256: C Set up the pass counter and the computed jump into the unrolled code, and multiply the first limb to form the initial two carry limbs.
                    257:        movl    %ecx, %edx
                    258:        decl    %ecx
                    259:
                    260:        subl    $2, %edx                C edx = size-2
                    261:        negl    %ecx                    C ecx = -(size-1)
                    262:
                    263:        shrl    $UNROLL_LOG2, %edx      C pass counter; UNROLL_LOG2 is defined elsewhere, presumably log2 of UNROLL_COUNT
                    264:        andl    $UNROLL_MASK, %ecx      C ecx = limb slots to skip on the first pass; UNROLL_MASK presumably UNROLL_COUNT-1
                    265:
                    266:        movl    %edx, VAR_COUNTER       C the unrolled loop exits when decl takes this negative
                    267:        movl    %ecx, %edx
                    268:
                    269:        shll    $4, %edx                C edx = 16 * skip
                    270:        negl    %ecx                    C ecx = -skip, so edx + ecx = 15 * skip
                    271:
                    272:        C 15 code bytes per limb, hence the 16*skip - skip byte offset from the entry label
                    273: ifdef(`PIC',`
                    274:        call    L(pic_calc)
                    275: L(here):
                    276: ',`
                    277:        leal    L(entry) (%edx,%ecx,1), %edx
                    278: ')
                    279:        movl    (%ebx), %eax            C src low limb
                    280:
                    281:        movl    PARAM_MULTIPLIER, %ebp
                    282:        movl    %edx, VAR_JUMP          C park the jump target, mull is about to overwrite edx
                    283:
                    284:        mull    %ebp                    C edx:eax = first src limb * multiplier
                    285:
                    286:        addl    %esi, %eax      C initial carry (from _1c)
                    287:        jadcl0( %edx)           C macro defined elsewhere; presumably adds the carry flag into edx
                    288:
                    289:
                    290:        leal    4(%ebx,%ecx,4), %ebx    C src = src + 4 - 4*skip, so the first executed slot reads the second limb
                    291:        movl    %edx, %esi      C high carry
                    292:
                    293:        movl    VAR_JUMP, %edx
                    294:        leal    (%edi,%ecx,4), %edi     C dst = dst - 4*skip, so the first executed slot writes the first dst limb
                    295:
                    296:        testl   $1, %ecx                C parity of skip selects which register holds the low carry
                    297:        movl    %eax, %ecx      C low carry (movl leaves the testl flags intact for the jz)
                    298:
                    299:        jz      L(noswap)
                    300:        movl    %esi, %ecx      C high,low carry other way around
                    301:
                    302:        movl    %eax, %esi
                    303: L(noswap):
                    304:
                    305:        jmp     *%edx                   C enter the unrolled code at the computed slot
                    306:
                    307:
                    308: ifdef(`PIC',`
                    309: L(pic_calc):
                    310:        C See README.family about old gas bugs
                    311:        leal    (%edx,%ecx,1), %edx     C edx = edx + ecx, the byte offset into the unrolled code
                    312:        addl    $L(entry)-L(here), %edx C plus the distance from the here label to the entry label
                    313:        addl    (%esp), %edx            C plus the return address, which is the here label right after the call
                    314:        ret
                    315: ')
                    316:
                    317:
                    318: C -----------------------------------------------------------
                    319:        ALIGN(32)
                    320: L(top):
                    321: deflit(`FRAME',16)
                    322:        C eax   scratch
                    323:        C ebx   src
                    324:        C ecx   carry lo
                    325:        C edx   scratch
                    326:        C esi   carry hi
                    327:        C edi   dst
                    328:        C ebp   multiplier
                    329:        C
                    330:        C 15 code bytes per limb
                    331: C Only the loop-back path runs this leal; the computed jump enters at or after the entry label.
                    332:        leal    UNROLL_BYTES(%edi), %edi
                    333: C Each slot multiplies one src limb, applies the current low carry to dst, and the two carry registers swap roles.  Zdisp presumably forces a displacement byte on the zero offset so that every slot has the same size -- TODO confirm.
                    334: L(entry):
                    335: forloop(`i', 0, UNROLL_COUNT/2-1, `
                    336:        deflit(`disp0', eval(2*i*4))
                    337:        deflit(`disp1', eval(disp0 + 4))
                    338:
                    339: Zdisp( movl,   disp0,(%ebx), %eax)
                    340:        mull    %ebp
                    341: Zdisp( M4_inst,%ecx, disp0,(%edi))
                    342:        adcl    %eax, %esi
                    343:        movl    %edx, %ecx
                    344:        jadcl0( %ecx)
                    345:
                    346:        movl    disp1(%ebx), %eax
                    347:        mull    %ebp
                    348:        M4_inst %esi, disp1(%edi)
                    349:        adcl    %eax, %ecx
                    350:        movl    %edx, %esi
                    351:        jadcl0( %esi)
                    352: ')
                    353:
                    354:        decl    VAR_COUNTER             C one unrolled pass done
                    355:        leal    UNROLL_BYTES(%ebx), %ebx        C advance src; leal does not disturb the flags from decl
                    356:
                    357:        jns     L(top)                  C loop until the pass counter goes negative
                    358:
                    359: C Exit: apply the pending low carry limb to the last dst limb and return the high carry limb.
                    360:        popl    %ebp
                    361:        M4_inst %ecx, UNROLL_BYTES(%edi)        C edi was not advanced after the final pass, hence the UNROLL_BYTES offset
                    362:
                    363:        popl    %edi
                    364:        movl    %esi, %eax              C eax = high carry limb
                    365:
                    366:        popl    %ebx
                    367:        jadcl0( %eax)           C presumably adds the carry or borrow left by M4_inst (popl and movl preserve the flags)
                    368:
                    369:        popl    %esi
                    370:        ret
                    371:
                    372: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>