OpenXM_contrib/gmp/mpn/x86/pentium/mmx/mul_1.asm - annotate

Return to mul_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/mul_1.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
                      2:
                      3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
                      4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
                     25: C    cycles/limb
                     26: C P5:   12.0   for 32-bit multiplier
                     27: C        7.0   for 16-bit multiplier
                     28:
                     29:
                     30: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     31: C                      mp_limb_t multiplier);
                     32: C
                     33: C When the multiplier is 16 bits some special case MMX code is used.  Small
                     34: C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
                     35: C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
                     36: C size==8 end up being quite close.  If src isn't aligned to an 8 byte
                     37: C boundary then one limb is processed separately with roughly a 5 cycle
                     38: C penalty, so in that case it's say size==8 and size==9 which are close.
                     39: C
                     40: C Alternatives:
                     41: C
                     42: C MMX is not believed to be of any use for 32-bit multipliers, since for
                     43: C instance the current method would just have to be more or less duplicated
                     44: C for the high and low halves of the multiplier, and would probably
                     45: C therefore run at about 14 cycles, which is slower than the plain integer
                     46: C at 12.
                     47: C
                     48: C Adding the high and low MMX products using integer code seems best.  An
                     49: C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
                     50: C any joy.  Perhaps something could be done keeping the values signed and
                     51: C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
                     52: C perhaps not.
                     53: C
                     54: C Future:
                     55: C
                     56: C An mpn_mul_1c entrypoint would need a double carry out of the low result
                     57: C limb in the 16-bit code, unless it could be assumed the carry fits in 16
                     58: C bits, possibly as carry<multiplier, this being true of a big calculation
                     59: C done piece by piece.  But let's worry about that if/when mul_1c is
                     60: C actually used.
                     61:
                     62: defframe(PARAM_MULTIPLIER,16)
                     63: defframe(PARAM_SIZE,      12)
                     64: defframe(PARAM_SRC,       8)
                     65: defframe(PARAM_DST,       4)
                     66:
                     67:        TEXT
                     68:
                     69:        ALIGN(8)
                     70: PROLOGUE(mpn_mul_1)
                     71: deflit(`FRAME',0)
                     72: C Store src[0..size-1]*multiplier at dst, return the high (carry) limb.
                     73:        movl    PARAM_SIZE, %ecx
                     74:        movl    PARAM_SRC, %edx
                     75:
                     76:        cmpl    $1, %ecx
                     77:        jne     L(two_or_more)
                     78:
                     79:        C one limb only
                     80:
                     81:        movl    PARAM_MULTIPLIER, %eax
                     82:        movl    PARAM_DST, %ecx
                     83:
                     84:        mull    (%edx)
                     85:
                     86:        movl    %eax, (%ecx)
                     87:        movl    %edx, %eax
                     88:
                     89:        ret
                     90:
                     91:
                     92: L(two_or_more):
                     93:        C eax
                     94:        C ebx
                     95:        C ecx   size
                     96:        C edx   src
                     97:        C esi
                     98:        C edi
                     99:        C ebp
                    100:
                    101:        pushl   %esi            FRAME_pushl()
                    102:        pushl   %edi            FRAME_pushl()
                    103:
                    104:        movl    %edx, %esi              C src
                    105:        movl    PARAM_DST, %edi
                    106:
                    107:        movl    PARAM_MULTIPLIER, %eax
                    108:        pushl   %ebx            FRAME_pushl()
                    109:
                    110:        leal    (%esi,%ecx,4), %esi     C src end
                    111:        leal    (%edi,%ecx,4), %edi     C dst end
                    112:
                    113:        negl    %ecx                    C -size
                    114:
                    115:        pushl   %ebp            FRAME_pushl()
                    116:        cmpl    $65536, %eax
                    117:
                    118:        jb      L(small)
                    119:
                    120:
                    121: L(big):
                    122:        xorl    %ebx, %ebx              C carry limb
                    123:        sarl    %ecx                    C -size/2
                    124:
                    125:        jnc     L(top)                  C with carry flag clear
                    126:
                    127:
                    128:        C size was odd, process one limb separately
                    129:
                    130:        mull    4(%esi,%ecx,8)          C m * src[0]
                    131:
                    132:        movl    %eax, 4(%edi,%ecx,8)
                    133:        incl    %ecx
                    134:
                    135:        orl     %edx, %ebx              C carry limb, and clear carry flag
                    136:
                    137:
                    138: L(top):
                    139:        C eax
                    140:        C ebx   carry
                    141:        C ecx   counter, negative
                    142:        C edx
                    143:        C esi   src end
                    144:        C edi   dst end
                    145:        C ebp   (scratch carry)
                    146:
                    147:        adcl    $0, %ebx
                    148:        movl    (%esi,%ecx,8), %eax
                    149:
                    150:        mull    PARAM_MULTIPLIER
                    151:
                    152:        movl    %edx, %ebp
                    153:        addl    %eax, %ebx
                    154:
                    155:        adcl    $0, %ebp
                    156:        movl    4(%esi,%ecx,8), %eax
                    157:
                    158:        mull    PARAM_MULTIPLIER
                    159:
                    160:        movl    %ebx, (%edi,%ecx,8)
                    161:        addl    %ebp, %eax
                    162:
                    163:        movl    %eax, 4(%edi,%ecx,8)
                    164:        incl    %ecx
                    165:
                    166:        movl    %edx, %ebx
                    167:        jnz     L(top)
                    168:
                    169:
                    170:        adcl    $0, %ebx
                    171:        popl    %ebp
                    172:
                    173:        movl    %ebx, %eax
                    174:        popl    %ebx
                    175:
                    176:        popl    %edi
                    177:        popl    %esi
                    178:
                    179:        ret
                    180:
                    181:
                    182: L(small):
                    183:        C Special case for 16-bit multiplier.
                    184:        C
                    185:        C eax   multiplier
                    186:        C ebx
                    187:        C ecx   -size
                    188:        C edx   src
                    189:        C esi   src end
                    190:        C edi   dst end
                    191:        C ebp
                    192:
                    193:        C size<3 not supported here.  At size==3 we're already a couple of
                    194:        C cycles faster, so there's no threshold as such, just use the MMX
                    195:        C as soon as possible.
                    196:
                    197:        cmpl    $-3, %ecx
                    198:        ja      L(big)
                    199:
                    200:        movd    %eax, %mm7              C m
                    201:        pxor    %mm6, %mm6              C initial carry word
                    202:
                    203:        punpcklwd %mm7, %mm7            C m replicated 2 times
                    204:        addl    $2, %ecx                C -size+2
                    205:
                    206:        punpckldq %mm7, %mm7            C m replicated 4 times
                    207:        andl    $4, %edx                C test alignment, clear carry flag
                    208:
                    209:        movq    %mm7, %mm0              C m
                    210:        jz      L(small_entry)
                    211:
                    212:
                    213:        C Source is unaligned, process one limb separately.
                    214:        C
                    215:        C Plain integer code is used here, since it's smaller and is about
                    216:        C the same 13 cycles as an mmx block would be.
                    217:        C
                    218:        C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
                    219:        C the use of separate incl and orl.
                    220:
                    221:        mull    -8(%esi,%ecx,4)         C m * src[0]
                    222:
                    223:        movl    %eax, -8(%edi,%ecx,4)   C dst[0]
                    224:        incl    %ecx                    C one limb processed
                    225:
                    226:        movd    %edx, %mm6              C initial carry
                    227:
                    228:        orl     %eax, %eax              C clear carry flag
                    229:        jmp     L(small_entry)
                    230:
                    231:
                    232: C The scheduling here is quite tricky, since so many instructions have
                    233: C pairing restrictions.  In particular the js won't pair with a movd, and
                    234: C can't be paired with an adc since it wants flags from the inc, so
                    235: C instructions are rotated to the top of the loop to find somewhere useful
                    236: C for it.
                    237: C
                    238: C Trouble has been taken to avoid overlapping successive loop iterations,
                    239: C since that would greatly increase the size of the startup and finishup
                    240: C code.  Actually there's probably not much advantage to be had from
                    241: C overlapping anyway, since the difficulties are mostly with pairing, not
                    242: C with latencies as such.
                    243: C
                    244: C In the comments x represents the src data and m the multiplier (16
                    245: C bits, but replicated 4 times).
                    246: C
                    247: C The m signs calculated in %mm3 are a loop invariant and could be held in
                    248: C say %mm5, but that would save only one instruction and hence be no faster.
                    249:
                    250: L(small_top):
                    251:        C eax   l.low, then l.high
                    252:        C ebx   (h.low)
                    253:        C ecx   counter, -size+2 to 0 or 1
                    254:        C edx   (h.high)
                    255:        C esi   &src[size]
                    256:        C edi   &dst[size]
                    257:        C ebp
                    258:        C
                    259:        C %mm0  (high products)
                    260:        C %mm1  (low products)
                    261:        C %mm2  (adjust for m using x signs)
                    262:        C %mm3  (adjust for x using m signs)
                    263:        C %mm4
                    264:        C %mm5
                    265:        C %mm6  h.low, then carry
                    266:        C %mm7  m replicated 4 times
                    267:
                    268:        movd    %mm6, %ebx              C h.low
                    269:        psrlq   $32, %mm1               C l.high
                    270:
                    271:        movd    %mm0, %edx              C h.high
                    272:        movq    %mm0, %mm6              C new c
                    273:
                    274:        adcl    %eax, %ebx
                    275:        incl    %ecx
                    276:
                    277:        movd    %mm1, %eax              C l.high
                    278:        movq    %mm7, %mm0
                    279:
                    280:        adcl    %eax, %edx
                    281:        movl    %ebx, -16(%edi,%ecx,4)
                    282:
                    283:        movl    %edx, -12(%edi,%ecx,4)
                    284:        psrlq   $32, %mm6               C c
                    285:
                    286: L(small_entry):
                    287:        pmulhw  -8(%esi,%ecx,4), %mm0   C h = (x*m).high
                    288:        movq    %mm7, %mm1
                    289:
                    290:        pmullw  -8(%esi,%ecx,4), %mm1   C l = (x*m).low
                    291:        movq    %mm7, %mm3
                    292:
                    293:        movq    -8(%esi,%ecx,4), %mm2   C x
                    294:        psraw   $15, %mm3               C m signs
                    295:
                    296:        pand    -8(%esi,%ecx,4), %mm3   C x selected by m signs
                    297:        psraw   $15, %mm2               C x signs
                    298:
                    299:        paddw   %mm3, %mm0              C add x to h if m neg
                    300:        pand    %mm7, %mm2              C m selected by x signs
                    301:
                    302:        paddw   %mm2, %mm0              C add m to h if x neg
                    303:        incl    %ecx
                    304:
                    305:        movd    %mm1, %eax              C l.low
                    306:        punpcklwd %mm0, %mm6            C c + h.low << 16
                    307:
                    308:        psrlq   $16, %mm0               C h.high
                    309:        js      L(small_top)
                    310:
                    311:
                    312:
                    313:
                    314:        movd    %mm6, %ebx              C h.low
                    315:        psrlq   $32, %mm1               C l.high
                    316:
                    317:        adcl    %eax, %ebx
                    318:        popl    %ebp            FRAME_popl()
                    319:
                    320:        movd    %mm0, %edx              C h.high
                    321:        psrlq   $32, %mm0               C c
                    322:
                    323:        movd    %mm1, %eax              C l.high
                    324:
                    325:        adcl    %eax, %edx
                    326:        movl    %ebx, -12(%edi,%ecx,4)
                    327:
                    328:        movd    %mm0, %eax              C c
                    329:
                    330:        adcl    $0, %eax
                    331:        movl    %edx, -8(%edi,%ecx,4)
                    332:
                    333:        orl     %ecx, %ecx
                    334:        jnz     L(small_done)           C final %ecx==1 means all done, ==0 one limb left
                    335:
                    336:
                    337:        C Limb count for the MMX loop was odd, one extra limb to process.
                    338:        C Plain integer code is used here, since it's smaller and is about
                    339:        C the same speed as another mmx block would be.
                    340:
                    341:        movl    %eax, %ecx
                    342:        movl    PARAM_MULTIPLIER, %eax
                    343:
                    344:        mull    -4(%esi)
                    345:
                    346:        addl    %ecx, %eax
                    347:
                    348:        adcl    $0, %edx
                    349:        movl    %eax, -4(%edi)
                    350:
                    351:        movl    %edx, %eax
                    352: L(small_done):
                    353:        popl    %ebx
                    354:
                    355:        popl    %edi
                    356:        popl    %esi
                    357:
                    358:        emms
                    359:
                    360:        ret
                    361:
                    362: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>