OpenXM_contrib/gmp/mpn/x86/pentium/mmx/mul_1.asm - annotate

Return to mul_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/mul_1.asm, Revision 1.1

1.1     ! ohara       1: dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
        !             2:
        !             3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C    cycles/limb
        !            26: C P5:   12.0   for 32-bit multiplier
        !            27: C        7.0   for 16-bit multiplier
        !            28:
        !            29:
        !            30: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            31: C                      mp_limb_t multiplier);
        !            32: C
        !            33: C When the multiplier is 16 bits some special case MMX code is used.  Small
        !            34: C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
        !            35: C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
        !            36: C size==8 end up being quite close.  If src isn't aligned to an 8 byte
        !            37: C boundary then one limb is processed separately with roughly a 5 cycle
        !            38: C penalty, so in that case it's say size==8 and size==9 which are close.
        !            39: C
        !            40: C Alternatives:
        !            41: C
        !            42: C MMX is not believed to be of any use for 32-bit multipliers, since for
        !            43: C instance the current method would just have to be more or less duplicated
        !            44: C for the high and low halves of the multiplier, and would probably
        !            45: C therefore run at about 14 cycles, which is slower than the plain integer
        !            46: C at 12.
        !            47: C
        !            48: C Adding the high and low MMX products using integer code seems best.  An
        !            49: C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
        !            50: C any joy.  Perhaps something could be done keeping the values signed and
        !            51: C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
        !            52: C perhaps not.
        !            53: C
        !            54: C Future:
        !            55: C
        !            56: C An mpn_mul_1c entrypoint would need a double carry out of the low result
        !            57: C limb in the 16-bit code, unless it could be assumed the carry fits in 16
        !            58: C bits, possibly as carry<multiplier, this being true of a big calculation
        !            59: C done piece by piece.  But let's worry about that if/when mul_1c is
        !            60: C actually used.
        !            61:
        !            62: defframe(PARAM_MULTIPLIER,16)
        !            63: defframe(PARAM_SIZE,      12)
        !            64: defframe(PARAM_SRC,       8)
        !            65: defframe(PARAM_DST,       4)
        !            66:
        !            67:        TEXT
        !            68:
        !            69:        ALIGN(8)
        !            70: PROLOGUE(mpn_mul_1)
        !            71: deflit(`FRAME',0)
        !            72:
        !            73:        movl    PARAM_SIZE, %ecx
        !            74:        movl    PARAM_SRC, %edx
        !            75:
        !            76:        cmpl    $1, %ecx
        !            77:        jne     L(two_or_more)
        !            78:
        !            79:        C one limb only
        !            80:
        !            81:        movl    PARAM_MULTIPLIER, %eax
        !            82:        movl    PARAM_DST, %ecx
        !            83:
        !            84:        mull    (%edx)
        !            85:
        !            86:        movl    %eax, (%ecx)
        !            87:        movl    %edx, %eax
        !            88:
        !            89:        ret
        !            90:
        !            91:
        !            92: L(two_or_more):
        !            93:        C eax   size
        !            94:        C ebx
        !            95:        C ecx   carry
        !            96:        C edx
        !            97:        C esi   src
        !            98:        C edi
        !            99:        C ebp
        !           100:
        !           101:        pushl   %esi            FRAME_pushl()
        !           102:        pushl   %edi            FRAME_pushl()
        !           103:
        !           104:        movl    %edx, %esi              C src
        !           105:        movl    PARAM_DST, %edi
        !           106:
        !           107:        movl    PARAM_MULTIPLIER, %eax
        !           108:        pushl   %ebx            FRAME_pushl()
        !           109:
        !           110:        leal    (%esi,%ecx,4), %esi     C src end
        !           111:        leal    (%edi,%ecx,4), %edi     C dst end
        !           112:
        !           113:        negl    %ecx                    C -size
        !           114:
        !           115:        pushl   %ebp            FRAME_pushl()
        !           116:        cmpl    $65536, %eax
        !           117:
        !           118:        jb      L(small)
        !           119:
        !           120:
        !           121: L(big):
        !           122:        xorl    %ebx, %ebx              C carry limb
        !           123:        sarl    %ecx                    C -size/2
        !           124:
        !           125:        jnc     L(top)                  C with carry flag clear
        !           126:
        !           127:
        !           128:        C size was odd, process one limb separately
        !           129:
        !           130:        mull    4(%esi,%ecx,8)          C m * src[0]
        !           131:
        !           132:        movl    %eax, 4(%edi,%ecx,8)
        !           133:        incl    %ecx
        !           134:
        !           135:        orl     %edx, %ebx              C carry limb, and clear carry flag
        !           136:
        !           137:
        !           138: L(top):
        !           139:        C eax
        !           140:        C ebx   carry
        !           141:        C ecx   counter, negative
        !           142:        C edx
        !           143:        C esi   src end
        !           144:        C edi   dst end
        !           145:        C ebp   (scratch carry)
        !           146:
        !           147:        adcl    $0, %ebx
        !           148:        movl    (%esi,%ecx,8), %eax
        !           149:
        !           150:        mull    PARAM_MULTIPLIER
        !           151:
        !           152:        movl    %edx, %ebp
        !           153:        addl    %eax, %ebx
        !           154:
        !           155:        adcl    $0, %ebp
        !           156:        movl    4(%esi,%ecx,8), %eax
        !           157:
        !           158:        mull    PARAM_MULTIPLIER
        !           159:
        !           160:        movl    %ebx, (%edi,%ecx,8)
        !           161:        addl    %ebp, %eax
        !           162:
        !           163:        movl    %eax, 4(%edi,%ecx,8)
        !           164:        incl    %ecx
        !           165:
        !           166:        movl    %edx, %ebx
        !           167:        jnz     L(top)
        !           168:
        !           169:
        !           170:        adcl    $0, %ebx
        !           171:        popl    %ebp
        !           172:
        !           173:        movl    %ebx, %eax
        !           174:        popl    %ebx
        !           175:
        !           176:        popl    %edi
        !           177:        popl    %esi
        !           178:
        !           179:        ret
        !           180:
        !           181:
        !           182: L(small):
        !           183:        C Special case for 16-bit multiplier.
        !           184:        C
        !           185:        C eax   multiplier
        !           186:        C ebx
        !           187:        C ecx   -size
        !           188:        C edx   src
        !           189:        C esi   src end
        !           190:        C edi   dst end
        !           191:        C ebp   multiplier
        !           192:
        !           193:        C size<3 not supported here.  At size==3 we're already a couple of
        !           194:        C cycles faster, so there's no threshold as such, just use the MMX
        !           195:        C as soon as possible.
        !           196:
        !           197:        cmpl    $-3, %ecx
        !           198:        ja      L(big)
        !           199:
        !           200:        movd    %eax, %mm7              C m
        !           201:        pxor    %mm6, %mm6              C initial carry word
        !           202:
        !           203:        punpcklwd %mm7, %mm7            C m replicated 2 times
        !           204:        addl    $2, %ecx                C -size+2
        !           205:
        !           206:        punpckldq %mm7, %mm7            C m replicated 4 times
        !           207:        andl    $4, %edx                C test alignment, clear carry flag
        !           208:
        !           209:        movq    %mm7, %mm0              C m
        !           210:        jz      L(small_entry)
        !           211:
        !           212:
        !           213:        C Source is unaligned, process one limb separately.
        !           214:        C
        !           215:        C Plain integer code is used here, since it's smaller and is about
        !           216:        C the same 13 cycles as an mmx block would be.
        !           217:        C
        !           218:        C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
        !           219:        C the use of separate incl and orl.
        !           220:
        !           221:        mull    -8(%esi,%ecx,4)         C m * src[0]
        !           222:
        !           223:        movl    %eax, -8(%edi,%ecx,4)   C dst[0]
        !           224:        incl    %ecx                    C one limb processed
        !           225:
        !           226:        movd    %edx, %mm6              C initial carry
        !           227:
        !           228:        orl     %eax, %eax              C clear carry flag
        !           229:        jmp     L(small_entry)
        !           230:
        !           231:
        !           232: C The scheduling here is quite tricky, since so many instructions have
        !           233: C pairing restrictions.  In particular the js won't pair with a movd, and
        !           234: C can't be paired with an adc since it wants flags from the inc, so
        !           235: C instructions are rotated to the top of the loop to find somewhere useful
        !           236: C for it.
        !           237: C
        !           238: C Trouble has been taken to avoid overlapping successive loop iterations,
        !           239: C since that would greatly increase the size of the startup and finishup
        !           240: C code.  Actually there's probably not much advantage to be had from
        !           241: C overlapping anyway, since the difficulties are mostly with pairing, not
        !           242: C with latencies as such.
        !           243: C
        !           244: C In the comments x represents the src data and m the multiplier (16
        !           245: C bits, but replicated 4 times).
        !           246: C
        !           247: C The m signs calculated in %mm3 are a loop invariant and could be held in
        !           248: C say %mm5, but that would save only one instruction and hence be no faster.
        !           249:
        !           250: L(small_top):
        !           251:        C eax   l.low, then l.high
        !           252:        C ebx   (h.low)
        !           253:        C ecx   counter, -size+2 to 0 or 1
        !           254:        C edx   (h.high)
        !           255:        C esi   &src[size]
        !           256:        C edi   &dst[size]
        !           257:        C ebp
        !           258:        C
        !           259:        C %mm0  (high products)
        !           260:        C %mm1  (low products)
        !           261:        C %mm2  (adjust for m using x signs)
        !           262:        C %mm3  (adjust for x using m signs)
        !           263:        C %mm4
        !           264:        C %mm5
        !           265:        C %mm6  h.low, then carry
        !           266:        C %mm7  m replicated 4 times
        !           267:
        !           268:        movd    %mm6, %ebx              C h.low
        !           269:        psrlq   $32, %mm1               C l.high
        !           270:
        !           271:        movd    %mm0, %edx              C h.high
        !           272:        movq    %mm0, %mm6              C new c
        !           273:
        !           274:        adcl    %eax, %ebx
        !           275:        incl    %ecx
        !           276:
        !           277:        movd    %mm1, %eax              C l.high
        !           278:        movq    %mm7, %mm0
        !           279:
        !           280:        adcl    %eax, %edx
        !           281:        movl    %ebx, -16(%edi,%ecx,4)
        !           282:
        !           283:        movl    %edx, -12(%edi,%ecx,4)
        !           284:        psrlq   $32, %mm6               C c
        !           285:
        !           286: L(small_entry):
        !           287:        pmulhw  -8(%esi,%ecx,4), %mm0   C h = (x*m).high
        !           288:        movq    %mm7, %mm1
        !           289:
        !           290:        pmullw  -8(%esi,%ecx,4), %mm1   C l = (x*m).low
        !           291:        movq    %mm7, %mm3
        !           292:
        !           293:        movq    -8(%esi,%ecx,4), %mm2   C x
        !           294:        psraw   $15, %mm3               C m signs
        !           295:
        !           296:        pand    -8(%esi,%ecx,4), %mm3   C x selected by m signs
        !           297:        psraw   $15, %mm2               C x signs
        !           298:
        !           299:        paddw   %mm3, %mm0              C add x to h if m neg
        !           300:        pand    %mm7, %mm2              C m selected by x signs
        !           301:
        !           302:        paddw   %mm2, %mm0              C add m to h if x neg
        !           303:        incl    %ecx
        !           304:
        !           305:        movd    %mm1, %eax              C l.low
        !           306:        punpcklwd %mm0, %mm6            C c + h.low << 16
        !           307:
        !           308:        psrlq   $16, %mm0               C h.high
        !           309:        js      L(small_top)
        !           310:
        !           311:
        !           312:
        !           313:
        !           314:        movd    %mm6, %ebx              C h.low
        !           315:        psrlq   $32, %mm1               C l.high
        !           316:
        !           317:        adcl    %eax, %ebx
        !           318:        popl    %ebp            FRAME_popl()
        !           319:
        !           320:        movd    %mm0, %edx              C h.high
        !           321:        psrlq   $32, %mm0               C l.high
        !           322:
        !           323:        movd    %mm1, %eax              C l.high
        !           324:
        !           325:        adcl    %eax, %edx
        !           326:        movl    %ebx, -12(%edi,%ecx,4)
        !           327:
        !           328:        movd    %mm0, %eax              C c
        !           329:
        !           330:        adcl    $0, %eax
        !           331:        movl    %edx, -8(%edi,%ecx,4)
        !           332:
        !           333:        orl     %ecx, %ecx
        !           334:        jnz     L(small_done)           C final %ecx==1 means even, ==0 odd
        !           335:
        !           336:
        !           337:        C Size odd, one extra limb to process.
        !           338:        C Plain integer code is used here, since it's smaller and is about
        !           339:        C the same speed as another mmx block would be.
        !           340:
        !           341:        movl    %eax, %ecx
        !           342:        movl    PARAM_MULTIPLIER, %eax
        !           343:
        !           344:        mull    -4(%esi)
        !           345:
        !           346:        addl    %ecx, %eax
        !           347:
        !           348:        adcl    $0, %edx
        !           349:        movl    %eax, -4(%edi)
        !           350:
        !           351:        movl    %edx, %eax
        !           352: L(small_done):
        !           353:        popl    %ebx
        !           354:
        !           355:        popl    %edi
        !           356:        popl    %esi
        !           357:
        !           358:        emms
        !           359:
        !           360:        ret
        !           361:
        !           362: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>