OpenXM_contrib/gmp/mpn/x86/pentium/mmx/lshift.asm - annotate

Return to lshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/lshift.asm, Revision 1.1

1.1     ! maekawa     1: dnl  Intel P5 mpn_lshift -- mpn left shift.
        !             2: dnl
        !             3: dnl  P5: 1.75 cycles/limb.
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            30: C                       unsigned shift);
        !            31: C
        !            32: C Shift src,size left by shift many bits and store the result in dst,size.
        !            33: C Zeros are shifted in at the right.  Return the bits shifted out at the
        !            34: C left.
        !            35: C
        !            36: C The comments in mpn_rshift apply here too.
        !            37:
        !            38: defframe(PARAM_SHIFT,16)
        !            39: defframe(PARAM_SIZE, 12)
        !            40: defframe(PARAM_SRC,  8)
        !            41: defframe(PARAM_DST,  4)
        !            42: deflit(`FRAME',0)
        !            43:
        !            44: dnl  size >= this uses the unrolled loop; minimum 5, because that loop can't handle less
        !            45: deflit(UNROLL_THRESHOLD, 5)
        !            46:
        !            47:        .text
        !            48:        ALIGN(8)
        !            49:
        !            50: PROLOGUE(mpn_lshift)
        !            51:
        !            52:        pushl   %ebx
        !            53:        pushl   %edi
        !            54: deflit(`FRAME',8)
        !            55:
        !            56:        movl    PARAM_SIZE, %eax
        !            57:        movl    PARAM_DST, %edx
        !            58:
        !            59:        movl    PARAM_SRC, %ebx
        !            60:        movl    PARAM_SHIFT, %ecx
        !            61:
        !            62:        cmp     $UNROLL_THRESHOLD, %eax
        !            63:        jae     L(unroll)               C size >= UNROLL_THRESHOLD
        !            64:
        !            65:        movl    -4(%ebx,%eax,4), %edi   C src high limb
        !            66:        decl    %eax
        !            67:
        !            68:        jnz     L(simple)               C size-1 != 0, more than one limb
        !            69:
        !            70:        shldl(  %cl, %edi, %eax)        C eax was decremented to zero, now the bits shifted out (retval)
        !            71:
        !            72:        shll    %cl, %edi               C the single limb, shifted
        !            73:
        !            74:        movl    %edi, (%edx)            C dst low limb
        !            75:        popl    %edi                    C risk of data cache bank clash
        !            76:
        !            77:        popl    %ebx
        !            78:
        !            79:        ret
        !            80:
        !            81:
        !            82: C -----------------------------------------------------------------------------
        !            83: L(simple):
        !            84:        C eax   size-1
        !            85:        C ebx   src
        !            86:        C ecx   shift
        !            87:        C edx   dst
        !            88:        C esi
        !            89:        C edi
        !            90:        C ebp
        !            91: deflit(`FRAME',8)
        !            92:
        !            93:        movd    (%ebx,%eax,4), %mm5     C src high limb
        !            94:
        !            95:        movd    %ecx, %mm6              C lshift
        !            96:        negl    %ecx
        !            97:
        !            98:        psllq   %mm6, %mm5
        !            99:        addl    $32, %ecx               C 32-shift
        !           100:
        !           101:        movd    %ecx, %mm7
        !           102:        psrlq   $32, %mm5               C retval
        !           103:
        !           104:
        !           105: L(simple_top):
        !           106:        C eax   counter, limbs, size-1 down to 1
        !           107:        C ebx   src
        !           108:        C ecx
        !           109:        C edx   dst
        !           110:        C esi
        !           111:        C edi
        !           112:        C
        !           113:        C mm0   scratch
        !           114:        C mm5   return value
        !           115:        C mm6   shift
        !           116:        C mm7   32-shift
        !           117:
        !           118:        movq    -4(%ebx,%eax,4), %mm0   C src limbs eax-1 (low dword) and eax (high dword)
        !           119:        decl    %eax
        !           120:
        !           121:        psrlq   %mm7, %mm0              C low dword is now the shifted dst limb
        !           122:
        !           123:        C
        !           124:
        !           125:        movd    %mm0, 4(%edx,%eax,4)
        !           126:        jnz     L(simple_top)           C flags still from decl
        !           127:
        !           128:
        !           129:        movd    (%ebx), %mm0            C src low limb
        !           130:
        !           131:        movd    %mm5, %eax
        !           132:        psllq   %mm6, %mm0
        !           133:
        !           134:        popl    %edi
        !           135:        popl    %ebx
        !           136:
        !           137:        movd    %mm0, (%edx)            C dst low limb
        !           138:
        !           139:        emms
        !           140:
        !           141:        ret
        !           142:
        !           143:
        !           144: C -----------------------------------------------------------------------------
        !           145:        ALIGN(8)
        !           146: L(unroll):
        !           147:        C eax   size
        !           148:        C ebx   src
        !           149:        C ecx   shift
        !           150:        C edx   dst
        !           151:        C esi
        !           152:        C edi
        !           153:        C ebp
        !           154: deflit(`FRAME',8)
        !           155:
        !           156:        movd    -4(%ebx,%eax,4), %mm5   C src high limb
        !           157:        leal    (%ebx,%eax,4), %edi     C end of src, &src[size]
        !           158:
        !           159:        movd    %ecx, %mm6              C lshift
        !           160:        andl    $4, %edi                C zero if end of src is 0mod8
        !           161:
        !           162:        psllq   %mm6, %mm5
        !           163:        jz      L(start_src_aligned)
        !           164:
        !           165:
        !           166:        C src isn't aligned, process high limb separately (marked xxx) to
        !           167:        C make it so.
        !           168:        C
        !           169:        C  source     -8(ebx,%eax,4)
        !           170:        C                  |
        !           171:        C  +-------+-------+-------+--
        !           172:        C  |               |
        !           173:        C  +-------+-------+-------+--
        !           174:        C        0mod8   4mod8   0mod8
        !           175:        C
        !           176:        C  dest
        !           177:        C     -4(edx,%eax,4)
        !           178:        C          |
        !           179:        C  +-------+-------+--
        !           180:        C  |  xxx  |       |
        !           181:        C  +-------+-------+--
        !           182:
        !           183:        movq    -8(%ebx,%eax,4), %mm0   C unaligned load
        !           184:
        !           185:        psllq   %mm6, %mm0
        !           186:        decl    %eax
        !           187:
        !           188:        psrlq   $32, %mm0
        !           189:
        !           190:        C
        !           191:
        !           192:        movd    %mm0, (%edx,%eax,4)     C dst high limb (eax already decremented)
        !           193: L(start_src_aligned):
        !           194:
        !           195:        movq    -8(%ebx,%eax,4), %mm1   C src high qword
        !           196:        leal    (%edx,%eax,4), %edi     C end of remaining dst
        !           197:
        !           198:        andl    $4, %edi                C zero if end of dst is 0mod8
        !           199:        psrlq   $32, %mm5               C return value
        !           200:
        !           201:        movq    -16(%ebx,%eax,4), %mm3  C src second highest qword
        !           202:        jz      L(start_dst_aligned)
        !           203:
        !           204:        C dst isn't aligned, subtract 4 to make it so, and pretend the shift
        !           205:        C is 32 bits extra.  High limb of dst (marked xxx) handled here
        !           206:        C separately.
        !           207:        C
        !           208:        C  source     -8(ebx,%eax,4)
        !           209:        C                  |
        !           210:        C  +-------+-------+--
        !           211:        C  |      mm1      |
        !           212:        C  +-------+-------+--
        !           213:        C                0mod8   4mod8
        !           214:        C
        !           215:        C  dest
        !           216:        C     -4(edx,%eax,4)
        !           217:        C          |
        !           218:        C  +-------+-------+-------+--
        !           219:        C  |  xxx  |               |
        !           220:        C  +-------+-------+-------+--
        !           221:        C        0mod8   4mod8   0mod8
        !           222:
        !           223:        movq    %mm1, %mm0
        !           224:        addl    $32, %ecx               C new shift
        !           225:
        !           226:        psllq   %mm6, %mm0
        !           227:
        !           228:        movd    %ecx, %mm6
        !           229:        psrlq   $32, %mm0
        !           230:
        !           231:        C wasted cycle here waiting for %mm0
        !           232:
        !           233:        movd    %mm0, -4(%edx,%eax,4)
        !           234:        subl    $4, %edx
        !           235: L(start_dst_aligned):
        !           236:
        !           237:
        !           238:        psllq   %mm6, %mm1
        !           239:        negl    %ecx                    C -shift
        !           240:
        !           241:         addl    $64, %ecx              C 64-shift
        !           242:        movq    %mm3, %mm2
        !           243:
        !           244:         movd    %ecx, %mm7
        !           245:        subl    $8, %eax                C size-8
        !           246:
        !           247:        psrlq   %mm7, %mm3
        !           248:
        !           249:        por     %mm1, %mm3              C mm3 ready to store
        !           250:        jc      L(finish)               C carry from subl, fewer than 8 limbs
        !           251:
        !           252:
        !           253:        C The comments in mpn_rshift apply here too.
        !           254:
        !           255:        ALIGN(8)
        !           256: L(unroll_loop):
        !           257:        C eax   counter, limbs
        !           258:        C ebx   src
        !           259:        C ecx
        !           260:        C edx   dst
        !           261:        C esi
        !           262:        C edi
        !           263:        C
        !           264:        C mm0
        !           265:        C mm1
        !           266:        C mm2   src qword from 16(%ebx,%eax,4)
        !           267:        C mm3   dst qword ready to store to 24(%edx,%eax,4)
        !           268:        C
        !           269:        C mm5   return value
        !           270:        C mm6   lshift
        !           271:        C mm7   rshift
        !           272:
        !           273:        movq    8(%ebx,%eax,4), %mm0
        !           274:        psllq   %mm6, %mm2
        !           275:
        !           276:        movq    %mm0, %mm1
        !           277:        psrlq   %mm7, %mm0
        !           278:
        !           279:        movq    %mm3, 24(%edx,%eax,4)   C prev
        !           280:        por     %mm2, %mm0
        !           281:
        !           282:        movq    (%ebx,%eax,4), %mm3     C
        !           283:        psllq   %mm6, %mm1              C
        !           284:
        !           285:        movq    %mm0, 16(%edx,%eax,4)
        !           286:        movq    %mm3, %mm2              C
        !           287:
        !           288:        psrlq   %mm7, %mm3              C
        !           289:        subl    $4, %eax
        !           290:
        !           291:        por     %mm1, %mm3              C
        !           292:        jnc     L(unroll_loop)          C flags still from subl
        !           293:
        !           294:
        !           295:
        !           296: L(finish):
        !           297:        C eax   -4 to -1 representing respectively 0 to 3 limbs remaining
        !           298:
        !           299:        testb   $2, %al                 C set for -2 and -1, i.e. 2 or 3 limbs
        !           300:
        !           301:        jz      L(finish_no_two)
        !           302:
        !           303:        movq    8(%ebx,%eax,4), %mm0
        !           304:        psllq   %mm6, %mm2
        !           305:
        !           306:        movq    %mm0, %mm1
        !           307:        psrlq   %mm7, %mm0
        !           308:
        !           309:        movq    %mm3, 24(%edx,%eax,4)   C prev
        !           310:        por     %mm2, %mm0
        !           311:
        !           312:        movq    %mm1, %mm2
        !           313:        movq    %mm0, %mm3
        !           314:
        !           315:        subl    $2, %eax
        !           316: L(finish_no_two):
        !           317:
        !           318:
        !           319:        C eax   -4 or -3 representing respectively 0 or 1 limbs remaining
        !           320:        C
        !           321:        C mm2   src prev qword, from 16(%ebx,%eax,4)
        !           322:        C mm3   dst qword, for 24(%edx,%eax,4)
        !           323:
        !           324:        testb   $1, %al                 C set for -3, i.e. 1 limb remaining
        !           325:        movd    %mm5, %eax      C retval
        !           326:
        !           327:        popl    %edi
        !           328:        jz      L(finish_zero)          C flags still from testb
        !           329:
        !           330:
        !           331:        C One extra src limb, destination was aligned.
        !           332:        C
        !           333:        C                 source                  ebx
        !           334:        C                 --+---------------+-------+
        !           335:        C                   |      mm2      |       |
        !           336:        C                 --+---------------+-------+
        !           337:        C
        !           338:        C dest         edx+12           edx+4     edx
        !           339:        C --+---------------+---------------+-------+
        !           340:        C   |      mm3      |               |       |
        !           341:        C --+---------------+---------------+-------+
        !           342:        C
        !           343:        C mm6 = shift
        !           344:        C mm7 = ecx = 64-shift
        !           345:
        !           346:
        !           347:        C One extra src limb, destination was unaligned.
        !           348:        C
        !           349:        C                 source                  ebx
        !           350:        C                 --+---------------+-------+
        !           351:        C                   |      mm2      |       |
        !           352:        C                 --+---------------+-------+
        !           353:        C
        !           354:        C         dest         edx+12           edx+4
        !           355:        C         --+---------------+---------------+
        !           356:        C           |      mm3      |               |
        !           357:        C         --+---------------+---------------+
        !           358:        C
        !           359:        C mm6 = shift+32
        !           360:        C mm7 = ecx = 64-(shift+32)
        !           361:
        !           362:
        !           363:        C In both cases there's one extra limb of src to fetch and combine
        !           364:        C with mm2 to make a qword at 4(%edx), and in the aligned case
        !           365:        C there's an extra limb of dst to be formed from that extra src limb
        !           366:        C left shifted.
        !           367:
        !           368:
        !           369:         movd    (%ebx), %mm0
        !           370:        psllq   %mm6, %mm2
        !           371:
        !           372:        movq    %mm3, 12(%edx)
        !           373:        psllq   $32, %mm0
        !           374:
        !           375:         movq    %mm0, %mm1
        !           376:         psrlq   %mm7, %mm0
        !           377:
        !           378:         por     %mm2, %mm0
        !           379:         psllq   %mm6, %mm1
        !           380:
        !           381:        movq    %mm0, 4(%edx)
        !           382:        psrlq   $32, %mm1
        !           383:
        !           384:         andl   $32, %ecx               C nonzero only when ecx is 64-shift (dst aligned)
        !           385:        popl    %ebx
        !           386:
        !           387:        jz      L(finish_one_unaligned)
        !           388:
        !           389:        movd    %mm1, (%edx)            C dst low limb, aligned case only
        !           390: L(finish_one_unaligned):
        !           391:
        !           392:        emms
        !           393:
        !           394:         ret
        !           395:
        !           396:
        !           397: L(finish_zero):
        !           398:
        !           399:        C No extra src limbs, destination was aligned.
        !           400:        C
        !           401:        C                 source          ebx
        !           402:        C                 --+---------------+
        !           403:        C                   |      mm2      |
        !           404:        C                 --+---------------+
        !           405:        C
        !           406:        C dest          edx+8             edx
        !           407:        C --+---------------+---------------+
        !           408:        C   |      mm3      |               |
        !           409:        C --+---------------+---------------+
        !           410:        C
        !           411:        C mm6 = shift
        !           412:        C mm7 = ecx = 64-shift
        !           413:
        !           414:
        !           415:        C No extra src limbs, destination was unaligned.
        !           416:        C
        !           417:        C               source            ebx
        !           418:        C                 --+---------------+
        !           419:        C                   |      mm2      |
        !           420:        C                 --+---------------+
        !           421:        C
        !           422:        C         dest          edx+8   edx+4
        !           423:        C         --+---------------+-------+
        !           424:        C           |      mm3      |       |
        !           425:        C         --+---------------+-------+
        !           426:        C
        !           427:        C mm6 = shift+32
        !           428:        C mm7 = ecx = 64-(shift+32)
        !           429:
        !           430:
        !           431:        C The movd for the unaligned case writes the same data to 4(%edx)
        !           432:        C that the movq does for the aligned case.
        !           433:
        !           434:
        !           435:        movq    %mm3, 8(%edx)
        !           436:        andl    $32, %ecx               C nonzero only when ecx is 64-shift (dst aligned)
        !           437:
        !           438:        psllq   %mm6, %mm2
        !           439:        jz      L(finish_zero_unaligned)        C flags still from andl
        !           440:
        !           441:        movq    %mm2, (%edx)            C aligned case, dst low qword
        !           442: L(finish_zero_unaligned):
        !           443:
        !           444:        psrlq   $32, %mm2
        !           445:        popl    %ebx
        !           446:
        !           447:        movd    %mm5, %eax      C retval (redundant, eax was already loaded from mm5 before the jump here)
        !           448:
        !           449:        movd    %mm2, 4(%edx)
        !           450:
        !           451:        emms
        !           452:
        !           453:        ret
        !           454:
        !           455: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>