OpenXM_contrib/gmp/mpn/x86/pentium/mmx/rshift.asm - annotate

Return to rshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/rshift.asm, Revision 1.1

1.1     ! maekawa     1: dnl  Intel P5 mpn_rshift -- mpn right shift.
        !             2: dnl
        !             3: dnl  P5: 1.75 cycles/limb.
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            30: C                       unsigned shift);
        !            31: C
        !            32: C Shift src,size right by shift many bits and store the result in dst,size.
        !            33: C Zeros are shifted in at the left.  Return the bits shifted out at the
        !            34: C right.
        !            35: C
        !            36: C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
        !            37: C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
        !            38: C
        !            39: C Full speed depends on source and destination being aligned.  Unaligned mmx
        !            40: C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
        !            41: C setups and finish-ups are done to ensure alignment for the loop.
        !            42: C
        !            43: C MMX shifts work out a bit faster even for the simple loop.
        !            44:
        !            45: defframe(PARAM_SHIFT,16)
        !            46: defframe(PARAM_SIZE, 12)
        !            47: defframe(PARAM_SRC,  8)
        !            48: defframe(PARAM_DST,  4)
        !            49: deflit(`FRAME',0)
        !            50:
        !            51: dnl  Minimum 5: L(unroll) may use up one limb aligning src, then always loads two qwords (4 limbs).
        !            52: deflit(UNROLL_THRESHOLD, 5)
        !            53:
        !            54:        .text
        !            55:        ALIGN(8)
        !            56:
        !            57: PROLOGUE(mpn_rshift)
        !            58:
        !            59:        pushl   %ebx                    C saved here, restored before each ret
        !            60:        pushl   %edi
        !            61: deflit(`FRAME',8)
        !            62:
        !            63:        movl    PARAM_SIZE, %eax        C eax = size
        !            64:        movl    PARAM_DST, %edx         C edx = dst
        !            65:
        !            66:        movl    PARAM_SRC, %ebx         C ebx = src
        !            67:        movl    PARAM_SHIFT, %ecx       C ecx = shift
        !            68:
        !            69:        cmp     $UNROLL_THRESHOLD, %eax
        !            70:        jae     L(unroll)               C size >= UNROLL_THRESHOLD, unsigned compare
        !            71:
        !            72:        decl    %eax                    C size-1, ZF set if size was 1
        !            73:        movl    (%ebx), %edi            C src low limb
        !            74:
        !            75:        jnz     L(simple)               C ZF from decl, movl leaves flags alone
        !            76:
        !            77:        shrdl(  %cl, %edi, %eax)        C eax was decremented to zero, becomes retval
        !            78:
        !            79:        shrl    %cl, %edi               C src[0] >> shift
        !            80:
        !            81:        movl    %edi, (%edx)            C dst low limb
        !            82:        popl    %edi                    C risk of data cache bank clash
        !            83:
        !            84:        popl    %ebx
        !            85:
        !            86:        ret
        !            87:
        !            88:
        !            89: C -----------------------------------------------------------------------------
        !            90:        ALIGN(8)
        !            91: L(simple):
        !            92:        C eax   size-1
        !            93:        C ebx   src
        !            94:        C ecx   shift
        !            95:        C edx   dst
        !            96:        C esi
        !            97:        C edi
        !            98:        C ebp
        !            99: deflit(`FRAME',8)
        !           100:
        !           101:        movd    (%ebx), %mm5            C src[0]
        !           102:        leal    (%ebx,%eax,4), %ebx     C &src[size-1]
        !           103:
        !           104:        movd    %ecx, %mm6              C rshift
        !           105:        leal    -4(%edx,%eax,4), %edx   C &dst[size-2]
        !           106:
        !           107:        psllq   $32, %mm5               C src[0] in high dword, low dword zero
        !           108:        negl    %eax                    C -(size-1)
        !           109:
        !           110:
        !           111: C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
        !           112: C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
        !           113: C cycles and would be 8 in a simple loop.  Using mmx helps the return value
        !           114: C and last limb calculations too.
        !           115:
        !           116: L(simple_top):
        !           117:        C eax   counter, limbs, negative
        !           118:        C ebx   &src[size-1]
        !           119:        C ecx   shift (not used in the loop)
        !           120:        C edx   &dst[size-2]
        !           121:        C
        !           122:        C mm0   scratch
        !           123:        C mm5   return value
        !           124:        C mm6   shift
        !           125:
        !           126:        movq    (%ebx,%eax,4), %mm0     C two src limbs
        !           127:        incl    %eax
        !           128:
        !           129:        psrlq   %mm6, %mm0
        !           130:
        !           131:        movd    %mm0, (%edx,%eax,4)     C low dword is the dst limb
        !           132:        jnz     L(simple_top)           C flags from incl, mmx leaves them alone
        !           133:
        !           134:
        !           135:        movd    (%ebx), %mm0            C src[size-1], high dword zero
        !           136:        psrlq   %mm6, %mm5              C return value
        !           137:
        !           138:        psrlq   %mm6, %mm0
        !           139:        popl    %edi
        !           140:
        !           141:        movd    %mm5, %eax              C retval
        !           142:        popl    %ebx
        !           143:
        !           144:        movd    %mm0, 4(%edx)           C dst[size-1]
        !           145:
        !           146:        emms
        !           147:
        !           148:        ret
        !           149:
        !           150:
        !           151: C -----------------------------------------------------------------------------
        !           152:        ALIGN(8)
        !           153: L(unroll):
        !           154:        C eax   size
        !           155:        C ebx   src
        !           156:        C ecx   shift
        !           157:        C edx   dst
        !           158:        C esi
        !           159:        C edi
        !           160:        C ebp
        !           161: deflit(`FRAME',8)
        !           162:
        !           163:        movd    (%ebx), %mm5            C src[0]
        !           164:        movl    $4, %edi                C mask to test for 4mod8
        !           165:
        !           166:        movd    %ecx, %mm6              C rshift
        !           167:        testl   %edi, %ebx              C src 4mod8 ?
        !           168:
        !           169:        psllq   $32, %mm5
        !           170:        jz      L(start_src_aligned)
        !           171:
        !           172:
        !           173:        C src isn't aligned, process low limb separately (marked xxx) and
        !           174:        C step src and dst by one limb, making src aligned.
        !           175:        C
        !           176:        C source                  ebx
        !           177:        C --+-------+-------+-------+
        !           178:        C           |          xxx  |
        !           179:        C --+-------+-------+-------+
        !           180:        C         4mod8   0mod8   4mod8
        !           181:        C
        !           182:        C         dest            edx
        !           183:        C         --+-------+-------+
        !           184:        C           |       |  xxx  |
        !           185:        C         --+-------+-------+
        !           186:
        !           187:        movq    (%ebx), %mm0            C unaligned load
        !           188:
        !           189:        psrlq   %mm6, %mm0
        !           190:        addl    $4, %ebx
        !           191:
        !           192:        decl    %eax                    C one limb less to do
        !           193:
        !           194:        movd    %mm0, (%edx)            C dst low limb
        !           195:        addl    $4, %edx
        !           196: L(start_src_aligned):
        !           197:
        !           198:
        !           199:        movq    (%ebx), %mm1            C first aligned src qword
        !           200:        testl   %edi, %edx              C dst 4mod8 ?
        !           201:
        !           202:        psrlq   %mm6, %mm5              C retval
        !           203:        jz      L(start_dst_aligned)
        !           204:
        !           205:        C dst isn't aligned, add 4 to make it so, and pretend the shift is
        !           206:        C 32 bits extra.  Low limb of dst (marked xxx) handled here
        !           207:        C separately.
        !           208:        C
        !           209:        C          source          ebx
        !           210:        C          --+-------+-------+
        !           211:        C            |      mm1      |
        !           212:        C          --+-------+-------+
        !           213:        C                  4mod8   0mod8
        !           214:        C
        !           215:        C  dest                    edx
        !           216:        C  --+-------+-------+-------+
        !           217:        C                    |  xxx  |
        !           218:        C  --+-------+-------+-------+
        !           219:        C          4mod8   0mod8   4mod8
        !           220:
        !           221:        movq    %mm1, %mm0
        !           222:        addl    $32, %ecx               C new shift
        !           223:
        !           224:        psrlq   %mm6, %mm0
        !           225:
        !           226:        movd    %ecx, %mm6              C rshift+32 from here on
        !           227:
        !           228:        movd    %mm0, (%edx)
        !           229:        addl    $4, %edx
        !           230: L(start_dst_aligned):
        !           231:
        !           232:
        !           233:        movq    8(%ebx), %mm3
        !           234:        negl    %ecx
        !           235:
        !           236:        movq    %mm3, %mm2              C mm2 src qword
        !           237:         addl    $64, %ecx              C 64-rshift
        !           238:
        !           239:         movd    %ecx, %mm7             C lshift
        !           240:        psrlq   %mm6, %mm1
        !           241:
        !           242:        leal    -12(%ebx,%eax,4), %ebx
        !           243:        leal    -20(%edx,%eax,4), %edx
        !           244:
        !           245:        psllq   %mm7, %mm3
        !           246:        subl    $7, %eax                C size-7
        !           247:
        !           248:        por     %mm1, %mm3              C mm3 ready to store
        !           249:        negl    %eax                    C -(size-7)
        !           250:
        !           251:        jns     L(finish)               C 7 or fewer limbs, no loop iterations
        !           252:
        !           253:
        !           254:        C This loop is the important bit, the rest is just support.  Careful
        !           255:        C instruction scheduling achieves the claimed 1.75 c/l.  The
        !           256:        C relevant parts of the pairing rules are:
        !           257:        C
        !           258:        C - mmx loads and stores execute only in the U pipe
        !           259:        C - only one mmx shift in a pair
        !           260:        C - wait one cycle before storing an mmx register result
        !           261:        C - the usual address generation interlock
        !           262:        C
        !           263:        C Two qword calculations are slightly interleaved.  The instructions
        !           264:        C marked "C" belong to the second qword, and the "C prev" one is for
        !           265:        C the second qword from the previous iteration.
        !           266:
        !           267:        ALIGN(8)
        !           268: L(unroll_loop):
        !           269:        C eax   counter, limbs, negative
        !           270:        C ebx   &src[size-3], ie. src+4*size-12 bytes (after alignment steps)
        !           271:        C ecx   lshift, same value as mm7
        !           272:        C edx   &dst[size-5], ie. dst+4*size-20 bytes (after alignment steps)
        !           273:        C esi
        !           274:        C edi
        !           275:        C
        !           276:        C mm0   scratch
        !           277:        C mm1   scratch
        !           278:        C mm2   src qword from -8(%ebx,%eax,4)
        !           279:        C mm3   dst qword ready to store to -8(%edx,%eax,4)
        !           280:        C
        !           281:        C mm5   return value
        !           282:        C mm6   rshift
        !           283:        C mm7   lshift
        !           284:
        !           285:        movq    (%ebx,%eax,4), %mm0
        !           286:        psrlq   %mm6, %mm2
        !           287:
        !           288:        movq    %mm0, %mm1
        !           289:        psllq   %mm7, %mm0
        !           290:
        !           291:        movq    %mm3, -8(%edx,%eax,4)   C prev
        !           292:        por     %mm2, %mm0
        !           293:
        !           294:        movq    8(%ebx,%eax,4), %mm3    C
        !           295:        psrlq   %mm6, %mm1              C
        !           296:
        !           297:        movq    %mm0, (%edx,%eax,4)
        !           298:        movq    %mm3, %mm2              C
        !           299:
        !           300:        psllq   %mm7, %mm3              C
        !           301:        addl    $4, %eax
        !           302:
        !           303:        por     %mm1, %mm3              C
        !           304:        js      L(unroll_loop)
        !           305:
        !           306:
        !           307: L(finish):
        !           308:        C eax   0 to 3 representing respectively 3 to 0 limbs remaining
        !           309:
        !           310:        testb   $2, %al                 C bit 1 clear means a qword still to do
        !           311:
        !           312:        jnz     L(finish_no_two)
        !           313:
        !           314:        movq    (%ebx,%eax,4), %mm0
        !           315:        psrlq   %mm6, %mm2
        !           316:
        !           317:        movq    %mm0, %mm1
        !           318:        psllq   %mm7, %mm0
        !           319:
        !           320:        movq    %mm3, -8(%edx,%eax,4)   C prev
        !           321:        por     %mm2, %mm0
        !           322:
        !           323:        movq    %mm1, %mm2              C src qword, now the prev one
        !           324:        movq    %mm0, %mm3              C dst qword still to store
        !           325:
        !           326:        addl    $2, %eax
        !           327: L(finish_no_two):
        !           328:
        !           329:
        !           330:        C eax   2 or 3 representing respectively 1 or 0 limbs remaining
        !           331:        C
        !           332:        C mm2   src prev qword, from -8(%ebx,%eax,4)
        !           333:        C mm3   dst qword, for -8(%edx,%eax,4)
        !           334:
        !           335:        testb   $1, %al                 C bit 0 clear means one limb to do
        !           336:        popl    %edi
        !           337:
        !           338:        movd    %mm5, %eax      C retval
        !           339:        jnz     L(finish_zero)          C flags from testb, popl and movd keep them
        !           340:
        !           341:
        !           342:        C One extra limb, destination was aligned.
        !           343:        C
        !           344:        C source                ebx
        !           345:        C +-------+---------------+--
        !           346:        C |       |      mm2      |
        !           347:        C +-------+---------------+--
        !           348:        C
        !           349:        C dest                                  edx
        !           350:        C +-------+---------------+---------------+--
        !           351:        C |       |               |      mm3      |
        !           352:        C +-------+---------------+---------------+--
        !           353:        C
        !           354:        C mm6 = shift
        !           355:        C mm7 = ecx = 64-shift
        !           356:
        !           357:
        !           358:        C One extra limb, destination was unaligned.
        !           359:        C
        !           360:        C source                ebx
        !           361:        C +-------+---------------+--
        !           362:        C |       |      mm2      |
        !           363:        C +-------+---------------+--
        !           364:        C
        !           365:        C dest                          edx
        !           366:        C +---------------+---------------+--
        !           367:        C |               |      mm3      |
        !           368:        C +---------------+---------------+--
        !           369:        C
        !           370:        C mm6 = shift+32
        !           371:        C mm7 = ecx = 64-(shift+32)
        !           372:
        !           373:
        !           374:        C In both cases there's one extra limb of src to fetch and combine
        !           375:        C with mm2 to make a qword at 8(%edx), and in the aligned case
        !           376:        C there's a further extra limb of dst to be formed.
        !           377:
        !           378:
        !           379:         movd    8(%ebx), %mm0          C the extra src limb, high dword zero
        !           380:        psrlq   %mm6, %mm2
        !           381:
        !           382:         movq    %mm0, %mm1
        !           383:         psllq   %mm7, %mm0
        !           384:
        !           385:        movq    %mm3, (%edx)
        !           386:         por     %mm2, %mm0
        !           387:
        !           388:         psrlq   %mm6, %mm1
        !           389:         andl   $32, %ecx               C lshift bit 5, set only if dst was aligned (shift 1 to 31)
        !           390:
        !           391:        popl    %ebx
        !           392:        jz      L(finish_one_unaligned)
        !           393:
        !           394:         C dst was aligned, must store one extra limb
        !           395:        movd    %mm1, 16(%edx)
        !           396: L(finish_one_unaligned):
        !           397:
        !           398:        movq    %mm0, 8(%edx)
        !           399:
        !           400:        emms
        !           401:
        !           402:         ret
        !           403:
        !           404:
        !           405: L(finish_zero):
        !           406:
        !           407:        C No extra limbs, destination was aligned.
        !           408:        C
        !           409:        C source        ebx
        !           410:        C +---------------+--
        !           411:        C |      mm2      |
        !           412:        C +---------------+--
        !           413:        C
        !           414:        C dest                        edx+4
        !           415:        C +---------------+---------------+--
        !           416:        C |               |      mm3      |
        !           417:        C +---------------+---------------+--
        !           418:        C
        !           419:        C mm6 = shift
        !           420:        C mm7 = ecx = 64-shift
        !           421:
        !           422:
        !           423:        C No extra limbs, destination was unaligned.
        !           424:        C
        !           425:        C source        ebx
        !           426:        C +---------------+--
        !           427:        C |      mm2      |
        !           428:        C +---------------+--
        !           429:        C
        !           430:        C dest                edx+4
        !           431:        C +-------+---------------+--
        !           432:        C |       |      mm3      |
        !           433:        C +-------+---------------+--
        !           434:        C
        !           435:        C mm6 = shift+32
        !           436:        C mm7 = 64-(shift+32)
        !           437:
        !           438:
        !           439:        C The movd for the unaligned case is clearly the same data as the
        !           440:        C movq for the aligned case, it's just a choice between whether one
        !           441:        C or two limbs should be written.
        !           442:
        !           443:
        !           444:        movq    %mm3, 4(%edx)
        !           445:        psrlq   %mm6, %mm2
        !           446:
        !           447:        movd    %mm2, 12(%edx)          C one limb, enough for the unaligned case
        !           448:        andl    $32, %ecx               C lshift bit 5, set only if dst was aligned (shift 1 to 31)
        !           449:
        !           450:        popl    %ebx
        !           451:        jz      L(finish_zero_unaligned)
        !           452:
        !           453:        movq    %mm2, 12(%edx)          C two limbs, the aligned case
        !           454: L(finish_zero_unaligned):
        !           455:
        !           456:        emms
        !           457:
        !           458:        ret
        !           459:
        !           460: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>