OpenXM_contrib/gmp/mpn/x86/k7/mmx/rshift.asm - annotate

Return to rshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/rshift.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_rshift -- mpn right shift.
        !             2: dnl
        !             3: dnl  K7: 1.21 cycles/limb (at 16 limbs/loop).
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28: dnl  UNROLL_COUNT is the number of limbs handled per pass of the unrolled loop.
        !            29: dnl  K7: UNROLL_COUNT cycles/limb
        !            30: dnl           4           1.51
        !            31: dnl           8           1.26
        !            32: dnl          16           1.21
        !            33: dnl          32           1.2
        !            34: dnl  Maximum possible with the current code is 64.
        !            35:
        !            36: deflit(UNROLL_COUNT, 16)
        !            37:
        !            38:
        !            39: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            40: C                       unsigned shift);
        !            41: C
        !            42: C Shift src,size right by shift many bits and store the result in dst,size.
        !            43: C Zeros are shifted in at the left.  The bits shifted out at the right are
        !            44: C the return value.
        !            45: C
        !            46: C This code uses 64-bit MMX operations, which makes it possible to handle
        !            47: C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
        !            48: C code, on the other hand, suffers from shrd being a vector path decode and
        !            49: C running at 3 cycles back-to-back.
        !            50: C
        !            51: C Full speed depends on source and destination being aligned, and some hairy
        !            52: C setups and finish-ups are done to arrange this for the loop.
        !            53: C Sizes of UNROLL_THRESHOLD limbs or more use the unrolled loop; both the PIC and non-PIC values are currently 10.
        !            54: ifdef(`PIC',`
        !            55: deflit(UNROLL_THRESHOLD, 10)
        !            56: ',`
        !            57: deflit(UNROLL_THRESHOLD, 10)
        !            58: ')
        !            59: C The four stack arguments, with the offsets given to defframe.
        !            60: defframe(PARAM_SHIFT,16)
        !            61: defframe(PARAM_SIZE, 12)
        !            62: defframe(PARAM_SRC,  8)
        !            63: defframe(PARAM_DST,  4)
        !            64: C Save slots for edi, esi and ebx; SAVE_SIZE bytes are reserved for them by the subl in the prologue.
        !            65: defframe(SAVE_EDI, -4)
        !            66: defframe(SAVE_ESI, -8)
        !            67: defframe(SAVE_EBX, -12)
        !            68: deflit(SAVE_SIZE, 12)
        !            69:
        !            70:        .text
        !            71:        ALIGN(32)
        !            72:
        !            73: PROLOGUE(mpn_rshift)
        !            74: deflit(`FRAME',0)
        !            75:        C Load the arguments and split off the size==1 case.
        !            76:        movl    PARAM_SIZE, %eax
        !            77:        movl    PARAM_SRC, %edx
        !            78:        subl    $SAVE_SIZE, %esp
        !            79: deflit(`FRAME',SAVE_SIZE)
        !            80:
        !            81:        movl    PARAM_SHIFT, %ecx
        !            82:        movl    %edi, SAVE_EDI
        !            83:
        !            84:        movl    PARAM_DST, %edi
        !            85:        decl    %eax
        !            86:        jnz     L(more_than_one_limb)
        !            87:        C size==1: only integer registers are used on this path, so no emms is needed before the ret.
        !            88:        movl    (%edx), %edx            C src limb
        !            89:
        !            90:        shrdl(  %cl, %edx, %eax)        C eax was decremented to zero
        !            91:        C eax now has the bits shifted out of the limb at its high end, this is the return value.
        !            92:        shrl    %cl, %edx
        !            93:
        !            94:        movl    %edx, (%edi)            C dst limb
        !            95:        movl    SAVE_EDI, %edi
        !            96:        addl    $SAVE_SIZE, %esp
        !            97:
        !            98:        ret
        !            99:
        !           100:
        !           101: C -----------------------------------------------------------------------------
        !           102: L(more_than_one_limb):
        !           103:        C eax   size-1
        !           104:        C ebx
        !           105:        C ecx   shift
        !           106:        C edx   src
        !           107:        C esi
        !           108:        C edi   dst
        !           109:        C ebp
        !           110:        C mm6 and mm5 are loaded here for both the simple loop and the unrolled code.
        !           111:        movd    PARAM_SHIFT, %mm6       C rshift
        !           112:        movd    (%edx), %mm5            C src low limb
        !           113:        cmp     $UNROLL_THRESHOLD-1, %eax
        !           114:        C Sizes of UNROLL_THRESHOLD limbs or more go to the unrolled loop.
        !           115:        jae     L(unroll)
        !           116:        leal    (%edx,%eax,4), %edx     C &src[size-1]
        !           117:        leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
        !           118:
        !           119:        movd    (%edx), %mm4            C src high limb
        !           120:        negl    %eax
        !           121:        C Each pass loads the limb pair src[i],src[i+1] as a qword, shifts it, and
        !           122:        C stores the low 32 bits as dst[i].  This produces dst[0] up to dst[size-2].
        !           123: L(simple_top):
        !           124:        C eax   loop counter, limbs, negative
        !           125:        C ebx
        !           126:        C ecx   shift
        !           127:        C edx   &src[size-1]
        !           128:        C esi
        !           129:        C edi   &dst[size-2]
        !           130:        C ebp
        !           131:        C
        !           132:        C mm0   scratch
        !           133:        C mm4   src high limb
        !           134:        C mm5   src low limb
        !           135:        C mm6   shift
        !           136:
        !           137:        movq    (%edx,%eax,4), %mm0
        !           138:        incl    %eax
        !           139:
        !           140:        psrlq   %mm6, %mm0
        !           141:        C The jnz below tests the flags left by incl, the MMX instructions do not alter them.
        !           142:        movd    %mm0, (%edi,%eax,4)
        !           143:        jnz     L(simple_top)
        !           144:        C The high limb dst[size-1] is src[size-1] shifted right.  The return value is
        !           145:        C the low limb moved to the upper half of mm5 and shifted, read from the lower half.
        !           146:        psllq   $32, %mm5
        !           147:        psrlq   %mm6, %mm4
        !           148:
        !           149:        psrlq   %mm6, %mm5
        !           150:        movd    %mm4, 4(%edi)           C dst high limb
        !           151:
        !           152:        movd    %mm5, %eax              C return value
        !           153:
        !           154:        movl    SAVE_EDI, %edi
        !           155:        addl    $SAVE_SIZE, %esp
        !           156:        emms
        !           157:
        !           158:        ret
        !           159:
        !           160:
        !           161: C -----------------------------------------------------------------------------
        !           162:        ALIGN(16)
        !           163: L(unroll):
        !           164:        C eax   size-1
        !           165:        C ebx
        !           166:        C ecx   shift
        !           167:        C edx   src
        !           168:        C esi
        !           169:        C edi   dst
        !           170:        C ebp
        !           171:        C
        !           172:        C mm5   src low limb
        !           173:        C mm6   rshift
        !           174:        C Bit 2 of the src address distinguishes 0mod8 from 4mod8, see the diagram below.
        !           175:        testb   $4, %dl
        !           176:        movl    %esi, SAVE_ESI
        !           177:        movl    %ebx, SAVE_EBX
        !           178:
        !           179:        psllq   $32, %mm5
        !           180:        jz      L(start_src_aligned)
        !           181:
        !           182:
        !           183:        C src isn't aligned, process low limb separately (marked xxx) and
        !           184:        C step src and dst by one limb, making src aligned.
        !           185:        C
        !           186:        C source                  edx
        !           187:        C --+-------+-------+-------+
        !           188:        C           |          xxx  |
        !           189:        C --+-------+-------+-------+
        !           190:        C         4mod8   0mod8   4mod8
        !           191:        C
        !           192:        C         dest            edi
        !           193:        C         --+-------+-------+
        !           194:        C           |       |  xxx  |
        !           195:        C         --+-------+-------+
        !           196:
        !           197:        movq    (%edx), %mm0            C src low two limbs
        !           198:        addl    $4, %edx
        !           199:        movl    %eax, PARAM_SIZE        C size-1
        !           200:        C PARAM_SIZE, now one limb smaller, is reloaded into eax just before the computed jump.
        !           201:        addl    $4, %edi
        !           202:        decl    %eax                    C size-2 is new size-1
        !           203:
        !           204:        psrlq   %mm6, %mm0
        !           205:        movl    %edi, PARAM_DST         C new dst
        !           206:
        !           207:        movd    %mm0, -4(%edi)
        !           208: L(start_src_aligned):
        !           209:        C mm1 is the first qword of the aligned src.  mm5 becomes the return value here,
        !           210:        C its low 32 bits are read out with movd after the unrolled loop.
        !           211:        movq    (%edx), %mm1            C src low two limbs
        !           212:        decl    %eax                    C size-2, two last limbs handled at end
        !           213:        testl   $4, %edi
        !           214:
        !           215:        psrlq   %mm6, %mm5
        !           216:        jz      L(start_dst_aligned)
        !           217:
        !           218:
        !           219:        C dst isn't aligned, add 4 to make it so, and pretend the shift is
        !           220:        C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
        !           221:        C
        !           222:        C          source          edx
        !           223:        C          --+-------+-------+
        !           224:        C            |      mm1      |
        !           225:        C          --+-------+-------+
        !           226:        C                  4mod8   0mod8
        !           227:        C
        !           228:        C  dest                    edi
        !           229:        C  --+-------+-------+-------+
        !           230:        C                    |  xxx  |
        !           231:        C  --+-------+-------+-------+
        !           232:        C          4mod8   0mod8   4mod8
        !           233:
        !           234:        movq    %mm1, %mm0
        !           235:        psrlq   %mm6, %mm1
        !           236:        addl    $32, %ecx               C shift+32
        !           237:
        !           238:        movd    %mm1, (%edi)
        !           239:        movq    %mm0, %mm1
        !           240:        addl    $4, %edi                C new dst
        !           241:
        !           242:        movd    %ecx, %mm6
        !           243: L(start_dst_aligned):
        !           244:        C Set up the loop counter in ebx and the computed jump into the unrolled code.
        !           245:        C eax*5 is the byte offset from L(entry), at 10 code bytes per limb skipped.
        !           246:        movq    %mm1, %mm2              C copy of src low two limbs
        !           247:        negl    %ecx
        !           248:        andl    $-2, %eax               C round size down to even
        !           249:
        !           250:        movl    %eax, %ebx
        !           251:        negl    %eax
        !           252:        addl    $64, %ecx
        !           253:
        !           254:        andl    $UNROLL_MASK, %eax
        !           255:        decl    %ebx
        !           256:
        !           257:        shll    %eax
        !           258:
        !           259:        movd    %ecx, %mm7              C lshift = 64-rshift
        !           260:
        !           261: ifdef(`PIC',`
        !           262:        call    L(pic_calc)
        !           263: L(here):
        !           264: ',`
        !           265:        leal    L(entry) (%eax,%eax,4), %esi
        !           266:        negl    %eax
        !           267: ')
        !           268:        shrl    $UNROLL_LOG2, %ebx      C loop counter
        !           269:        C eax has been negated, so the leals move src and dst back by the limbs skipped.
        !           270:        leal    ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
        !           271:        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
        !           272:        movl    PARAM_SIZE, %eax        C for use at end
        !           273:
        !           274:        jmp     *%esi
        !           275:
        !           276:
        !           277: ifdef(`PIC',`
        !           278: L(pic_calc):
        !           279:        C See README.family about old gas bugs
        !           280:        leal    (%eax,%eax,4), %esi
        !           281:        addl    $L(entry)-L(here), %esi
        !           282:        addl    (%esp), %esi
        !           283:        negl    %eax
        !           284:
        !           285:        ret
        !           286: ')
        !           287:
        !           288:
        !           289: C -----------------------------------------------------------------------------
        !           290:        ALIGN(64)
        !           291: L(top):
        !           292:        C eax   size, for use at end
        !           293:        C ebx   loop counter
        !           294:        C ecx   lshift
        !           295:        C edx   src
        !           296:        C esi   was computed jump
        !           297:        C edi   dst
        !           298:        C ebp
        !           299:        C
        !           300:        C mm0   scratch
        !           301:        C mm1   \ carry (alternating)
        !           302:        C mm2   /
        !           303:        C mm6   rshift
        !           304:        C mm7   lshift
        !           305:        C
        !           306:        C 10 code bytes/limb
        !           307:        C
        !           308:        C The two chunks differ in whether mm1 or mm2 holds the carry.
        !           309:        C The computed jump puts the initial carry in both mm1 and mm2.
        !           310:        C Each pass of the forloop body below handles CHUNK_COUNT limbs as two qwords.
        !           311: L(entry):
        !           312: deflit(CHUNK_COUNT, 4)
        !           313: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           314:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        !           315:        deflit(`disp1', eval(disp0 + 8))
        !           316:
        !           317:        movq    disp0(%edx), %mm0
        !           318:        psrlq   %mm6, %mm2
        !           319:
        !           320:        movq    %mm0, %mm1
        !           321:        psllq   %mm7, %mm0
        !           322:
        !           323:        por     %mm2, %mm0
        !           324:        movq    %mm0, disp0(%edi)
        !           325:
        !           326:
        !           327:        movq    disp1(%edx), %mm0
        !           328:        psrlq   %mm6, %mm1
        !           329:
        !           330:        movq    %mm0, %mm2
        !           331:        psllq   %mm7, %mm0
        !           332:
        !           333:        por     %mm1, %mm0
        !           334:        movq    %mm0, disp1(%edi)
        !           335: ')
        !           336:        C Step src and dst to the next block, the loop runs until ebx goes negative.
        !           337:        addl    $UNROLL_BYTES, %edx
        !           338:        addl    $UNROLL_BYTES, %edi
        !           339:        decl    %ebx
        !           340:
        !           341:        jns     L(top)
        !           342:
        !           343:
        !           344: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
        !           345: deflit(`disp1', eval(disp0-0 + 8))
        !           346:        C eax is the size reloaded from PARAM_SIZE, bit 0 selects the odd or even finish.
        !           347:        testb   $1, %al
        !           348:        psrlq   %mm6, %mm2      C wanted rshifted in all cases below
        !           349:        movl    SAVE_ESI, %esi
        !           350:        C The movd and movl here leave the flags from testb intact for the jz below.
        !           351:        movd    %mm5, %eax              C return value
        !           352:
        !           353:        movl    SAVE_EBX, %ebx
        !           354:        jz      L(end_even)
        !           355:
        !           356:
        !           357:        C Size odd, destination was aligned.
        !           358:        C
        !           359:        C source
        !           360:        C       edx
        !           361:        C +-------+---------------+--
        !           362:        C |       |      mm2      |
        !           363:        C +-------+---------------+--
        !           364:        C
        !           365:        C dest                  edi
        !           366:        C +-------+---------------+---------------+--
        !           367:        C |       |               |    written    |
        !           368:        C +-------+---------------+---------------+--
        !           369:        C
        !           370:        C mm6 = shift
        !           371:        C mm7 = ecx = 64-shift
        !           372:
        !           373:
        !           374:        C Size odd, destination was unaligned.
        !           375:        C
        !           376:        C source
        !           377:        C       edx
        !           378:        C +-------+---------------+--
        !           379:        C |       |      mm2      |
        !           380:        C +-------+---------------+--
        !           381:        C
        !           382:        C dest          edi
        !           383:        C +---------------+---------------+--
        !           384:        C |               |    written    |
        !           385:        C +---------------+---------------+--
        !           386:        C
        !           387:        C mm6 = shift+32
        !           388:        C mm7 = ecx = 64-(shift+32)
        !           389:
        !           390:
        !           391:        C In both cases there's one extra limb of src to fetch and combine
        !           392:        C with mm2 to make a qword to store, and in the aligned case there's
        !           393:        C a further extra limb of dst to be formed.
        !           394:        C Bit 5 of ecx is set in the aligned case, where ecx is 64-shift, and clear in the
        !           395:        C unaligned case, where ecx is 64-(shift+32).  This assumes shift is 1 to 31, TODO confirm.
        !           396:        movd    disp0(%edx), %mm0
        !           397:        movq    %mm0, %mm1
        !           398:
        !           399:        psllq   %mm7, %mm0
        !           400:        testb   $32, %cl
        !           401:
        !           402:        por     %mm2, %mm0
        !           403:        psrlq   %mm6, %mm1
        !           404:
        !           405:        movq    %mm0, disp0(%edi)
        !           406:        jz      L(finish_odd_unaligned)
        !           407:
        !           408:        movd    %mm1, disp1(%edi)
        !           409: L(finish_odd_unaligned):
        !           410:
        !           411:        movl    SAVE_EDI, %edi
        !           412:        addl    $SAVE_SIZE, %esp
        !           413:        emms
        !           414:
        !           415:        ret
        !           416:
        !           417:
        !           418: L(end_even):
        !           419:
        !           420:        C Size even, destination was aligned.
        !           421:        C
        !           422:        C source
        !           423:        C +---------------+--
        !           424:        C |      mm2      |
        !           425:        C +---------------+--
        !           426:        C
        !           427:        C dest          edi
        !           428:        C +---------------+---------------+--
        !           429:        C |               |      mm2      |
        !           430:        C +---------------+---------------+--
        !           431:        C
        !           432:        C mm6 = shift
        !           433:        C mm7 = ecx = 64-shift
        !           434:
        !           435:
        !           436:        C Size even, destination was unaligned.
        !           437:        C
        !           438:        C source
        !           439:        C +---------------+--
        !           440:        C |      mm2      |
        !           441:        C +---------------+--
        !           442:        C
        !           443:        C dest  edi
        !           444:        C +-------+---------------+--
        !           445:        C |       |      mm2      |
        !           446:        C +-------+---------------+--
        !           447:        C
        !           448:        C mm6 = shift+32
        !           449:        C mm7 = 64-(shift+32)
        !           450:
        !           451:
        !           452:        C The movd for the unaligned case is the same data as the movq for
        !           453:        C the aligned case, it's just a choice between whether one or two
        !           454:        C limbs should be written.
        !           455:        C Bit 5 of ecx is clear only in the unaligned case, assuming a shift of 1 to 31.
        !           456:        C The movd is always done, in the aligned case the movq then rewrites it with both limbs.
        !           457:        testb   $32, %cl
        !           458:        movd    %mm2, disp0(%edi)
        !           459:
        !           460:        jz      L(end_even_unaligned)
        !           461:
        !           462:        movq    %mm2, disp0(%edi)
        !           463: L(end_even_unaligned):
        !           464:
        !           465:        movl    SAVE_EDI, %edi
        !           466:        addl    $SAVE_SIZE, %esp
        !           467:        emms
        !           468:
        !           469:        ret
        !           470:
        !           471: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>