OpenXM_contrib/gmp/mpn/x86/k7/mmx/rshift.asm - annotate

Return to rshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/rshift.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K7 mpn_rshift -- mpn right shift.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C K7: 1.21 cycles/limb (at 16 limbs/loop).
        !            26:
        !            27:
        !            28:
1.1       maekawa    29: dnl  K7: UNROLL_COUNT cycles/limb
                     30: dnl           4           1.51
                     31: dnl           8           1.26
                     32: dnl          16           1.21
                     33: dnl          32           1.2
                     34: dnl  Maximum possible with the current code is 64.
                     35:
                     36: deflit(UNROLL_COUNT, 16)
                     37:
                     38:
                     39: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     40: C                       unsigned shift);
                     41: C
                     42: C Shift src,size right by shift many bits and store the result in dst,size.
                     43: C Zeros are shifted in at the left.  The bits shifted out at the right are
                     44: C the return value.
                     45: C
                     46: C This code uses 64-bit MMX operations, which makes it possible to handle
                     47: C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
                     48: C code, on the other hand, suffers from shrd being a vector path decode and
                     49: C running at 3 cycles back-to-back.
                     50: C
                     51: C Full speed depends on source and destination being aligned, and some hairy
                     52: C setups and finish-ups are done to arrange this for the loop.
                     53:
                     54: ifdef(`PIC',`
                     55: deflit(UNROLL_THRESHOLD, 10)
                     56: ',`
                     57: deflit(UNROLL_THRESHOLD, 10)
                     58: ')
                     59:
                     60: defframe(PARAM_SHIFT,16)
                     61: defframe(PARAM_SIZE, 12)
                     62: defframe(PARAM_SRC,  8)
                     63: defframe(PARAM_DST,  4)
                     64:
                     65: defframe(SAVE_EDI, -4)
                     66: defframe(SAVE_ESI, -8)
                     67: defframe(SAVE_EBX, -12)
                     68: deflit(SAVE_SIZE, 12)
                     69:
1.1.1.2 ! ohara      70:        TEXT
1.1       maekawa    71:        ALIGN(32)
                     72:
                     73: PROLOGUE(mpn_rshift)
                     74: deflit(`FRAME',0)
                     75:
                     76:        movl    PARAM_SIZE, %eax
                     77:        movl    PARAM_SRC, %edx
                     78:        subl    $SAVE_SIZE, %esp        C room for three saved registers
                     79: deflit(`FRAME',SAVE_SIZE)
                     80:
                     81:        movl    PARAM_SHIFT, %ecx
                     82:        movl    %edi, SAVE_EDI
                     83:
                     84:        movl    PARAM_DST, %edi
                     85:        decl    %eax                    C size-1
                     86:        jnz     L(more_than_one_limb)   C taken unless size is 1
                     87:
                     88:        movl    (%edx), %edx            C src limb
                     89:
                     90:        shrdl(  %cl, %edx, %eax)        C eax was decremented to zero, now gets the bits shifted out
                     91:
                     92:        shrl    %cl, %edx               C src limb >> shift, zeros in at the top
                     93:
                     94:        movl    %edx, (%edi)            C dst limb
                     95:        movl    SAVE_EDI, %edi
                     96:        addl    $SAVE_SIZE, %esp
                     97:
                     98:        ret                             C eax is the return value
                     99:
                    100:
                    101: C -----------------------------------------------------------------------------
                    102: L(more_than_one_limb):
                    103:        C eax   size-1
                    104:        C ebx
                    105:        C ecx   shift
                    106:        C edx   src
                    107:        C esi
                    108:        C edi   dst
                    109:        C ebp
                    110:
                    111:        movd    PARAM_SHIFT, %mm6       C rshift
                    112:        movd    (%edx), %mm5            C src low limb
                    113:        cmp     $UNROLL_THRESHOLD-1, %eax
                    114:
                    115:        jae     L(unroll)               C unsigned compare of size-1 with threshold-1
                    116:        leal    (%edx,%eax,4), %edx     C &src[size-1]
                    117:        leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
                    118:
                    119:        movd    (%edx), %mm4            C src high limb
                    120:        negl    %eax                    C -(size-1), counted up to zero
                    121:
                    122:
                    123: L(simple_top):
                    124:        C eax   loop counter, limbs, negative
                    125:        C ebx
                    126:        C ecx   shift
                    127:        C edx   &src[size-1]
                    128:        C esi
                    129:        C edi   &dst[size-2]
                    130:        C ebp
                    131:        C
                    132:        C mm0   scratch
                    133:        C mm4   src high limb
                    134:        C mm5   src low limb
                    135:        C mm6   shift
                    136:
                    137:        movq    (%edx,%eax,4), %mm0     C two adjacent src limbs
                    138:        incl    %eax
                    139:
                    140:        psrlq   %mm6, %mm0              C low 32 bits are now the dst limb
                    141:
                    142:        movd    %mm0, (%edi,%eax,4)     C eax already stepped, the -4 bias in edi allows for that
                    143:        jnz     L(simple_top)           C on flags from incl, the MMX ops leave them alone
                    144:
                    145:
                    146:        psllq   $32, %mm5               C low limb up to the high half
                    147:        psrlq   %mm6, %mm4              C high limb was zero extended, so zeros come in at the top
                    148:
                    149:        psrlq   %mm6, %mm5              C shifted out bits now in the low half
                    150:        movd    %mm4, 4(%edi)           C dst high limb
                    151:
                    152:        movd    %mm5, %eax              C return value
                    153:
                    154:        movl    SAVE_EDI, %edi
                    155:        addl    $SAVE_SIZE, %esp
                    156:        emms                            C clear MMX state before returning
                    157:
                    158:        ret
                    159:
                    160:
                    161: C -----------------------------------------------------------------------------
                    162:        ALIGN(16)
                    163: L(unroll):
                    164:        C eax   size-1
                    165:        C ebx
                    166:        C ecx   shift
                    167:        C edx   src
                    168:        C esi
                    169:        C edi   dst
                    170:        C ebp
                    171:        C
                    172:        C mm5   src low limb
                    173:        C mm6   rshift
                    174:
                    175:        testb   $4, %dl                 C is src 0mod8 or 4mod8
                    176:        movl    %esi, SAVE_ESI
                    177:        movl    %ebx, SAVE_EBX
                    178:
                    179:        psllq   $32, %mm5               C low limb up to the high half, for the return value
                    180:        jz      L(start_src_aligned)    C on testb, the moves and psllq leave flags alone
                    181:
                    182:
                    183:        C src isn't aligned, process low limb separately (marked xxx) and
                    184:        C step src and dst by one limb, making src aligned.
                    185:        C
                    186:        C source                  edx
                    187:        C --+-------+-------+-------+
                    188:        C           |          xxx  |
                    189:        C --+-------+-------+-------+
                    190:        C         4mod8   0mod8   4mod8
                    191:        C
                    192:        C         dest            edi
                    193:        C         --+-------+-------+
                    194:        C           |       |  xxx  |
                    195:        C         --+-------+-------+
                    196:
                    197:        movq    (%edx), %mm0            C src low two limbs
                    198:        addl    $4, %edx                C src is now 0mod8
                    199:        movl    %eax, PARAM_SIZE        C size-1, the new size, reloaded for the odd/even test at the end
                    200:
                    201:        addl    $4, %edi
                    202:        decl    %eax                    C size-2 is new size-1
                    203:
                    204:        psrlq   %mm6, %mm0
                    205:        movl    %edi, PARAM_DST         C new dst
                    206:
                    207:        movd    %mm0, -4(%edi)          C the low dst limb marked xxx above
                    208: L(start_src_aligned):
                    209:
                    210:
                    211:        movq    (%edx), %mm1            C src low two limbs
                    212:        decl    %eax                    C size-2, two last limbs handled at end
                    213:        testl   $4, %edi                C is dst 0mod8 or 4mod8
                    214:
                    215:        psrlq   %mm6, %mm5              C return value now in the low 32 bits, fetched after the loop
                    216:        jz      L(start_dst_aligned)    C on testl, psrlq leaves flags alone
                    217:
                    218:
                    219:        C dst isn't aligned, add 4 to make it so, and pretend the shift is
                    220:        C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
                    221:        C
                    222:        C          source          edx
                    223:        C          --+-------+-------+
                    224:        C            |      mm1      |
                    225:        C          --+-------+-------+
                    226:        C                  4mod8   0mod8
                    227:        C
                    228:        C  dest                    edi
                    229:        C  --+-------+-------+-------+
                    230:        C                    |  xxx  |
                    231:        C  --+-------+-------+-------+
                    232:        C          4mod8   0mod8   4mod8
                    233:
                    234:        movq    %mm1, %mm0              C keep an unshifted copy
                    235:        psrlq   %mm6, %mm1
                    236:        addl    $32, %ecx               C shift+32
                    237:
                    238:        movd    %mm1, (%edi)            C the low dst limb marked xxx above
                    239:        movq    %mm0, %mm1              C back to the unshifted src limbs
                    240:        addl    $4, %edi                C new dst
                    241:
                    242:        movd    %ecx, %mm6              C rshift is shift+32 from here on
                    243: L(start_dst_aligned):
                    244:
                    245:
                    246:        movq    %mm1, %mm2              C copy of src low two limbs
                    247:        negl    %ecx
                    248:        andl    $-2, %eax               C round size down to even
                    249:
                    250:        movl    %eax, %ebx
                    251:        negl    %eax
                    252:        addl    $64, %ecx               C with the negl above, ecx = 64-rshift
                    253:
                    254:        andl    $UNROLL_MASK, %eax
                    255:        decl    %ebx
                    256:
                    257:        shll    %eax                    C limbs*2, times 5 in the leal gives 10 code bytes per limb
                    258:
                    259:        movd    %ecx, %mm7              C lshift = 64-rshift
                    260:
                    261: ifdef(`PIC',`
                    262:        call    L(pic_calc)
                    263: L(here):
                    264: ',`
                    265:        leal    L(entry) (%eax,%eax,4), %esi
                    266:        negl    %eax
                    267: ')
                    268:        shrl    $UNROLL_LOG2, %ebx      C loop counter
                    269:
                    270:        leal    ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
                    271:        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
                    272:        movl    PARAM_SIZE, %eax        C for use at end
                    273:
                    274:        jmp     *%esi                   C into the unrolled code at the computed entry point
                    275:
                    276:
                    277: ifdef(`PIC',`
                    278: L(pic_calc):
1.1.1.2 ! ohara     279:        C See mpn/x86/README about old gas bugs
1.1       maekawa   280:        leal    (%eax,%eax,4), %esi
                    281:        addl    $L(entry)-L(here), %esi
                    282:        addl    (%esp), %esi            C the return address on the stack is the here label
                    283:        negl    %eax
                    284:
                    285:        ret
                    286: ')
                    286: ')
                    287:
                    288:
                    289: C -----------------------------------------------------------------------------
                    290:        ALIGN(64)
                    291: L(top):
                    292:        C eax   size (size-1 if the src low limb was done separately), for use at end
                    293:        C ebx   loop counter
                    294:        C ecx   lshift
                    295:        C edx   src
                    296:        C esi   was computed jump
                    297:        C edi   dst
                    298:        C ebp
                    299:        C
                    300:        C mm0   scratch
                    301:        C mm1   \ carry (alternating)
                    302:        C mm2   /
                    303:        C mm6   rshift
                    304:        C mm7   lshift
                    305:        C
                    306:        C 10 code bytes/limb
                    307:        C
                    308:        C The two chunks differ in whether mm1 or mm2 hold the carry.
                    309:        C The computed jump puts the initial carry in both mm1 and mm2.
                    310:
                    311: L(entry):
                    312: deflit(CHUNK_COUNT, 4)
                    313: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
                    314:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
                    315:        deflit(`disp1', eval(disp0 + 8))
                    316:
1.1.1.2 ! ohara     317: Zdisp( movq,   disp0,(%edx), %mm0)
1.1       maekawa   318:        psrlq   %mm6, %mm2
                    319:
                    320:        movq    %mm0, %mm1
                    321:        psllq   %mm7, %mm0
                    322:
                    323:        por     %mm2, %mm0
1.1.1.2 ! ohara     324: Zdisp( movq,   %mm0, disp0,(%edi))
1.1       maekawa   325:
                    326:
1.1.1.2 ! ohara     327: Zdisp( movq,   disp1,(%edx), %mm0)
1.1       maekawa   328:        psrlq   %mm6, %mm1
                    329:
                    330:        movq    %mm0, %mm2
                    331:        psllq   %mm7, %mm0
                    332:
                    333:        por     %mm1, %mm0
1.1.1.2 ! ohara     334: Zdisp( movq,   %mm0, disp1,(%edi))
1.1       maekawa   335: ')
                    336:
                    337:        addl    $UNROLL_BYTES, %edx     C step src past one unrolled block
                    338:        addl    $UNROLL_BYTES, %edi     C and dst likewise
                    339:        decl    %ebx
                    340:
                    341:        jns     L(top)                  C until the loop counter goes negative
                    342:
                    343:
                    344: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
                    345: deflit(`disp1', eval(disp0-0 + 8))
                    346:
                    347:        testb   $1, %al                 C is the size held in eax odd or even
                    348:        psrlq   %mm6, %mm2      C wanted rshifted in all cases below
                    349:        movl    SAVE_ESI, %esi
                    350:
                    351:        movd    %mm5, %eax              C return value
                    352:
                    353:        movl    SAVE_EBX, %ebx
                    354:        jz      L(end_even)             C on testb, nothing in between touches the flags
                    355:
                    356:
                    357:        C Size odd, destination was aligned.
                    358:        C
                    359:        C source
                    360:        C       edx
                    361:        C +-------+---------------+--
                    362:        C |       |      mm2      |
                    363:        C +-------+---------------+--
                    364:        C
                    365:        C dest                  edi
                    366:        C +-------+---------------+---------------+--
                    367:        C |       |               |    written    |
                    368:        C +-------+---------------+---------------+--
                    369:        C
                    370:        C mm6 = shift
                    371:        C mm7 = ecx = 64-shift
                    372:
                    373:
                    374:        C Size odd, destination was unaligned.
                    375:        C
                    376:        C source
                    377:        C       edx
                    378:        C +-------+---------------+--
                    379:        C |       |      mm2      |
                    380:        C +-------+---------------+--
                    381:        C
                    382:        C dest          edi
                    383:        C +---------------+---------------+--
                    384:        C |               |    written    |
                    385:        C +---------------+---------------+--
                    386:        C
                    387:        C mm6 = shift+32
                    388:        C mm7 = ecx = 64-(shift+32)
                    389:
                    390:
                    391:        C In both cases there's one extra limb of src to fetch and combine
                    392:        C with mm2 to make a qword to store, and in the aligned case there's
                    393:        C a further extra limb of dst to be formed.
                    394:
                    395:
                    396:        movd    disp0(%edx), %mm0       C the one extra src limb
                    397:        movq    %mm0, %mm1
                    398:
                    399:        psllq   %mm7, %mm0
                    400:        testb   $32, %cl                C ecx is 64-shift if dst was aligned, 32-shift if not; bit 5 tells them apart assuming shift is 1 to 31
                    401:
                    402:        por     %mm2, %mm0
                    403:        psrlq   %mm6, %mm1
                    404:
                    405:        movq    %mm0, disp0(%edi)
                    406:        jz      L(finish_odd_unaligned) C on testb, the MMX ops leave flags alone
                    407:
                    408:        movd    %mm1, disp1(%edi)       C the further dst limb, aligned case only
                    409: L(finish_odd_unaligned):
                    410:
                    411:        movl    SAVE_EDI, %edi
                    412:        addl    $SAVE_SIZE, %esp
                    413:        emms                            C clear MMX state before returning
                    414:
                    415:        ret
                    416:
                    417:
                    418: L(end_even):
                    419:
                    420:        C Size even, destination was aligned.
                    421:        C
                    422:        C source
                    423:        C +---------------+--
                    424:        C |      mm2      |
                    425:        C +---------------+--
                    426:        C
                    427:        C dest          edi
                    428:        C +---------------+---------------+--
                    429:        C |               |      mm3      |
                    430:        C +---------------+---------------+--
                    431:        C
                    432:        C mm6 = shift
                    433:        C mm7 = ecx = 64-shift
                    434:
                    435:
                    436:        C Size even, destination was unaligned.
                    437:        C
                    438:        C source
                    439:        C +---------------+--
                    440:        C |      mm2      |
                    441:        C +---------------+--
                    442:        C
                    443:        C dest  edi
                    444:        C +-------+---------------+--
                    445:        C |       |      mm3      |
                    446:        C +-------+---------------+--
                    447:        C
                    448:        C mm6 = shift+32
                    449:        C mm7 = 64-(shift+32)
                    450:
                    451:
                    452:        C The movd for the unaligned case is the same data as the movq for
                    453:        C the aligned case, it's just a choice between whether one or two
                    454:        C limbs should be written.
                    455:
                    456:
                    457:        testb   $32, %cl                C ecx is 64-shift if dst was aligned, 32-shift if not; bit 5 tells them apart assuming shift is 1 to 31
                    458:        movd    %mm2, disp0(%edi)       C one limb, which is all the unaligned case wants
                    459:
                    460:        jz      L(end_even_unaligned)   C on testb, movd leaves flags alone
                    461:
                    462:        movq    %mm2, disp0(%edi)       C aligned, two limbs, the low one rewritten with the same data
                    463: L(end_even_unaligned):
                    464:
                    465:        movl    SAVE_EDI, %edi
                    466:        addl    $SAVE_SIZE, %esp
                    467:        emms                            C clear MMX state before returning
                    468:
                    469:        ret
                    470:
                    471: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>