OpenXM_contrib/gmp/mpn/x86/k7/mmx/lshift.asm - annotate

Return to lshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/lshift.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K7 mpn_lshift -- mpn left shift.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C K7: 1.21 cycles/limb (at 16 limbs/loop).
        !            26:
        !            27:
        !            28:
1.1       maekawa    29: dnl  K7: UNROLL_COUNT cycles/limb
                     30: dnl           4           1.51
                     31: dnl           8           1.26
                     32: dnl          16           1.21
                     33: dnl          32           1.2
                     34: dnl  Maximum possible with the current code is 64.
                     35:
                     36: deflit(UNROLL_COUNT, 16)        C limbs per pass of the unrolled loop; 16 gives 1.21 c/l per the table above
                     37:
                     38:
                     39: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     40: C                       unsigned shift);
                     41: C
                     42: C Shift src,size left by shift many bits and store the result in dst,size.
                     43: C Zeros are shifted in at the right.  The bits shifted out at the left are
                     44: C the return value.
                     45: C
                     46: C The comments in mpn_rshift apply here too.
                     47:
                     48: ifdef(`PIC',`
                     49: deflit(UNROLL_THRESHOLD, 10)
                     50: ',`
                     51: deflit(UNROLL_THRESHOLD, 10)
                     52: ')      C PIC and non-PIC currently use the same threshold; sizes >= it take L(unroll)
                     53:
                     54: defframe(PARAM_SHIFT,16)        C shift argument
                     55: defframe(PARAM_SIZE, 12)        C size argument; rewritten in L(unroll) when the src high limb is peeled off
                     56: defframe(PARAM_SRC,  8)         C src argument
                     57: defframe(PARAM_DST,  4)         C dst argument
                     58:
                     59: defframe(SAVE_EDI, -4)          C slot where edi is saved (all paths)
                     60: defframe(SAVE_ESI, -8)          C slot where esi is saved (unrolled path only)
                     61: defframe(SAVE_EBX, -12)         C slot where ebx is saved (unrolled path only)
                     62: deflit(SAVE_SIZE, 12)           C bytes taken off esp in the prologue for the three SAVE_ slots
                     63:
1.1.1.2 ! ohara      64:        TEXT
1.1       maekawa    65:        ALIGN(32)
                     66:
                     67: PROLOGUE(mpn_lshift)
                     68: deflit(`FRAME',0)
                     69:
                     70:        movl    PARAM_SIZE, %eax        C eax = size
                     71:        movl    PARAM_SRC, %edx         C edx = src
                     72:        subl    $SAVE_SIZE, %esp        C make room for the SAVE_ slots
                     73: deflit(`FRAME',SAVE_SIZE)       C NOTE(review): presumably tells the PARAM_/SAVE_ macros that esp moved by SAVE_SIZE - confirm
                     74:
                     75:        movl    PARAM_SHIFT, %ecx       C ecx = shift
                     76:        movl    %edi, SAVE_EDI
                     77:
                     78:        movl    PARAM_DST, %edi         C edi = dst
                     79:        decl    %eax                    C eax = size-1
                     80:        jnz     L(more_than_one_limb)   C fall through only when size is 1
                     81:
                     82:        movl    (%edx), %edx            C edx = src[0], the only limb
                     83:
                     84:        shldl(  %cl, %edx, %eax)        C eax was decremented to zero, so it receives just the bits shifted out of edx (the return value)
                     85:
                     86:        shll    %cl, %edx               C edx = the limb shifted left, low 32 bits kept
                     87:
                     88:        movl    %edx, (%edi)            C dst[0]
                     89:        movl    SAVE_EDI, %edi
                     90:        addl    $SAVE_SIZE, %esp
                     91:
                     92:        ret                             C this path used no MMX registers so unlike the other exits there is no emms
                     93:
                     94:
                     95: C -----------------------------------------------------------------------------
                     96: L(more_than_one_limb):
                     97:        C eax   size-1
                     98:        C ebx
                     99:        C ecx   shift
                    100:        C edx   src
                    101:        C esi
                    102:        C edi   dst
                    103:        C ebp
                    104:
                    105:        movd    PARAM_SHIFT, %mm6       C mm6 = shift
                    106:        movd    (%edx,%eax,4), %mm5     C src high limb
                    107:        cmp     $UNROLL_THRESHOLD-1, %eax       C size-1 against threshold-1
                    108:
                    109:        jae     L(unroll)               C size >= UNROLL_THRESHOLD goes to the unrolled code
                    110:        negl    %ecx
                    111:        movd    (%edx), %mm4            C src low limb
                    112:
                    113:        addl    $32, %ecx               C ecx = 32-shift
                    114:
                    115:        movd    %ecx, %mm7              C mm7 = 32-shift
                    116:
                    117: L(simple_top):
                    118:        C eax   loop counter, limbs
                    119:        C ebx
                    120:        C ecx
                    121:        C edx   src
                    122:        C esi
                    123:        C edi   dst
                    124:        C ebp
                    125:        C
                    126:        C mm0   scratch
                    127:        C mm4   src low limb
                    128:        C mm5   src high limb
                    129:        C mm6   shift
                    130:        C mm7   32-shift
                    131:
                    132:        movq    -4(%edx,%eax,4), %mm0   C qword holding src[eax-1] (low dword) and src[eax] (high dword)
                    133:        decl    %eax
                    134:
                    135:        psrlq   %mm7, %mm0              C low dword = src[i]<<shift | src[i-1]>>(32-shift), i being eax before the decl
                    136:
                    137:        movd    %mm0, 4(%edi,%eax,4)    C store that dword as dst[eax+1]
                    138:        jnz     L(simple_top)           C tests the flags left by the decl above; loop until eax is 0
                    139:
                    140:
                    141:        psllq   %mm6, %mm5              C src high limb << shift, overflow lands in the high dword
                    142:        psllq   %mm6, %mm4              C src low limb << shift
                    143:
                    144:        psrlq   $32, %mm5               C keep only the bits shifted out of the high limb
                    145:        movd    %mm4, (%edi)            C dst low limb
                    146:
                    147:        movd    %mm5, %eax              C return value
                    148:
                    149:        movl    SAVE_EDI, %edi
                    150:        addl    $SAVE_SIZE, %esp
                    151:        emms
                    152:
                    153:        ret
                    154:
                    155:
                    156: C -----------------------------------------------------------------------------
                    157:        ALIGN(16)
                    158: L(unroll):
                    159:        C eax   size-1
                    160:        C ebx   (saved)
                    161:        C ecx   shift
                    162:        C edx   src
                    163:        C esi
                    164:        C edi   dst
                    165:        C ebp
                    166:        C
                    167:        C mm5   src high limb, for return value
                    168:        C mm6   lshift
                    169:
                    170:        movl    %esi, SAVE_ESI
                    171:        movl    %ebx, SAVE_EBX
                    172:        leal    -4(%edx,%eax,4), %edx   C &src[size-2]
                    173:
                    174:        testb   $4, %dl                 C bit 2 of the address: set means not a multiple of 8 (assumes limbs are 4-byte aligned - TODO confirm)
                    175:        movq    (%edx), %mm1            C src high qword
                    176:
                    177:        jz      L(start_src_aligned)    C flags are from the testb; the movq leaves them alone
                    178:
                    179:
                    180:        C src isn't aligned, process high limb (marked xxx) separately to
                    181:        C make it so
                    182:        C
                    183:        C  source    -4(edx,%eax,4)
                    184:        C                  |
                    185:        C  +-------+-------+-------+--
                    186:        C  |  xxx          |
                    187:        C  +-------+-------+-------+--
                    188:        C        0mod8   4mod8   0mod8
                    189:        C
                    190:        C  dest      -4(edi,%eax,4)
                    191:        C                  |
                    192:        C  +-------+-------+--
                    193:        C  |  xxx  |       |
                    194:        C  +-------+-------+--
                    195:
                    196:        psllq   %mm6, %mm1              C high dword becomes the shifted high limb
                    197:        subl    $4, %edx                C step src down one limb
                    198:        movl    %eax, PARAM_SIZE        C size-1, reloaded below for the parity test at L(end)
                    199:
                    200:        psrlq   $32, %mm1               C bring the shifted high limb down to the low dword
                    201:        decl    %eax                    C size-2 is new size-1
                    202:
                    203:        movd    %mm1, 4(%edi,%eax,4)    C dst high limb (xxx)
                    204:        movq    (%edx), %mm1            C new src high qword
                    205: L(start_src_aligned):
                    206:
                    207:
                    208:         leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
                    209:        psllq   %mm6, %mm5              C mm6 is still the plain shift here
                    210:
                    211:        testl   $4, %edi                C same bit 2 alignment test, now on dst
                    212:        psrlq   $32, %mm5               C return value
                    213:
                    214:        jz      L(start_dst_aligned)    C flags are from the testl
                    215:
                    216:
                    217:        C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
                    218:        C shift is 32 bits extra.  High limb of dst (marked xxx) handled
                    219:        C here separately.
                    220:        C
                    221:        C  source       %edx
                    222:        C  +-------+-------+--
                    223:        C  |      mm1      |
                    224:        C  +-------+-------+--
                    225:        C                0mod8   4mod8
                    226:        C
                    227:        C  dest         %edi
                    228:        C  +-------+-------+-------+--
                    229:        C  |  xxx  |
                    230:        C  +-------+-------+-------+--
                    231:        C        0mod8   4mod8   0mod8
                    232:
                    233:        movq    %mm1, %mm0              C keep an unshifted copy of the src high qword
                    234:        psllq   %mm6, %mm1
                    235:        addl    $32, %ecx               C shift+32
                    236:
                    237:        psrlq   $32, %mm1               C shifted high limb now in the low dword
                    238:
                    239:        movd    %mm1, 4(%edi)           C dst high limb (xxx)
                    240:        movq    %mm0, %mm1              C restore the unshifted src high qword
                    241:        subl    $4, %edi                C step dst down 4 bytes as described above
                    242:
                    243:        movd    %ecx, %mm6              C new lshift
                    244: L(start_dst_aligned):
                    245:
                    246:        decl    %eax                    C size-2, two last limbs handled at end
                    247:        movq    %mm1, %mm2              C copy of src high qword
                    248:        negl    %ecx
                    249:
                    250:        andl    $-2, %eax               C round size down to even
                    251:        addl    $64, %ecx               C ecx = 64-lshift
                    252:
                    253:        movl    %eax, %ebx              C ebx = even limb count, becomes the loop counter
                    254:        negl    %eax
                    255:
                    256:        andl    $UNROLL_MASK, %eax      C limbs of unrolled code to skip on entry; NOTE(review): UNROLL_MASK is defined outside this file, presumably UNROLL_COUNT-1 - confirm
                    257:        decl    %ebx
                    258:
                    259:        shll    %eax                    C eax = 2*limbs; times 5 below gives the 10 code bytes per limb
                    260:
                    261:        movd    %ecx, %mm7              C rshift = 64-lshift
                    262:
                    263: ifdef(`PIC',`
                    264:        call    L(pic_calc)
                    265: L(here):
                    266: ',`
                    267:        leal    L(entry) (%eax,%eax,4), %esi
                    268: ')
                    269:        shrl    $UNROLL_LOG2, %ebx      C loop counter
                    270:
                    271:        leal    ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx      C edx += 4 bytes per skipped limb, less 8; plus 128 when UNROLL_BYTES is 256
                    272:        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi        C edi += 4 bytes per skipped limb; plus 128 when UNROLL_BYTES is 256
                    273:        movl    PARAM_SIZE, %eax        C for use at end
                    274:        jmp     *%esi                   C enter the unrolled code at the computed offset from L(entry)
                    275:
                    276:
                    277: ifdef(`PIC',`
                    278: L(pic_calc):
1.1.1.2 ! ohara     279:        C See mpn/x86/README about old gas bugs
1.1       maekawa   280:        leal    (%eax,%eax,4), %esi     C esi = 5*eax
                    281:        addl    $L(entry)-L(here), %esi C plus the distance from the here label to the entry label
                    282:        addl    (%esp), %esi            C plus the return address which is the here label itself
                    283:
                    284:        ret
                    285: ')
                    286:
                    287:
                    288: C -----------------------------------------------------------------------------
                    289:        ALIGN(32)
                    290: L(top):
                    291:        C eax   size (for use at end)
                    292:        C ebx   loop counter
                    293:        C ecx   rshift
                    294:        C edx   src
                    295:        C esi   computed jump
                    296:        C edi   dst
                    297:        C ebp
                    298:        C
                    299:        C mm0   scratch
                    300:        C mm1   \ carry (alternating, mm2 first)
                    301:        C mm2   /
                    302:        C mm6   lshift
                    303:        C mm7   rshift
                    304:        C
                    305:        C 10 code bytes/limb
                    306:        C
                    307:        C The two chunks differ in whether mm1 or mm2 hold the carry.
                    308:        C The computed jump puts the initial carry in both mm1 and mm2.
                    309:
                    310: L(entry):
                    311: deflit(CHUNK_COUNT, 4)          C limbs per forloop iteration: two qword loads and two qword stores
                    312: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
                    313:        deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
                    314:        deflit(`disp1', eval(disp0 - 8))
                    315:
1.1.1.2 ! ohara     316: Zdisp( movq,   disp0,(%edx), %mm0)
1.1       maekawa   317:        psllq   %mm6, %mm2
                    318:
                    319:        movq    %mm0, %mm1
                    320:        psrlq   %mm7, %mm0
                    321:
                    322:        por     %mm2, %mm0
1.1.1.2 ! ohara     323: Zdisp( movq,   %mm0, disp0,(%edi))
1.1       maekawa   324:
                    325:
1.1.1.2 ! ohara     326: Zdisp( movq,   disp1,(%edx), %mm0)
1.1       maekawa   327:        psllq   %mm6, %mm1
                    328:
                    329:        movq    %mm0, %mm2
                    330:        psrlq   %mm7, %mm0
                    331:
                    332:        por     %mm1, %mm0
1.1.1.2 ! ohara     333: Zdisp( movq,   %mm0, disp1,(%edi))
1.1       maekawa   334: ')
                    335:
                    336:        subl    $UNROLL_BYTES, %edx     C move src down by one full unrolled pass
                    337:        subl    $UNROLL_BYTES, %edi     C move dst down by the same amount
                    338:        decl    %ebx
                    339:
                    340:        jns     L(top)                  C keep looping until the decl takes ebx below zero
                    341:
                    342:
                    343:
                    344: define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')      C disp(n): n with the same -128 bias as the loop displacements when UNROLL_BYTES is 256
                    345:
                    346: L(end):
                    347:        testb   $1, %al                 C parity of the size reloaded from PARAM_SIZE
                    348:        movl    SAVE_EBX, %ebx
                    349:        psllq   %mm6, %mm2      C wanted left shifted in all cases below
                    350:
                    351:        movd    %mm5, %eax              C return value
                    352:
                    353:        movl    SAVE_ESI, %esi
                    354:        jz      L(end_even)             C still testing the flags from the testb above
                    355:
                    356:
                    357: L(end_odd):
                    358:
                    359:        C Size odd, destination was aligned.
                    360:        C
                    361:        C                 source        edx+8   edx+4
                    362:        C                 --+---------------+-------+
                    363:        C                   |      mm2      |       |
                    364:        C                 --+---------------+-------+
                    365:        C
                    366:        C dest                            edi
                    367:        C --+---------------+---------------+-------+
                    368:        C   |   written     |               |       |
                    369:        C --+---------------+---------------+-------+
                    370:        C
                    371:        C mm6 = shift
                    372:        C mm7 = ecx = 64-shift
                    373:
                    374:
                    375:        C Size odd, destination was unaligned.
                    376:        C
                    377:        C                 source        edx+8   edx+4
                    378:        C                 --+---------------+-------+
                    379:        C                   |      mm2      |       |
                    380:        C                 --+---------------+-------+
                    381:        C
                    382:        C         dest                            edi
                    383:        C         --+---------------+---------------+
                    384:        C           |   written     |               |
                    385:        C         --+---------------+---------------+
                    386:        C
                    387:        C mm6 = shift+32
                    388:        C mm7 = ecx = 64-(shift+32)
                    389:
                    390:
                    391:        C In both cases there's one extra limb of src to fetch and combine
                    392:        C with mm2 to make a qword at (%edi), and in the aligned case
                    393:        C there's an extra limb of dst to be formed from that extra src limb
                    394:        C left shifted.
                    395:
                    396:        movd    disp(4) (%edx), %mm0    C the one remaining src limb
                    397:        testb   $32, %cl                C bit 5 of ecx: set for 64-shift (aligned), clear for 32-shift (unaligned); assumes 1 <= shift <= 31 - TODO confirm
                    398:
                    399:        movq    %mm0, %mm1              C second copy, used for the extra low dst limb
                    400:        psllq   $32, %mm0               C move the limb into the high dword
                    401:
                    402:        psrlq   %mm7, %mm0              C the part of it that belongs under mm2
                    403:        psllq   %mm6, %mm1              C the limb left shifted
                    404:
                    405:        por     %mm2, %mm0              C combine with the carry qword
                    406:
                    407:        movq    %mm0, disp(0) (%edi)
                    408:        jz      L(end_odd_unaligned)    C flags are still those of the testb $32 above
                    409:        movd    %mm1, disp(-4) (%edi)   C aligned case only: the extra low dst limb
                    410: L(end_odd_unaligned):
                    411:
                    412:        movl    SAVE_EDI, %edi
                    413:        addl    $SAVE_SIZE, %esp
                    414:        emms
                    415:
                    416:        ret
                    417:
                    418:
                    419: L(end_even):
                    420:
                    421:        C Size even, destination was aligned.
                    422:        C
                    423:        C                 source        edx+8
                    424:        C                 --+---------------+
                    425:        C                   |      mm2      |
                    426:        C                 --+---------------+
                    427:        C
                    428:        C dest                            edi
                    429:        C --+---------------+---------------+
                    430:        C   |   written     |               |
                    431:        C --+---------------+---------------+
                    432:        C
                    433:        C mm6 = shift
                    434:        C mm7 = ecx = 64-shift
                    435:
                    436:
                    437:        C Size even, destination was unaligned.
                    438:        C
                    439:        C               source          edx+8
                    440:        C                 --+---------------+
                    441:        C                   |      mm2      |
                    442:        C                 --+---------------+
                    443:        C
                    444:        C         dest                  edi+4
                    445:        C         --+---------------+-------+
                    446:        C           |    written    |       |
                    447:        C         --+---------------+-------+
                    448:        C
                    449:        C mm6 = shift+32
                    450:        C mm7 = ecx = 64-(shift+32)
                    451:
                    452:
                    453:        C The movq for the aligned case overwrites the movd for the
                    454:        C unaligned case.
                    455:
                    456:        movq    %mm2, %mm0              C full shifted qword, stored only in the aligned case
                    457:        psrlq   $32, %mm2               C high dword of it, the single limb needed when unaligned
                    458:
                    459:        testb   $32, %cl                C same bit 5 test as in L(end_odd): set means dst was aligned
                    460:        movd    %mm2, disp(4) (%edi)
                    461:
                    462:        jz      L(end_even_unaligned)   C flags are from the testb; the movd leaves them alone
                    463:        movq    %mm0, disp(0) (%edi)    C aligned case: the qword store covers the dword just written
                    464: L(end_even_unaligned):
                    465:
                    466:        movl    SAVE_EDI, %edi
                    467:        addl    $SAVE_SIZE, %esp
                    468:        emms
                    469:
                    470:        ret
                    471:
                    472: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>