OpenXM_contrib/gmp/mpn/x86/pentium/mmx/rshift.asm - annotate

Return to rshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/rshift.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  Intel P5 mpn_rshift -- mpn right shift.
                      2: dnl
                      3: dnl  P5: 1.75 cycles/limb.
                      4:
                      5:
                      6: dnl  Copyright (C) 2000 Free Software Foundation, Inc.
                      7: dnl
                      8: dnl  This file is part of the GNU MP Library.
                      9: dnl
                     10: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     11: dnl  modify it under the terms of the GNU Lesser General Public License as
                     12: dnl  published by the Free Software Foundation; either version 2.1 of the
                     13: dnl  License, or (at your option) any later version.
                     14: dnl
                     15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     18: dnl  Lesser General Public License for more details.
                     19: dnl
                     20: dnl  You should have received a copy of the GNU Lesser General Public
                     21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     23: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     24:
                     25:
                     26: include(`../config.m4')
                     27:
                     28:
                     29: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     30: C                       unsigned shift);
                     31: C
                     32: C Shift src,size right by shift many bits and store the result in dst,size.
                     33: C Zeros are shifted in at the left.  Return the bits shifted out at the
                     34: C right, held in the most significant bits of the returned limb.
                     35: C
                     36: C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
                     37: C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
                     38: C
                     39: C Full speed depends on source and destination being aligned.  Unaligned mmx
                     40: C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
                     41: C setups and finish-ups are done to ensure alignment for the loop.
                     42: C
                     43: C MMX shifts work out a bit faster even for the simple loop.
                     44:
                     45: defframe(PARAM_SHIFT,16)        C 4th argument of the prototype, shift
                     46: defframe(PARAM_SIZE, 12)        C 3rd argument of the prototype, size
                     47: defframe(PARAM_SRC,  8)         C 2nd argument of the prototype, src
                     48: defframe(PARAM_DST,  4)         C 1st argument of the prototype, dst
                     49: deflit(`FRAME',0)
                     50: dnl  Sizes of UNROLL_THRESHOLD or more are sent to L(unroll) at the entry.
                     51: dnl  Minimum 5, because the unrolled code can't handle less (it loads two src qwords after possibly peeling off one limb).
                     52: deflit(UNROLL_THRESHOLD, 5)
                     53:
                     54:        .text
                     55:        ALIGN(8)
                     56:
                     57: PROLOGUE(mpn_rshift)
                     58:
                     59:        pushl   %ebx                    C ebx and edi are popped again before every ret
                     60:        pushl   %edi
                     61: deflit(`FRAME',8)               C two pushes; presumably FRAME counts the pushed bytes
                     62:
                     63:        movl    PARAM_SIZE, %eax
                     64:        movl    PARAM_DST, %edx
                     65:
                     66:        movl    PARAM_SRC, %ebx
                     67:        movl    PARAM_SHIFT, %ecx
                     68:
                     69:        cmp     $UNROLL_THRESHOLD, %eax
                     70:        jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD
                     71:
                     72:        decl    %eax                    C size-1, ZF set when size was 1
                     73:        movl    (%ebx), %edi            C src low limb (movl keeps the decl flags)
                     74:
                     75:        jnz     L(simple)               C size-1 is non-zero
                     76:
                     77:        shrdl(  %cl, %edi, %eax)        C eax was decremented to zero, gets retval
                     78:
                     79:        shrl    %cl, %edi               C src[0] >> shift
                     80:
                     81:        movl    %edi, (%edx)            C dst low limb
                     82:        popl    %edi                    C risk of data cache bank clash
                     83:
                     84:        popl    %ebx
                     85:
                     86:        ret                             C no mmx used on this path, so no emms
                     87:
                     88:
                     89: C -----------------------------------------------------------------------------
                     90:        ALIGN(8)
                     91: L(simple):
                     92:        C eax   size-1, non-zero
                     93:        C ebx   src
                     94:        C ecx   shift
                     95:        C edx   dst
                     96:        C esi
                     97:        C edi
                     98:        C ebp
                     99: deflit(`FRAME',8)
                    100:
                    101:        movd    (%ebx), %mm5            C src[0]
                    102:        leal    (%ebx,%eax,4), %ebx     C &src[size-1]
                    103:
                    104:        movd    %ecx, %mm6              C rshift
                    105:        leal    -4(%edx,%eax,4), %edx   C &dst[size-2]
                    106:
                    107:        psllq   $32, %mm5               C src[0] moved to the high dword
                    108:        negl    %eax                    C -(size-1), counts up to zero
                    109:
                    110:
                    111: C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
                    112: C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
                    113: C cycles and would be 8 in a simple loop.  Using mmx helps the return value
                    114: C and last limb calculations too.
                    115:
                    116: L(simple_top):
                    117:        C eax   counter, limbs, negative
                    118:        C ebx   &src[size-1]
                    119:        C ecx   shift (not used in the loop, mm6 holds a copy)
                    120:        C edx   &dst[size-2]
                    121:        C
                    122:        C mm0   scratch
                    123:        C mm5   return value
                    124:        C mm6   shift
                    125:
                    126:        movq    (%ebx,%eax,4), %mm0     C two adjacent src limbs
                    127:        incl    %eax
                    128:
                    129:        psrlq   %mm6, %mm0
                    130:
                    131:        movd    %mm0, (%edx,%eax,4)     C low dword is the finished dst limb
                    132:        jnz     L(simple_top)           C flags are from the incl
                    133:
                    134:
                    135:        movd    (%ebx), %mm0            C src[size-1], high dword zero
                    136:        psrlq   %mm6, %mm5              C return value
                    137:
                    138:        psrlq   %mm6, %mm0
                    139:        popl    %edi
                    140:
                    141:        movd    %mm5, %eax              C low dword of mm5 is returned
                    142:        popl    %ebx
                    143:
                    144:        movd    %mm0, 4(%edx)           C dst[size-1]
                    145:
                    146:        emms                            C mmx registers were used above
                    147:
                    148:        ret
                    149:
                    150:
                    151: C -----------------------------------------------------------------------------
                    152:        ALIGN(8)
                    153: L(unroll):
                    154:        C eax   size
                    155:        C ebx   src
                    156:        C ecx   shift
                    157:        C edx   dst
                    158:        C esi
                    159:        C edi
                    160:        C ebp
                    161: deflit(`FRAME',8)
                    162:
                    163:        movd    (%ebx), %mm5            C src[0]
                    164:        movl    $4, %edi                C mask for the 4mod8 address bit
                    165:
                    166:        movd    %ecx, %mm6              C rshift
                    167:        testl   %edi, %ebx              C is src at 4mod8?
                    168:
                    169:        psllq   $32, %mm5               C src[0] moved to the high dword
                    170:        jz      L(start_src_aligned)    C flags are from the testl
                    171:
                    172:
                    173:        C src isn't aligned, process low limb separately (marked xxx) and
                    174:        C step src and dst by one limb, making src aligned.
                    175:        C
                    176:        C source                  ebx
                    177:        C --+-------+-------+-------+
                    178:        C           |          xxx  |
                    179:        C --+-------+-------+-------+
                    180:        C         4mod8   0mod8   4mod8
                    181:        C
                    182:        C         dest            edx
                    183:        C         --+-------+-------+
                    184:        C           |       |  xxx  |
                    185:        C         --+-------+-------+
                    186:
                    187:        movq    (%ebx), %mm0            C unaligned load
                    188:
                    189:        psrlq   %mm6, %mm0
                    190:        addl    $4, %ebx
                    191:
                    192:        decl    %eax                    C one limb fewer remaining
                    193:
                    194:        movd    %mm0, (%edx)            C low dword is the finished low limb
                    195:        addl    $4, %edx
                    196: L(start_src_aligned):
                    197:
                    198:
                    199:        movq    (%ebx), %mm1            C first aligned src qword
                    200:        testl   %edi, %edx              C is dst at 4mod8?
                    201:
                    202:        psrlq   %mm6, %mm5              C retval
                    203:        jz      L(start_dst_aligned)    C flags are from the testl
                    204:
                    205:        C dst isn't aligned, add 4 to make it so, and pretend the shift is
                    206:        C 32 bits extra.  Low limb of dst (marked xxx) handled here
                    207:        C separately.
                    208:        C
                    209:        C          source          ebx
                    210:        C          --+-------+-------+
                    211:        C            |      mm1      |
                    212:        C          --+-------+-------+
                    213:        C                  4mod8   0mod8
                    214:        C
                    215:        C  dest                    edx
                    216:        C  --+-------+-------+-------+
                    217:        C                    |  xxx  |
                    218:        C  --+-------+-------+-------+
                    219:        C          4mod8   0mod8   4mod8
                    220:
                    221:        movq    %mm1, %mm0
                    222:        addl    $32, %ecx               C new shift
                    223:
                    224:        psrlq   %mm6, %mm0              C mm6 is still the original shift here
                    225:
                    226:        movd    %ecx, %mm6              C rshift becomes shift+32
                    227:
                    228:        movd    %mm0, (%edx)            C the xxx limb
                    229:        addl    $4, %edx
                    230: L(start_dst_aligned):
                    231:
                    232:
                    233:        movq    8(%ebx), %mm3           C second aligned src qword
                    234:        negl    %ecx
                    235:
                    236:        movq    %mm3, %mm2              C mm2 src qword
                    237:         addl    $64, %ecx              C 64 minus the rshift in mm6
                    238:
                    239:         movd    %ecx, %mm7             C lshift
                    240:        psrlq   %mm6, %mm1
                    241:
                    242:        leal    -12(%ebx,%eax,4), %ebx  C src + 4*size - 12 bytes
                    243:        leal    -20(%edx,%eax,4), %edx  C dst + 4*size - 20 bytes
                    244:
                    245:        psllq   %mm7, %mm3
                    246:        subl    $7, %eax                C size-7
                    247:
                    248:        por     %mm1, %mm3              C mm3 ready to store
                    249:        negl    %eax                    C -(size-7)
                    250:
                    251:        jns     L(finish)               C not negative, size <= 7, skip the loop
                    252:
                    253:
                    254:        C This loop is the important bit, the rest is just support.  Careful
                    255:        C instruction scheduling achieves the claimed 1.75 c/l.  The
                    256:        C relevant parts of the pairing rules are:
                    257:        C
                    258:        C - mmx loads and stores execute only in the U pipe
                    259:        C - only one mmx shift in a pair
                    260:        C - wait one cycle before storing an mmx register result
                    261:        C - the usual address generation interlock
                    262:        C
                    263:        C Two qword calculations are slightly interleaved.  The instructions
                    264:        C marked "C" belong to the second qword, and the "C prev" one is for
                    265:        C the second qword from the previous iteration.
                    266:        C The closing js tests the sign left by the addl $4 on the counter.
                    267:        ALIGN(8)
                    268: L(unroll_loop):
                    269:        C eax   counter, limbs, negative
                    270:        C ebx   src + 4*size - 12 bytes (src, size as adjusted for alignment)
                    271:        C ecx
                    272:        C edx   dst + 4*size - 20 bytes (dst, size as adjusted for alignment)
                    273:        C esi
                    274:        C edi
                    275:        C
                    276:        C mm0   scratch
                    277:        C mm1   scratch
                    278:        C mm2   src qword from -8(%ebx,%eax,4)
                    279:        C mm3   dst qword ready to store to -8(%edx,%eax,4)
                    280:        C
                    281:        C mm5   return value
                    282:        C mm6   rshift
                    283:        C mm7   lshift
                    284:
                    285:        movq    (%ebx,%eax,4), %mm0
                    286:        psrlq   %mm6, %mm2
                    287:
                    288:        movq    %mm0, %mm1
                    289:        psllq   %mm7, %mm0
                    290:
                    291:        movq    %mm3, -8(%edx,%eax,4)   C prev
                    292:        por     %mm2, %mm0
                    293:
                    294:        movq    8(%ebx,%eax,4), %mm3    C
                    295:        psrlq   %mm6, %mm1              C
                    296:
                    297:        movq    %mm0, (%edx,%eax,4)
                    298:        movq    %mm3, %mm2              C
                    299:
                    300:        psllq   %mm7, %mm3              C
                    301:        addl    $4, %eax
                    302:
                    303:        por     %mm1, %mm3              C
                    304:        js      L(unroll_loop)
                    305:
                    306:
                    307: L(finish):
                    308:        C eax   0 to 3 representing respectively 3 to 0 limbs remaining
                    309:
                    310:        testb   $2, %al
                    311:
                    312:        jnz     L(finish_no_two)        C eax is 2 or 3, under two limbs remain
                    313:
                    314:        movq    (%ebx,%eax,4), %mm0     C one more whole src qword
                    315:        psrlq   %mm6, %mm2
                    316:
                    317:        movq    %mm0, %mm1
                    318:        psllq   %mm7, %mm0
                    319:
                    320:        movq    %mm3, -8(%edx,%eax,4)   C prev
                    321:        por     %mm2, %mm0
                    322:
                    323:        movq    %mm1, %mm2              C this src qword becomes the prev one
                    324:        movq    %mm0, %mm3              C this dst qword is stored later
                    325:
                    326:        addl    $2, %eax
                    327: L(finish_no_two):
                    328:
                    329:
                    330:        C eax   2 or 3 representing respectively 1 or 0 limbs remaining
                    331:        C
                    332:        C mm2   src prev qword, from -8(%ebx,%eax,4)
                    333:        C mm3   dst qword, for -8(%edx,%eax,4)
                    334:
                    335:        testb   $1, %al
                    336:        popl    %edi                    C popl leaves the testb flags alone
                    337:
                    338:        movd    %mm5, %eax      C retval
                    339:        jnz     L(finish_zero)          C flags from the testb, eax was 3
                    340:
                    341:
                    342:        C One extra limb, destination was aligned.
                    343:        C
                    344:        C source                ebx
                    345:        C +-------+---------------+--
                    346:        C |       |      mm2      |
                    347:        C +-------+---------------+--
                    348:        C
                    349:        C dest                                  edx
                    350:        C +-------+---------------+---------------+--
                    351:        C |       |               |      mm3      |
                    352:        C +-------+---------------+---------------+--
                    353:        C
                    354:        C mm6 = shift
                    355:        C mm7 = ecx = 64-shift
                    356:
                    357:
                    358:        C One extra limb, destination was unaligned.
                    359:        C
                    360:        C source                ebx
                    361:        C +-------+---------------+--
                    362:        C |       |      mm2      |
                    363:        C +-------+---------------+--
                    364:        C
                    365:        C dest                          edx
                    366:        C +---------------+---------------+--
                    367:        C |               |      mm3      |
                    368:        C +---------------+---------------+--
                    369:        C
                    370:        C mm6 = shift+32
                    371:        C mm7 = ecx = 64-(shift+32)
                    372:
                    373:
                    374:        C In both cases there's one extra limb of src to fetch and combine
                    375:        C with mm2 to make a qword at 8(%edx), and in the aligned case
                    376:        C there's a further extra limb of dst to be formed.
                    377:
                    378:
                    379:         movd    8(%ebx), %mm0          C the extra src limb, high dword zero
                    380:        psrlq   %mm6, %mm2
                    381:
                    382:         movq    %mm0, %mm1
                    383:         psllq   %mm7, %mm0
                    384:
                    385:        movq    %mm3, (%edx)            C the pending dst qword
                    386:         por     %mm2, %mm0
                    387:
                    388:         psrlq   %mm6, %mm1
                    389:         andl   $32, %ecx               C non-zero only for 64-shift, assuming shift is 1 to 31
                    390:
                    391:        popl    %ebx                    C popl leaves the andl flags alone
                    392:        jz      L(finish_one_unaligned)
                    393:
                    394:         C dst was aligned, must store one extra limb
                    395:        movd    %mm1, 16(%edx)
                    396: L(finish_one_unaligned):
                    397:
                    398:        movq    %mm0, 8(%edx)
                    399:
                    400:        emms                            C mmx registers were used above
                    401:
                    402:         ret
                    403:
                    404:
                    405: L(finish_zero):
                    406:
                    407:        C No extra limbs, destination was aligned.
                    408:        C
                    409:        C source        ebx
                    410:        C +---------------+--
                    411:        C |      mm2      |
                    412:        C +---------------+--
                    413:        C
                    414:        C dest                        edx+4
                    415:        C +---------------+---------------+--
                    416:        C |               |      mm3      |
                    417:        C +---------------+---------------+--
                    418:        C
                    419:        C mm6 = shift
                    420:        C mm7 = ecx = 64-shift
                    421:
                    422:
                    423:        C No extra limbs, destination was unaligned.
                    424:        C
                    425:        C source        ebx
                    426:        C +---------------+--
                    427:        C |      mm2      |
                    428:        C +---------------+--
                    429:        C
                    430:        C dest                edx+4
                    431:        C +-------+---------------+--
                    432:        C |       |      mm3      |
                    433:        C +-------+---------------+--
                    434:        C
                    435:        C mm6 = shift+32
                    436:        C mm7 = 64-(shift+32)
                    437:
                    438:
                    439:        C The movd for the unaligned case is clearly the same data as the
                    440:        C movq for the aligned case, it's just a choice between whether one
                    441:        C or two limbs should be written.
                    442:
                    443:
                    444:        movq    %mm3, 4(%edx)           C the pending dst qword
                    445:        psrlq   %mm6, %mm2
                    446:
                    447:        movd    %mm2, 12(%edx)          C one limb, written in both cases
                    448:        andl    $32, %ecx               C non-zero only for 64-shift, assuming shift is 1 to 31
                    449:
                    450:        popl    %ebx                    C popl leaves the andl flags alone
                    451:        jz      L(finish_zero_unaligned)
                    452:
                    453:        movq    %mm2, 12(%edx)          C dst was aligned, write both limbs
                    454: L(finish_zero_unaligned):
                    455:
                    456:        emms                            C mmx registers were used above
                    457:
                    458:        ret
                    459:
                    460: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>