[BACK]Return to rshift.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/rshift.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  Intel P5 mpn_rshift -- mpn right shift.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C P5: 1.75 cycles/limb.
        !            26:
        !            27:
1.1       maekawa    28: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     29: C                       unsigned shift);
                     30: C
                     31: C Shift src,size right by shift many bits and store the result in dst,size.
                     32: C Zeros are shifted in at the left.  The return value is the bits shifted
                     33: C out at the right, placed in the high bits of the returned limb.
                     34: C
                     35: C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
                     36: C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
                     37: C
                     38: C Full speed depends on source and destination being aligned.  Unaligned mmx
                     39: C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
                     40: C setups and finish-ups are done to ensure alignment for the loop.
                     41: C
                     42: C MMX shifts work out a bit faster even for the simple loop.
                     43:
                     44: defframe(PARAM_SHIFT,16)
                     45: defframe(PARAM_SIZE, 12)
                     46: defframe(PARAM_SRC,  8)
                     47: defframe(PARAM_DST,  4)
                     48: deflit(`FRAME',0)
                     49:
                     50: dnl  Minimum 5, because the unrolled loop can't handle fewer limbs.
                     51: deflit(UNROLL_THRESHOLD, 5)
                     52:
1.1.1.2 ! ohara      53:        TEXT
1.1       maekawa    54:        ALIGN(8)
                     55:
                     56: PROLOGUE(mpn_rshift)
                     57:        C Dispatch on size: >= UNROLL_THRESHOLD unrolled, 1 inline, else simple.
                     58:        pushl   %ebx                    C ebx and edi are restored before every ret
                     59:        pushl   %edi
                     60: deflit(`FRAME',8)
                     61:
                     62:        movl    PARAM_SIZE, %eax
                     63:        movl    PARAM_DST, %edx
                     64:
                     65:        movl    PARAM_SRC, %ebx
                     66:        movl    PARAM_SHIFT, %ecx
                     67:        C eax = size, ebx = src, ecx = shift, edx = dst
                     68:        cmp     $UNROLL_THRESHOLD, %eax
                     69:        jae     L(unroll)               C unsigned compare
                     70:        C size < UNROLL_THRESHOLD from here on
                     71:        decl    %eax                    C size-1, ZF set when size == 1
                     72:        movl    (%ebx), %edi            C src low limb
                     73:        C movl does not touch the flags, so the jnz tests the decl result.
                     74:        jnz     L(simple)
                     75:        C size == 1: one limb, done with integer shifts, no MMX on this path.
                     76:        shrdl(  %cl, %edi, %eax)        C eax was decremented to zero
                     77:        C eax = bits shifted out of src[0], in its high end: the return value.
                     78:        shrl    %cl, %edi
                     79:
                     80:        movl    %edi, (%edx)            C dst low limb
                     81:        popl    %edi                    C risk of data cache bank clash
                     82:
                     83:        popl    %ebx
                     84:
                     85:        ret
                     86:
                     87:
                     88: C -----------------------------------------------------------------------------
                     89:        ALIGN(8)
                     90: L(simple):
                     91:        C eax   size-1
                     92:        C ebx   src
                     93:        C ecx   shift
                     94:        C edx   dst
                     95:        C esi
                     96:        C edi
                     97:        C ebp
                     98: deflit(`FRAME',8)
                     99:        C Reached when size < UNROLL_THRESHOLD and size != 1.
                    100:        movd    (%ebx), %mm5            C src[0]
                    101:        leal    (%ebx,%eax,4), %ebx     C &src[size-1]
                    102:
                    103:        movd    %ecx, %mm6              C rshift
                    104:        leal    -4(%edx,%eax,4), %edx   C &dst[size-2]
                    105:
                    106:        psllq   $32, %mm5               C src[0] moved to the high dword
                    107:        negl    %eax                    C -(size-1), counts up to zero
                    108:
                    109:
                    110: C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
                    111: C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
                    112: C cycles and would be 8 in a simple loop.  Using mmx helps the return value
                    113: C and last limb calculations too.
                    114:
                    115: L(simple_top):
                    116:        C eax   counter, limbs, negative
                    117:        C ebx   &src[size-1]
                    118:        C ecx   shift (not used in the loop; the return value is in mm5)
                    119:        C edx   &dst[size-2]
                    120:        C
                    121:        C mm0   scratch
                    122:        C mm5   return value
                    123:        C mm6   shift
                    124:
                    125:        movq    (%ebx,%eax,4), %mm0     C two adjacent src limbs as one qword
                    126:        incl    %eax
                    127:
                    128:        psrlq   %mm6, %mm0
                    129:
                    130:        movd    %mm0, (%edx,%eax,4)     C low dword is the finished dst limb
                    131:        jnz     L(simple_top)           C flags still from incl, MMX ops leave them
                    132:
                    133:
                    134:        movd    (%ebx), %mm0            C src[size-1], the top limb
                    135:        psrlq   %mm6, %mm5              C return value
                    136:
                    137:        psrlq   %mm6, %mm0
                    138:        popl    %edi
                    139:
                    140:        movd    %mm5, %eax              C low dword of mm5 is returned
                    141:        popl    %ebx
                    142:
                    143:        movd    %mm0, 4(%edx)           C dst[size-1]
                    144:
                    145:        emms
                    146:
                    147:        ret
                    148:
                    149:
                    150: C -----------------------------------------------------------------------------
                    151:        ALIGN(8)
                    152: L(unroll):
                    153:        C eax   size
                    154:        C ebx   src
                    155:        C ecx   shift
                    156:        C edx   dst
                    157:        C esi
                    158:        C edi
                    159:        C ebp
                    160: deflit(`FRAME',8)
                    161:
                    162:        movd    (%ebx), %mm5            C src[0]
                    163:        movl    $4, %edi                C mask for the 4mod8 alignment tests
                    164:
                    165:        movd    %ecx, %mm6              C rshift
                    166:        testl   %edi, %ebx
                    167:
                    168:        psllq   $32, %mm5
                    169:        jz      L(start_src_aligned)
                    170:
                    171:
                    172:        C src isn't aligned, process low limb separately (marked xxx) and
                    173:        C step src and dst by one limb, making src aligned.
                    174:        C
                    175:        C source                  ebx
                    176:        C --+-------+-------+-------+
                    177:        C           |          xxx  |
                    178:        C --+-------+-------+-------+
                    179:        C         4mod8   0mod8   4mod8
                    180:        C
                    181:        C         dest            edx
                    182:        C         --+-------+-------+
                    183:        C           |       |  xxx  |
                    184:        C         --+-------+-------+
                    185:
                    186:        movq    (%ebx), %mm0            C unaligned load
                    187:
                    188:        psrlq   %mm6, %mm0
                    189:        addl    $4, %ebx
                    190:
                    191:        decl    %eax                    C one limb handled here
                    192:
                    193:        movd    %mm0, (%edx)
                    194:        addl    $4, %edx
                    195: L(start_src_aligned):
                    196:
                    197:
                    198:        movq    (%ebx), %mm1
                    199:        testl   %edi, %edx
                    200:
                    201:        psrlq   %mm6, %mm5              C retval
                    202:        jz      L(start_dst_aligned)
                    203:
                    204:        C dst isn't aligned, add 4 to make it so, and pretend the shift is
                    205:        C 32 bits extra.  Low limb of dst (marked xxx) handled here
                    206:        C separately.
                    207:        C
                    208:        C          source          ebx
                    209:        C          --+-------+-------+
                    210:        C            |      mm1      |
                    211:        C          --+-------+-------+
                    212:        C                  4mod8   0mod8
                    213:        C
                    214:        C  dest                    edx
                    215:        C  --+-------+-------+-------+
                    216:        C                    |  xxx  |
                    217:        C  --+-------+-------+-------+
                    218:        C          4mod8   0mod8   4mod8
                    219:
                    220:        movq    %mm1, %mm0
                    221:        addl    $32, %ecx               C new shift
                    222:
                    223:        psrlq   %mm6, %mm0
                    224:        C The psrlq above still used the original shift; mm6 changes only now.
                    225:        movd    %ecx, %mm6
                    226:
                    227:        movd    %mm0, (%edx)
                    228:        addl    $4, %edx
                    229: L(start_dst_aligned):
                    230:
                    231:
                    232:        movq    8(%ebx), %mm3
                    233:        negl    %ecx
                    234:
                    235:        movq    %mm3, %mm2              C mm2 src qword
                    236:         addl    $64, %ecx              C 64 minus the (possibly adjusted) shift
                    237:
                    238:         movd    %ecx, %mm7             C lshift
                    239:        psrlq   %mm6, %mm1
                    240:
                    241:        leal    -12(%ebx,%eax,4), %ebx
                    242:        leal    -20(%edx,%eax,4), %edx
                    243:
                    244:        psllq   %mm7, %mm3
                    245:        subl    $7, %eax                C size-7
                    246:
                    247:        por     %mm1, %mm3              C mm3 ready to store
                    248:        negl    %eax                    C -(size-7)
                    249:        C Non-negative result means size <= 7: skip the 4-limb loop entirely.
                    250:        jns     L(finish)
                    251:
                    252:
                    253:        C This loop is the important bit, the rest is just support.  Careful
                    254:        C instruction scheduling achieves the claimed 1.75 c/l.  The
                    255:        C relevant parts of the pairing rules are:
                    256:        C
                    257:        C - mmx loads and stores execute only in the U pipe
                    258:        C - only one mmx shift in a pair
                    259:        C - wait one cycle before storing an mmx register result
                    260:        C - the usual address generation interlock
                    261:        C
                    262:        C Two qword calculations are slightly interleaved.  The instructions
                    263:        C marked "C" belong to the second qword, and the "C prev" one is for
                    264:        C the second qword from the previous iteration.
                    265:
                    266:        ALIGN(8)
                    267: L(unroll_loop):
                    268:        C eax   counter, limbs, negative
                    269:        C ebx   src + 4*size - 12 bytes (from the leal before the loop)
                    270:        C ecx
                    271:        C edx   dst + 4*size - 20 bytes (from the leal before the loop)
                    272:        C esi
                    273:        C edi
                    274:        C
                    275:        C mm0
                    276:        C mm1
                    277:        C mm2   src qword from -8(%ebx,%eax,4)
                    278:        C mm3   dst qword ready to store to -8(%edx,%eax,4)
                    279:        C
                    280:        C mm5   return value
                    281:        C mm6   rshift
                    282:        C mm7   lshift
                    283:        C The loop repeats while eax is negative; js tests the addl $4 result.
                    284:        movq    (%ebx,%eax,4), %mm0
                    285:        psrlq   %mm6, %mm2
                    286:
                    287:        movq    %mm0, %mm1
                    288:        psllq   %mm7, %mm0
                    289:
                    290:        movq    %mm3, -8(%edx,%eax,4)   C prev
                    291:        por     %mm2, %mm0
                    292:
                    293:        movq    8(%ebx,%eax,4), %mm3    C
                    294:        psrlq   %mm6, %mm1              C
                    295:
                    296:        movq    %mm0, (%edx,%eax,4)
                    297:        movq    %mm3, %mm2              C
                    298:
                    299:        psllq   %mm7, %mm3              C
                    300:        addl    $4, %eax
                    301:
                    302:        por     %mm1, %mm3              C
                    303:        js      L(unroll_loop)
                    304:
                    305:
                    306: L(finish):
                    307:        C eax   0 to 3 representing respectively 3 to 0 limbs remaining
                    308:        C Bit 1 of eax clear means two or three limbs remain: do one more qword.
                    309:        testb   $2, %al
                    310:
                    311:        jnz     L(finish_no_two)
                    312:
                    313:        movq    (%ebx,%eax,4), %mm0
                    314:        psrlq   %mm6, %mm2
                    315:
                    316:        movq    %mm0, %mm1
                    317:        psllq   %mm7, %mm0
                    318:
                    319:        movq    %mm3, -8(%edx,%eax,4)   C prev
                    320:        por     %mm2, %mm0
                    321:
                    322:        movq    %mm1, %mm2
                    323:        movq    %mm0, %mm3
                    324:
                    325:        addl    $2, %eax
                    326: L(finish_no_two):
                    327:
                    328:
                    329:        C eax   2 or 3 representing respectively 1 or 0 limbs remaining
                    330:        C
                    331:        C mm2   src prev qword, from -8(%ebx,%eax,4)
                    332:        C mm3   dst qword, for -8(%edx,%eax,4)
                    333:
                    334:        testb   $1, %al
                    335:        popl    %edi
                    336:        C popl and movd do not change the flags; the jnz below tests the testb.
                    337:        movd    %mm5, %eax      C retval
                    338:        jnz     L(finish_zero)
                    339:        C eax no longer holds the counter, so the addresses below are fixed offsets.
                    340:
                    341:        C One extra limb, destination was aligned.
                    342:        C
                    343:        C source                ebx
                    344:        C +-------+---------------+--
                    345:        C |       |      mm2      |
                    346:        C +-------+---------------+--
                    347:        C
                    348:        C dest                                  edx
                    349:        C +-------+---------------+---------------+--
                    350:        C |       |               |      mm3      |
                    351:        C +-------+---------------+---------------+--
                    352:        C
                    353:        C mm6 = shift
                    354:        C mm7 = ecx = 64-shift
                    355:
                    356:
                    357:        C One extra limb, destination was unaligned.
                    358:        C
                    359:        C source                ebx
                    360:        C +-------+---------------+--
                    361:        C |       |      mm2      |
                    362:        C +-------+---------------+--
                    363:        C
                    364:        C dest                          edx
                    365:        C +---------------+---------------+--
                    366:        C |               |      mm3      |
                    367:        C +---------------+---------------+--
                    368:        C
                    369:        C mm6 = shift+32
                    370:        C mm7 = ecx = 64-(shift+32)
                    371:
                    372:
                    373:        C In both cases there's one extra limb of src to fetch and combine
                    374:        C with mm2 to make a qword at 8(%edx), and in the aligned case
                    375:        C there's a further extra limb of dst to be formed.
                    376:
                    377:
                    378:         movd    8(%ebx), %mm0
                    379:        psrlq   %mm6, %mm2
                    380:
                    381:         movq    %mm0, %mm1
                    382:         psllq   %mm7, %mm0
                    383:
                    384:        movq    %mm3, (%edx)
                    385:         por     %mm2, %mm0
                    386:
                    387:         psrlq   %mm6, %mm1
                    388:         andl   $32, %ecx
                    389:        C ZF from the andl survives the popl.  Bit 5 of ecx is set only in the
                    390:        popl    %ebx                    C aligned case (assuming 1 <= shift <= 31)
                    391:        jz      L(finish_one_unaligned)
                    392:
                    393:         C dst was aligned, must store one extra limb
                    394:        movd    %mm1, 16(%edx)
                    395: L(finish_one_unaligned):
                    396:
                    397:        movq    %mm0, 8(%edx)
                    398:
                    399:        emms
                    400:
                    401:         ret
                    402:
                    403:
                    404: L(finish_zero):
                    405:
                    406:        C No extra limbs, destination was aligned.
                    407:        C
                    408:        C source        ebx
                    409:        C +---------------+--
                    410:        C |      mm2      |
                    411:        C +---------------+--
                    412:        C
                    413:        C dest                        edx+4
                    414:        C +---------------+---------------+--
                    415:        C |               |      mm3      |
                    416:        C +---------------+---------------+--
                    417:        C
                    418:        C mm6 = shift
                    419:        C mm7 = ecx = 64-shift
                    420:
                    421:
                    422:        C No extra limbs, destination was unaligned.
                    423:        C
                    424:        C source        ebx
                    425:        C +---------------+--
                    426:        C |      mm2      |
                    427:        C +---------------+--
                    428:        C
                    429:        C dest                edx+4
                    430:        C +-------+---------------+--
                    431:        C |       |      mm3      |
                    432:        C +-------+---------------+--
                    433:        C
                    434:        C mm6 = shift+32
                    435:        C mm7 = 64-(shift+32)
                    436:
                    437:
                    438:        C The movd for the unaligned case is clearly the same data as the
                    439:        C movq for the aligned case, it's just a choice between whether one
                    440:        C or two limbs should be written.
                    441:
                    442:
                    443:        movq    %mm3, 4(%edx)
                    444:        psrlq   %mm6, %mm2
                    445:
                    446:        movd    %mm2, 12(%edx)          C one limb, enough if dst was unaligned
                    447:        andl    $32, %ecx
                    448:        C ZF from the andl survives the popl; zero means dst was unaligned.
                    449:        popl    %ebx
                    450:        jz      L(finish_zero_unaligned)
                    451:        C dst was aligned: store the same place again as a full qword (two limbs).
                    452:        movq    %mm2, 12(%edx)
                    453: L(finish_zero_unaligned):
                    454:
                    455:        emms
                    456:
                    457:        ret
                    458:
                    459: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>