OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/lshift.asm - annotate

Return to lshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / k62mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/lshift.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K6-2 mpn_lshift -- mpn left shift.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C K6-2: 1.75 cycles/limb
        !            26:
        !            27:
1.1       maekawa    28: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     29: C                       unsigned shift);
                     30: C
                     31:
                     32: defframe(PARAM_SHIFT,16)
                     33: defframe(PARAM_SIZE, 12)
                     34: defframe(PARAM_SRC,  8)
                     35: defframe(PARAM_DST,  4)
                     36: deflit(`FRAME',0)
                     37:
                     38: dnl  return value slot, reusing PARAM_SRC once src has been fetched
                     39: define(VAR_RETVAL,`PARAM_SRC')
                     40:
                     41: dnl  sizes >= this use the unrolled loop; minimum 9, as it can't handle less
                     42: deflit(UNROLL_THRESHOLD, 9)
                     43:
1.1.1.2 ! ohara      44:        TEXT
1.1       maekawa    45:        ALIGN(32)
                     46:
                     47: PROLOGUE(mpn_lshift)
                     48: deflit(`FRAME',0)
                     49:
                     50:        C The 1 limb case can be done without the push %ebx, but it's then
                     51:        C still the same speed.  The push is left as a free helping hand for
                     52:        C the two_or_more code.
                     53:
                     54:        movl    PARAM_SIZE, %eax
                     55:        pushl   %ebx                    FRAME_pushl()
                     56:
                     57:        movl    PARAM_SRC, %ebx
                     58:        decl    %eax                    C size-1, zero if size==1
                     59:
                     60:        movl    PARAM_SHIFT, %ecx
                     61:        jnz     L(two_or_more)
                     62:
                     63:        movl    (%ebx), %edx            C src limb
                     64:        movl    PARAM_DST, %ebx         C ebx now dst
                     65:
                     66:        shldl(  %cl, %edx, %eax)        C return value (eax is 0 here)
                     67:
                     68:        shll    %cl, %edx               C limb << shift
                     69:
                     70:        movl    %edx, (%ebx)            C dst limb
                     71:        popl    %ebx
                     72:
                     73:        ret
                     74:
                     75:
                     76: C -----------------------------------------------------------------------------
                     77:        ALIGN(16)       C avoid offset 0x1f
                     78: L(two_or_more):
                     79:        C eax   size-1
                     80:        C ebx   src
                     81:        C ecx   shift
                     82:        C edx   scratch
                     83:
                     84:        movl    (%ebx,%eax,4), %edx     C src high limb
                     85:        negl    %ecx                    C -shift
                     86:
                     87:        movd    PARAM_SHIFT, %mm6       C mm6 = shift
                     88:        addl    $32, %ecx               C 32-shift
                     89:
                     90:        shrl    %cl, %edx               C bits shifted out the top
                     91:        cmpl    $UNROLL_THRESHOLD-1, %eax
                     92:
                     93:        movl    %edx, VAR_RETVAL        C kept in the PARAM_SRC slot
                     94:        jae     L(unroll)
                     95:
                     96:
                     97:        movd    %ecx, %mm7              C mm7 = 32-shift
                     98:        movl    %eax, %ecx              C counter = size-1
                     99:
                    100:        movl    PARAM_DST, %eax
                    101:
                    102: L(simple):
                    103:        C eax   dst
                    104:        C ebx   src
                    105:        C ecx   counter, size-1 to 1
                    106:        C edx   retval
                    107:        C
                    108:        C mm0   scratch
                    109:        C mm6   shift
                    110:        C mm7   32-shift
                    111:
                    112:        movq    -4(%ebx,%ecx,4), %mm0   C src[ecx-1], src[ecx]
                    113:
                    114:        psrlq   %mm7, %mm0              C low half becomes dst[ecx]
                    115:
                    116: Zdisp( movd,   %mm0, 0,(%eax,%ecx,4))
                    117:        loop    L(simple)
                    118:
                    119:
                    120:        movd    (%ebx), %mm0            C src[0]
                    121:        popl    %ebx
                    122:
                    123:        psllq   %mm6, %mm0
                    124:
                    125:        movd    %mm0, (%eax)            C dst[0]
                    126:        movl    %edx, %eax              C return value
                    127:
                    128:        femms
                    129:        ret
                    130:
                    131:
                    132: C -----------------------------------------------------------------------------
                    133:        ALIGN(16)
                    134: L(unroll):
                    135:        C eax   size-1
                    136:        C ebx   src
                    137:        C ecx   32-shift
                    138:        C edx   retval (but instead VAR_RETVAL is used)
                    139:        C
                    140:        C mm6   shift
                    141:
                    142:        addl    $32, %ecx               C 64-shift
                    143:        movl    PARAM_DST, %edx
                    144:
                    145:        movd    %ecx, %mm7              C mm7 = 64-shift
                    146:        subl    $7, %eax                        C size-8
                    147:
                    148:        leal    (%edx,%eax,4), %ecx             C alignment of dst
                    149:
                    150:        movq    32-8(%ebx,%eax,4), %mm2         C src high qword
                    151:        testb   $4, %cl
                    152:
                    153:        jz      L(dst_aligned)
                    154:        psllq   %mm6, %mm2
                    155:
                    156:        psrlq   $32, %mm2               C keep just the upper limb
                    157:        decl    %eax
                    158:
                    159:        movd    %mm2, 32(%edx,%eax,4)           C dst high limb
                    160:        movq    32-8(%ebx,%eax,4), %mm2         C new src high qword
                    161: L(dst_aligned):
                    162:
                    163:        movq    32-16(%ebx,%eax,4), %mm0        C src second highest qword
                    164:
                    165:
                    166:        C This loop is the important bit, the rest is just support for it.
                    167:        C Four src limbs are held at the start, and four more will be read.
                    168:        C Four dst limbs will be written.  This schedule seems necessary for
                    169:        C full speed.
                    170:        C
                    171:        C The use of size-8 lets the loop stop when %eax goes negative and
                    172:        C leaves -4 to -1 which can be tested with test $1 and $2.
                    173:
                    174: L(top):
                    175:        C eax   counter, size-8 step by -4 until <0
                    176:        C ebx   src
                    177:        C ecx   (unused)
                    178:        C edx   dst
                    179:        C
                    180:        C mm0   src next qword
                    181:        C mm1   scratch
                    182:        C mm2   src prev qword
                    183:        C mm6   shift
                    184:        C mm7   64-shift
                    185:
                    186:        psllq   %mm6, %mm2
                    187:        subl    $4, %eax
                    188:
                    189:        movq    %mm0, %mm1
                    190:        psrlq   %mm7, %mm0
                    191:
                    192:        por     %mm0, %mm2
                    193:        movq    24(%ebx,%eax,4), %mm0
                    194:
                    195:        psllq   %mm6, %mm1
                    196:        movq    %mm2, 40(%edx,%eax,4)
                    197:
                    198:        movq    %mm0, %mm2
                    199:        psrlq   %mm7, %mm0
                    200:
                    201:        por     %mm0, %mm1
                    202:        movq    16(%ebx,%eax,4), %mm0
                    203:
                    204:        movq    %mm1, 32(%edx,%eax,4)
                    205:        jnc     L(top)                  C until the subl $4 above borrows
                    206:
                    207:
                    208:        C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
                    209:        C
                    210:        C 8(%ebx,%eax,4) is the next source, and 24(%edx,%eax,4) is the next
                    211:        C destination.  %eax is between -4 and -1, representing respectively
                    212:        C 0 to 3 extra limbs that must be read.
                    213:
                    214:
                    215:        testl   $2, %eax        C testl to avoid bad cache line crossing
                    216:        jz      L(finish_nottwo)
                    217:
                    218:        C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
                    219:        C new mm2 and a new mm0 is loaded.
                    220:
                    221:        psllq   %mm6, %mm2
                    222:        movq    %mm0, %mm1
                    223:
                    224:        psrlq   %mm7, %mm0
                    225:        subl    $2, %eax
                    226:
                    227:        por     %mm0, %mm2
                    228:        movq    16(%ebx,%eax,4), %mm0
                    229:
                    230:        movq    %mm2, 32(%edx,%eax,4)
                    231:        movq    %mm1, %mm2
                    232: L(finish_nottwo):
                    233:
                    234:
                    235:        C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
                    236:
                    237:        testb   $1, %al
                    238:        psllq   %mm6, %mm2
                    239:
                    240:        movq    %mm0, %mm1
                    241:        psrlq   %mm7, %mm0
                    242:
                    243:        por     %mm0, %mm2
                    244:        psllq   %mm6, %mm1
                    245:
                    246:        movq    %mm2, 24(%edx,%eax,4)
                    247:        jz      L(finish_even)          C on the testb $1 above
                    248:
                    249:
                    250:        C Odd limb count remaining, so mm1 and one extra limb to process.
                    251:
                    252:        movd    (%ebx), %mm0            C src[0]
                    253:        popl    %ebx
                    254: deflit(`FRAME',0)
                    255:
                    256:        movq    %mm0, %mm2
                    257:        psllq   $32, %mm0
                    258:
                    259:        psrlq   %mm7, %mm0
                    260:
                    261:        psllq   %mm6, %mm2
                    262:        por     %mm0, %mm1
                    263:
                    264:        movq    %mm1, 4(%edx)           C dst[1,2]
                    265:        movd    %mm2, (%edx)            C dst[0]
                    266:
                    267:        movl    VAR_RETVAL, %eax
                    268:
                    269:        femms
                    270:        ret
                    271:
                    272:
                    273:        nop     C avoid bad cache line crossing
                    274: L(finish_even):
                    275: deflit(`FRAME',4)
                    276:        C Even limb count remaining, so only mm1 left to process.
                    277:
                    278:        movq    %mm1, (%edx)            C dst[0,1]
                    279:        movl    VAR_RETVAL, %eax
                    280:
                    281:        popl    %ebx
                    282:        femms
                    283:        ret
                    284:
                    285: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>