OpenXM_contrib/gmp/mpn/x86/pentium/rshift.asm - annotate

Return to rshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/rshift.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  Intel Pentium mpn_rshift -- mpn right shift.
                      2: dnl
                      3: dnl          cycles/limb
                      4: dnl  P5,P54:    6.0
                      5: dnl  P55:       5.375
                      6:
                      7:
                      8: dnl  Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
                      9: dnl  Foundation, Inc.
                     10: dnl
                     11: dnl  This file is part of the GNU MP Library.
                     12: dnl
                     13: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     14: dnl  modify it under the terms of the GNU Lesser General Public License as
                     15: dnl  published by the Free Software Foundation; either version 2.1 of the
                     16: dnl  License, or (at your option) any later version.
                     17: dnl
                     18: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     19: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     20: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     21: dnl  Lesser General Public License for more details.
                     22: dnl
                     23: dnl  You should have received a copy of the GNU Lesser General Public
                     24: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     25: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     26: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     27:
                     28:
                     29: include(`../config.m4')
                     30:
                     31:
                     32: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     33: C                       unsigned shift);
                     34: C
                     35: C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
                     36: C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
                     37:
                     38: defframe(PARAM_SHIFT,16)
                     39: defframe(PARAM_SIZE, 12)
                     40: defframe(PARAM_SRC,  8)
                     41: defframe(PARAM_DST,  4)
                     42:
                     43:        .text
                     44:        ALIGN(8)
                     45: PROLOGUE(mpn_rshift)
                         C Shift the size-limb operand at src right by shift bits and store the
                         C result at dst.  The bits shifted out of the limb at src[0] are
                         C returned in the high end of %eax.
                         C
                         C Two code paths follow.  L(normal) handles any shift count and walks
                         C the operands from the lowest address upward.  L(special) handles
                         C shift-by-1 with rcrl, walks from the highest address downward, and
                         C is entered only when one of the pointer checks below passes.
                         C
                         C Registers after the parameter loads:
                         C   %edi  dst pointer          %esi  src pointer
                         C   %ebp  limb count           %ecx  shift count (used as %cl)
                         C   %eax, %ebx, %edx  limbs being shifted
                         C
                         C NOTE(review): shrdl() is written as a macro call and its definition
                         C is not in this file; it is assumed to emit the shrd instruction.
                     46:
                         C Save the four registers used below.  FRAME is then set to 16,
                         C matching the four 4-byte pushes.
                     47:        pushl   %edi
                     48:        pushl   %esi
                     49:        pushl   %ebx
                     50:        pushl   %ebp
                     51: deflit(`FRAME',16)
                     52:
                     53:        movl    PARAM_DST,%edi          C %edi = dst
                     54:        movl    PARAM_SRC,%esi          C %esi = src
                     55:        movl    PARAM_SIZE,%ebp         C %ebp = size in limbs
                     56:        movl    PARAM_SHIFT,%ecx        C %ecx = shift count
                     57:
                     58: C We can use faster code for shift-by-1 under certain conditions.
                         C The rcrl path is taken when dst+1 >= src or src >= dst+size (counted
                         C in limbs); otherwise shift-by-1 falls through to the general code.
                     59:        cmp     $1,%ecx
                     60:        jne     L(normal)
                     61:        leal    4(%edi),%eax            C %eax = address of dst[1]
                     62:        cmpl    %esi,%eax
                     63:        jnc     L(special)              C jump if res_ptr + 1 >= s_ptr
                     64:        leal    (%edi,%ebp,4),%eax      C %eax = address of dst[size]
                     65:        cmpl    %eax,%esi
                     66:        jnc     L(special)              C jump if s_ptr >= res_ptr + size
                     67:
                         C General path: any shift count, lowest limb first.
                     68: L(normal):
                     69:        movl    (%esi),%edx             C %edx = src[0]
                     70:        addl    $4,%esi
                     71:        xorl    %eax,%eax
                     72:        shrdl(  %cl, %edx, %eax)        C compute carry limb
                     73:        pushl   %eax                    C push carry limb onto stack
                     74:
                     75:        decl    %ebp                    C size-1 limbs are stored by the loops
                     76:        pushl   %ebp                    C keep size-1 for the tail loop
                     77:        shrl    $3,%ebp                 C number of 8-limb passes
                     78:        jz      L(end)                  C none, go straight to the tail
                     79:
                     80:        movl    (%edi),%eax             C fetch destination cache line
                     81:
                         C Unrolled loop, 8 limbs per pass.  %edx enters each pass holding the
                         C most recently loaded source limb, not yet shifted.  Each shrdl shifts
                         C one limb right and fills it from the next higher source limb.
                     82:        ALIGN(4)
                     83: L(oop):        movl    28(%edi),%eax           C fetch destination cache line
                     84:        movl    %edx,%ebx
                     85:
                     86:        movl    (%esi),%eax
                     87:        movl    4(%esi),%edx
                     88:        shrdl(  %cl, %eax, %ebx)
                     89:        shrdl(  %cl, %edx, %eax)
                     90:        movl    %ebx,(%edi)
                     91:        movl    %eax,4(%edi)
                     92:
                     93:        movl    8(%esi),%ebx
                     94:        movl    12(%esi),%eax
                     95:        shrdl(  %cl, %ebx, %edx)
                     96:        shrdl(  %cl, %eax, %ebx)
                     97:        movl    %edx,8(%edi)
                     98:        movl    %ebx,12(%edi)
                     99:
                    100:        movl    16(%esi),%edx
                    101:        movl    20(%esi),%ebx
                    102:        shrdl(  %cl, %edx, %eax)
                    103:        shrdl(  %cl, %ebx, %edx)
                    104:        movl    %eax,16(%edi)
                    105:        movl    %edx,20(%edi)
                    106:
                    107:        movl    24(%esi),%eax
                    108:        movl    28(%esi),%edx
                    109:        shrdl(  %cl, %eax, %ebx)
                    110:        shrdl(  %cl, %edx, %eax)
                    111:        movl    %ebx,24(%edi)
                    112:        movl    %eax,28(%edi)
                    113:
                    114:        addl    $32,%esi
                    115:        addl    $32,%edi
                    116:        decl    %ebp
                    117:        jnz     L(oop)
                    118:
                         C Tail: the (size-1) mod 8 limbs the unrolled loop did not cover.
                    119: L(end):        popl    %ebp            C %ebp = size-1 saved above
                    120:        andl    $7,%ebp                 C limbs left for the one-limb loop
                    121:        jz      L(end2)
                    122: L(oop2):
                    123:        movl    (%esi),%eax
                    124:        shrdl(  %cl,%eax,%edx)          C compute result limb
                    125:        movl    %edx,(%edi)
                    126:        movl    %eax,%edx               C unshifted limb for the next step
                    127:        addl    $4,%esi
                    128:        addl    $4,%edi
                    129:        decl    %ebp
                    130:        jnz     L(oop2)
                    131:
                    132: L(end2):
                    133:        shrl    %cl,%edx                C compute most significant limb
                    134:        movl    %edx,(%edi)             C store it
                    135:
                    136:        popl    %eax                    C pop carry limb
                    137:
                    138:        popl    %ebp                    C restore saved registers
                    139:        popl    %ebx
                    140:        popl    %esi
                    141:        popl    %edi
                    142:        ret
                    143:
                    144:
                    145: C We loop from the most significant end of the arrays, which is only
                    146: C permissible when one of the pointer checks at the top of the function
                    147: C has passed, since the function is documented to work for overlapping
                         C source and destination.
                         C
                         C The shifted-out bit is carried from limb to limb in the carry flag,
                         C so between the rcrl instructions only instructions that leave the
                         C carry flag alone (movl, leal, incl, decl, popl) are used.
                    148:
                    149: L(special):
                    150:        leal    -4(%edi,%ebp,4),%edi    C %edi = address of dst[size-1]
                    151:        leal    -4(%esi,%ebp,4),%esi    C %esi = address of src[size-1]
                    152:
                    153:        movl    (%esi),%edx             C %edx = most significant limb
                    154:        subl    $4,%esi
                    155:
                    156:        decl    %ebp                    C size-1 limbs are stored by the loops
                    157:        pushl   %ebp                    C keep size-1 for the tail loop
                    158:        shrl    $3,%ebp                 C number of 8-limb passes
                    159:
                    160:        shrl    %edx                    C top limb >> 1, low bit into carry
                    161:        incl    %ebp                    C incl/decl pair tests %ebp for zero
                    162:        decl    %ebp                    C without disturbing the carry flag
                    163:        jz      L(Lend)
                    164:
                    165:        movl    (%edi),%eax             C fetch destination cache line
                    166:
                         C Unrolled loop, 8 limbs per pass.  %edx enters each pass holding an
                         C already shifted limb that has not been stored yet.
                    167:        ALIGN(4)
                    168: L(Loop):
                    169:        movl    -28(%edi),%eax          C fetch destination cache line
                    170:        movl    %edx,%ebx
                    171:
                    172:        movl    (%esi),%eax
                    173:        movl    -4(%esi),%edx
                    174:        rcrl    %eax
                    175:        movl    %ebx,(%edi)
                    176:        rcrl    %edx
                    177:        movl    %eax,-4(%edi)
                    178:
                    179:        movl    -8(%esi),%ebx
                    180:        movl    -12(%esi),%eax
                    181:        rcrl    %ebx
                    182:        movl    %edx,-8(%edi)
                    183:        rcrl    %eax
                    184:        movl    %ebx,-12(%edi)
                    185:
                    186:        movl    -16(%esi),%edx
                    187:        movl    -20(%esi),%ebx
                    188:        rcrl    %edx
                    189:        movl    %eax,-16(%edi)
                    190:        rcrl    %ebx
                    191:        movl    %edx,-20(%edi)
                    192:
                    193:        movl    -24(%esi),%eax
                    194:        movl    -28(%esi),%edx
                    195:        rcrl    %eax
                    196:        movl    %ebx,-24(%edi)
                    197:        rcrl    %edx
                    198:        movl    %eax,-28(%edi)
                    199:
                    200:        leal    -32(%esi),%esi          C use leal not to clobber carry
                    201:        leal    -32(%edi),%edi
                    202:        decl    %ebp
                    203:        jnz     L(Loop)
                    204:
                         C Tail: the (size-1) mod 8 limbs the unrolled loop did not cover.  The
                         C andl below overwrites the carry flag, so the carry is parked in %eax
                         C (0 or -1) first and regenerated afterwards by adding %eax to itself.
                    205: L(Lend):
                    206:        popl    %ebp
                    207:        sbbl    %eax,%eax               C save carry in %eax
                    208:        andl    $7,%ebp                 C limbs left for the one-limb loop
                    209:        jz      L(Lend2)
                    210:        addl    %eax,%eax               C restore carry from eax
                    211: L(Loop2):
                    212:        movl    %edx,%ebx               C shifted limb waiting to be stored
                    213:        movl    (%esi),%edx
                    214:        rcrl    %edx
                    215:        movl    %ebx,(%edi)
                    216:
                    217:        leal    -4(%esi),%esi           C use leal not to clobber carry
                    218:        leal    -4(%edi),%edi
                    219:        decl    %ebp
                    220:        jnz     L(Loop2)
                    221:
                    222:        jmp     L(L1)
                    223: L(Lend2):
                    224:        addl    %eax,%eax               C restore carry from eax
                    225: L(L1): movl    %edx,(%edi)             C store last limb
                    226:
                         C Return value: the bit shifted out of the lowest limb, placed in the
                         C top bit of %eax.  movl $0 is used because it leaves the carry intact.
                    227:        movl    $0,%eax
                    228:        rcrl    %eax                    C carry flag into bit 31 of %eax
                    229:
                    230:        popl    %ebp                    C restore saved registers
                    231:        popl    %ebx
                    232:        popl    %esi
                    233:        popl    %edi
                    234:        ret
                    235:
                    236: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>