OpenXM_contrib/gmp/mpn/x86/pentium/rshift.asm - annotate

Return to rshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/rshift.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  Intel Pentium mpn_rshift -- mpn right shift.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
1.1       maekawa     4: dnl  Foundation, Inc.
                      5: dnl
                      6: dnl  This file is part of the GNU MP Library.
                      7: dnl
                      8: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      9: dnl  modify it under the terms of the GNU Lesser General Public License as
                     10: dnl  published by the Free Software Foundation; either version 2.1 of the
                     11: dnl  License, or (at your option) any later version.
                     12: dnl
                     13: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     14: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     15: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     16: dnl  Lesser General Public License for more details.
                     17: dnl
                     18: dnl  You should have received a copy of the GNU Lesser General Public
                     19: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     20: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     21: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     22:
                     23: include(`../config.m4')
                     24:
                     25:
1.1.1.2 ! ohara      26: C         cycles/limb
        !            27: C P5,P54:    6.0
        !            28: C P55:       5.375
        !            29:
        !            30:
1.1       maekawa    31: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     32: C                       unsigned shift);
                     33: C
                     34: C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
                     35: C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
                     36:
                     37: defframe(PARAM_SHIFT,16)    C 4th argument: shift count (offsets look esp-relative at entry, TODO confirm in defframe)
                     38: defframe(PARAM_SIZE, 12)    C 3rd argument: size, counted in limbs
                     39: defframe(PARAM_SRC,  8)     C 2nd argument: source pointer
                     40: defframe(PARAM_DST,  4)     C 1st argument: destination pointer
                     41:
1.1.1.2 ! ohara      42:        TEXT
1.1       maekawa    43:        ALIGN(8)
                     44: PROLOGUE(mpn_rshift)
                     45: C Shift src right by shift bits into dst; eax returns the bits shifted out.
                     46:        pushl   %edi                    C save the registers overwritten below;
                     47:        pushl   %esi                    C all four are popped again before each ret
                     48:        pushl   %ebx
                     49:        pushl   %ebp
                     50: deflit(`FRAME',16)
                     51:
                     52:        movl    PARAM_DST,%edi          C edi = dst
                     53:        movl    PARAM_SRC,%esi          C esi = src
                     54:        movl    PARAM_SIZE,%ebp         C ebp = size in limbs
                     55:        movl    PARAM_SHIFT,%ecx        C ecx = shift count, used through cl
                     56:
                     57: C We can use faster code for shift-by-1 under certain conditions.
                     58:        cmp     $1,%ecx
                     59:        jne     L(normal)
                     60:        leal    4(%edi),%eax            C eax = dst + 1 limb
                     61:        cmpl    %esi,%eax
                     62:        jnc     L(special)              C jump if dst + 1 >= src (unsigned)
                     63:        leal    (%edi,%ebp,4),%eax      C eax = dst + size limbs
                     64:        cmpl    %eax,%esi
                     65:        jnc     L(special)              C jump if src >= dst + size (unsigned)
                     66:
                     67: L(normal):                             C general path, walks upward from the least significant limb
                     68:        movl    (%esi),%edx             C edx = lowest source limb
                     69:        addl    $4,%esi
                     70:        xorl    %eax,%eax
                     71:        shrdl(  %cl, %edx, %eax)        C compute carry limb
                     72:        pushl   %eax                    C push carry limb onto stack
                     73:
                     74:        decl    %ebp                    C size-1 limbs are produced by the two loops
                     75:        pushl   %ebp                    C keep size-1 for the leftover loop
                     76:        shrl    $3,%ebp                 C ebp = number of 8-limb iterations
                     77:        jz      L(end)
                     78:
                     79:        movl    (%edi),%eax             C fetch destination cache line
                     80:
                     81:        ALIGN(4)
                     82: L(oop):        movl    28(%edi),%eax           C fetch destination cache line
                     83:        movl    %edx,%ebx               C ebx = source limb carried in from the previous step
                     84:
                     85:        movl    (%esi),%eax
                     86:        movl    4(%esi),%edx
                     87:        shrdl(  %cl, %eax, %ebx)
                     88:        shrdl(  %cl, %edx, %eax)
                     89:        movl    %ebx,(%edi)
                     90:        movl    %eax,4(%edi)
                     91:
                     92:        movl    8(%esi),%ebx
                     93:        movl    12(%esi),%eax
                     94:        shrdl(  %cl, %ebx, %edx)
                     95:        shrdl(  %cl, %eax, %ebx)
                     96:        movl    %edx,8(%edi)
                     97:        movl    %ebx,12(%edi)
                     98:
                     99:        movl    16(%esi),%edx
                    100:        movl    20(%esi),%ebx
                    101:        shrdl(  %cl, %edx, %eax)
                    102:        shrdl(  %cl, %ebx, %edx)
                    103:        movl    %eax,16(%edi)
                    104:        movl    %edx,20(%edi)
                    105:
                    106:        movl    24(%esi),%eax
                    107:        movl    28(%esi),%edx           C edx ends up holding the limb carried into the next step
                    108:        shrdl(  %cl, %eax, %ebx)
                    109:        shrdl(  %cl, %edx, %eax)
                    110:        movl    %ebx,24(%edi)
                    111:        movl    %eax,28(%edi)
                    112:
                    113:        addl    $32,%esi                C advance both pointers by 8 limbs
                    114:        addl    $32,%edi
                    115:        decl    %ebp
                    116:        jnz     L(oop)
                    117:
                    118: L(end):        popl    %ebp                    C ebp = size-1 saved above
                    119:        andl    $7,%ebp                 C limbs left over after the 8-limb loop
                    120:        jz      L(end2)
                    121: L(oop2):
                    122:        movl    (%esi),%eax
                    123:        shrdl(  %cl,%eax,%edx)          C compute result limb
                    124:        movl    %edx,(%edi)
                    125:        movl    %eax,%edx               C keep the source limb for the next step
                    126:        addl    $4,%esi
                    127:        addl    $4,%edi
                    128:        decl    %ebp
                    129:        jnz     L(oop2)
                    130:
                    131: L(end2):
                    132:        shrl    %cl,%edx                C compute most significant limb
                    133:        movl    %edx,(%edi)             C store it
                    134:
                    135:        popl    %eax                    C pop carry limb, left in eax at ret
                    136:
                    137:        popl    %ebp                    C restore the registers pushed at entry
                    138:        popl    %ebx
                    139:        popl    %esi
                    140:        popl    %edi
                    141:        ret
                    142:
                    143:
                    144: C This shift-by-1 path walks from the most significant end of the arrays.
                    145: C That is only permissible when one of the overlap tests above passes, since
                    146: C the function is documented to work for overlapping source and destination.
                    147:
                    148: L(special):
                    149:        leal    -4(%edi,%ebp,4),%edi    C edi = address of the last destination limb
                    150:        leal    -4(%esi,%ebp,4),%esi    C esi = address of the last source limb
                    151:
                    152:        movl    (%esi),%edx             C edx = most significant source limb
                    153:        subl    $4,%esi
                    154:
                    155:        decl    %ebp                    C size-1 limbs remain to be read
                    156:        pushl   %ebp                    C keep size-1 for the leftover loop
                    157:        shrl    $3,%ebp                 C ebp = number of 8-limb iterations
                    158:
                    159:        shrl    %edx                    C shift top limb by 1, bit 0 goes to carry
                    160:        incl    %ebp                    C inc/dec pair tests ebp for zero and
                    161:        decl    %ebp                    C leaves the carry from shrl untouched
                    162:        jz      L(Lend)
                    163:
                    164:        movl    (%edi),%eax             C fetch destination cache line
                    165:
                    166:        ALIGN(4)
                    167: L(Loop):
                    168:        movl    -28(%edi),%eax          C fetch destination cache line
                    169:        movl    %edx,%ebx               C ebx = limb finished in the previous step
                    170:
                    171:        movl    (%esi),%eax
                    172:        movl    -4(%esi),%edx
                    173:        rcrl    %eax                    C rotate through carry: carry in at bit 31, bit 0 out
                    174:        movl    %ebx,(%edi)
                    175:        rcrl    %edx
                    176:        movl    %eax,-4(%edi)
                    177:
                    178:        movl    -8(%esi),%ebx
                    179:        movl    -12(%esi),%eax
                    180:        rcrl    %ebx
                    181:        movl    %edx,-8(%edi)
                    182:        rcrl    %eax
                    183:        movl    %ebx,-12(%edi)
                    184:
                    185:        movl    -16(%esi),%edx
                    186:        movl    -20(%esi),%ebx
                    187:        rcrl    %edx
                    188:        movl    %eax,-16(%edi)
                    189:        rcrl    %ebx
                    190:        movl    %edx,-20(%edi)
                    191:
                    192:        movl    -24(%esi),%eax
                    193:        movl    -28(%esi),%edx
                    194:        rcrl    %eax
                    195:        movl    %ebx,-24(%edi)
                    196:        rcrl    %edx                    C edx is stored by the next step, not this one
                    197:        movl    %eax,-28(%edi)
                    198:
                    199:        leal    -32(%esi),%esi          C use leal not to clobber carry
                    200:        leal    -32(%edi),%edi
                    201:        decl    %ebp                    C decl sets ZF for jnz and leaves carry alone
                    202:        jnz     L(Loop)
                    203:
                    204: L(Lend):
                    205:        popl    %ebp                    C ebp = size-1 saved above
                    206:        sbbl    %eax,%eax               C save carry in %eax
                    207:        andl    $7,%ebp                 C limbs left over after the 8-limb loop
                    208:        jz      L(Lend2)
                    209:        addl    %eax,%eax               C restore carry from eax
                    210: L(Loop2):
                    211:        movl    %edx,%ebx               C ebx = limb finished in the previous step
                    212:        movl    (%esi),%edx
                    213:        rcrl    %edx
                    214:        movl    %ebx,(%edi)
                    215:
                    216:        leal    -4(%esi),%esi           C use leal not to clobber carry
                    217:        leal    -4(%edi),%edi
                    218:        decl    %ebp
                    219:        jnz     L(Loop2)
                    220:
                    221:        jmp     L(L1)
                    222: L(Lend2):
                    223:        addl    %eax,%eax               C restore carry from eax
                    224: L(L1): movl    %edx,(%edi)             C store last limb
                    225:
                    226:        movl    $0,%eax                 C clear eax with movl so the carry survives
                    227:        rcrl    %eax                    C return value: last bit shifted out, in bit 31
                    228:
                    229:        popl    %ebp                    C restore the registers pushed at entry
                    230:        popl    %ebx
                    231:        popl    %esi
                    232:        popl    %edi
                    233:        ret
                    234:
                    235: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>