OpenXM_contrib/gmp/mpn/x86/k6/mmx/dive_1.asm - annotate

Return to dive_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/dive_1.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
                      2:
                      3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
                      4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
                     25: C         divisor
                     26: C       odd   even
                     27: C K6:   10.0  12.0  cycles/limb
                     28: C K6-2: 10.0  11.5
                     29:
                     30:
                     31: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     32: C                      mp_limb_t divisor);
                     33: C
                     34: C A simple divl is used for size==1.  This is about 10 cycles faster for an
                     35: C odd divisor or 20 cycles for an even divisor.
                     36: C
                     37: C The loops are quite sensitive to code alignment; speeds should be
                     38: C rechecked (odd and even divisor, PIC and non-PIC) if contemplating
                     39: C changing anything.
                     40:
                     41: defframe(PARAM_DIVISOR,16)
                     42: defframe(PARAM_SIZE,   12)
                     43: defframe(PARAM_SRC,    8)
                     44: defframe(PARAM_DST,    4)
                     45:
                     46: dnl  re-use parameter space: the dst pointer is copied to edi before the inverse is stored in this slot
                     47: define(VAR_INVERSE,`PARAM_DST')
                     48:
                     49:        TEXT
                     50:
                     51:        ALIGN(32)
                     52: PROLOGUE(mpn_divexact_1)
                     53: deflit(`FRAME',0)
                     54: C Exact division of src[0..size-1] by divisor into dst: divl for size==1, otherwise multiply by an inverse of the divisor mod 2^BITS_PER_MP_LIMB.
                     55:        movl    PARAM_SIZE, %ecx        C ecx = size
                     56:
                     57:        movl    PARAM_SRC, %eax         C eax = src
                     58:        xorl    %edx, %edx              C edx = 0: high dividend word for divl, and twos counter for strip_twos
                     59:
                     60:        cmpl    $1, %ecx
                     61:        jnz     L(two_or_more)          C size != 1 takes the multiply-by-inverse path
                     62:
                     63:        movl    (%eax), %eax            C eax = src[0]
                     64:
                     65:        divl    PARAM_DIVISOR           C eax = edx:eax / divisor, with edx being 0
                     66:
                     67:        movl    PARAM_DST, %ecx
                     68:        movl    %eax, (%ecx)            C dst[0] = quotient
                     69:
                     70:        ret
                     71:
                     72:
                     73: L(two_or_more):
                     74:        movl    PARAM_DIVISOR, %eax     C eax = divisor
                     75:        pushl   %ebx            FRAME_pushl()   C saved, restored before ret
                     76:
                     77:        movl    PARAM_SRC, %ebx         C ebx = src
                     78:        pushl   %ebp            FRAME_pushl()   C saved, restored before ret
                     79:
                     80: L(strip_twos):
                     81:        shrl    %eax                    C low bit of eax goes to carry
                     82:        incl    %edx                    C will get shift+1
                     83:
                     84:        jnc     L(strip_twos)           C incl leaves carry alone, so this tests the bit shrl shifted out
                     85:        pushl   %esi            FRAME_pushl()   C saved, restored before ret
                     86:
                     87:        leal    1(%eax,%eax), %esi      C d without twos
                     88:        andl    $127, %eax              C d/2, 7 bits
                     89:
                     90: ifdef(`PIC',`
                     91:        call    L(movl_eip_ebp)         C ebp = address of the next instruction
                     92:
                     93:        addl    $_GLOBAL_OFFSET_TABLE_, %ebp    C ebp = GOT base (usual i386 ELF PIC sequence)
                     94:        C
                     95:        movl    modlimb_invert_table@GOT(%ebp), %ebp    C ebp = address of modlimb_invert_table
                     96:        C
                     97: Zdisp( movzbl, 0,(%eax,%ebp), %eax)    C inv 8 bits, same table lookup as the non-PIC branch
                     98: ',`
                     99:
                    100: dnl non-PIC
                    101:        movzbl  modlimb_invert_table(%eax), %eax        C inv 8 bits
                    102: ')
                    103:        pushl   %edi            FRAME_pushl()   C saved, restored before ret
                    104:
                    105:        leal    (%eax,%eax), %ebp       C 2*inv
                    106:
                    107:        imull   %eax, %eax              C inv*inv
                    108:
                    109:        movl    PARAM_DST, %edi         C edi = dst, read before the slot is reused as VAR_INVERSE
                    110:
                    111:        imull   %esi, %eax              C inv*inv*d
                    112:
                    113:        subl    %eax, %ebp              C inv = 2*inv - inv*inv*d
                    114:        leal    (%ebp,%ebp), %eax       C 2*inv
                    115:
                    116:        imull   %ebp, %ebp              C inv*inv
                    117:
                    118:        movl    %esi, PARAM_DIVISOR     C d without twos
                    119:        leal    (%ebx,%ecx,4), %ebx     C src end
                    120:
                    121:        imull   %esi, %ebp              C inv*inv*d
                    122:
                    123:        leal    (%edi,%ecx,4), %edi     C dst end
                    124:        negl    %ecx                    C -size
                    125:
                    126:        subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
                    127:        subl    $1, %edx                C shift amount, and clear carry
                    128:
                    129:        ASSERT(e,`      C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
                    130:        pushl   %eax    FRAME_pushl()
                    131:        imull   PARAM_DIVISOR, %eax
                    132:        cmpl    $1, %eax
                    133:        popl    %eax    FRAME_popl()')
                    134:
                    135:        movl    %eax, VAR_INVERSE       C keep the inverse in the reused PARAM_DST slot
                    136:        jnz     L(even)                 C tests the subl $1 result: nonzero shift means even divisor (NOTE(review): assumes ASSERT preserves flags when enabled - confirm)
                    137:
                    138:        movl    (%ebx,%ecx,4), %esi     C src low limb
                    139:        jmp     L(odd_entry)            C edx is 0 here, so odd_entry applies no carry to the first limb
                    140:
                    141:
                    142:        ALIGN(16)
                    143:        nop     C code alignment
                    144: L(odd_top):
                    145:        C eax   scratch
                    146:        C ebx   src end
                    147:        C ecx   counter, limbs, negative
                    148:        C edx   inverse
                    149:        C esi   next limb, adjusted for carry
                    150:        C edi   dst end
                    151:        C ebp   carry bit, 0 or -1
                    152:
                    153:        imull   %edx, %esi              C esi = quotient limb = adjusted src limb * inverse
                    154:
                    155:        movl    PARAM_DIVISOR, %eax     C eax = d without twos
                    156:        movl    %esi, -4(%edi,%ecx,4)   C store quotient limb (ecx was already incremented)
                    157:
                    158:        mull    %esi                    C carry limb in edx
                    159:
                    160:        subl    %ebp, %edx              C apply carry bit
                    161:        movl    (%ebx,%ecx,4), %esi     C next src limb
                    162:
                    163: L(odd_entry):
                    164:        subl    %edx, %esi              C apply carry limb
                    165:        movl    VAR_INVERSE, %edx       C reload inverse, movl leaves the borrow from subl intact
                    166:
                    167:        sbbl    %ebp, %ebp              C 0 or -1
                    168:
                    169:        incl    %ecx
                    170:        jnz     L(odd_top)              C loop until the negative counter reaches zero
                    171:
                    172:
                    173:        imull   %edx, %esi              C last quotient limb
                    174:
                    175:        movl    %esi, -4(%edi,%ecx,4)   C ecx is 0 here, so this is the limb just below dst end
                    176:
                    177:        popl    %edi                    C restore in reverse order of the pushes
                    178:        popl    %esi
                    179:
                    180:        popl    %ebp
                    181:        popl    %ebx
                    182:
                    183:        ret                             C odd path used no MMX registers, so no emms here
                    184:
                    185:
                    186: ifdef(`PIC',`
                    187: L(movl_eip_ebp):
                    188:        movl    (%esp), %ebp            C ebp = return address of the call
                    189:        ret
                    190:
                    191:        ALIGN(8)
                    192:        nop     C code alignment, necessary for claimed speed
                    193:        nop
                    194: ',`
                    195: C non-PIC code alignment already ok at 0x9a
                    196: ')
                    197:
                    198: L(even):
                    199:        C eax   inverse (also stored in VAR_INVERSE)
                    200:        C ebx   src end
                    201:        C ecx   -size
                    202:        C edx   twos
                    203:        C esi   d without twos (also stored in PARAM_DIVISOR)
                    204:        C edi   dst end
                    205:        C ebp   dead, cleared below
                    206:
                    207:        xorl    %ebp, %ebp              C carry bit starts at 0
                    208: Zdisp( movq,   0,(%ebx,%ecx,4), %mm0)  C src[0,1]
                    209:
                    210:        movd    %edx, %mm7              C mm7 = shift count (twos)
                    211:        movl    VAR_INVERSE, %edx       C edx = inverse
                    212:
                    213:        addl    $2, %ecx                C counter = 2 - size, zero flag set when size is 2
                    214:        psrlq   %mm7, %mm0              C shift the 64-bit pair right by twos
                    215:
                    216:        movd    %mm0, %esi              C esi = src[0] shifted, with low bits of src[1] shifted in; flags from addl are untouched
                    217:        jz      L(even_two)             C if only two limbs
                    218:
                    219:
                    220: C Out-of-order execution is good enough to hide the load/rshift/movd
                    221: C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
                    222: C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
                    223: C been found.  Maybe the fact every second movq is unaligned costs the extra
                    224: C 0.5.
                    225:
                    226: L(even_top):
                    227:        C eax   scratch
                    228:        C ebx   src end
                    229:        C ecx   counter, limbs, negative
                    230:        C edx   inverse
                    231:        C esi   next limb, adjusted for carry
                    232:        C edi   dst end
                    233:        C ebp   carry bit, 0 or -1
                    234:        C
                    235:        C mm0   scratch, source limbs
                    236:        C mm7   twos
                    237:
                    238:        imull   %edx, %esi              C esi = quotient limb = adjusted limb * inverse
                    239:
                    240:        movl    %esi, -8(%edi,%ecx,4)   C store quotient limb (counter runs two ahead on this path)
                    241:        movl    PARAM_DIVISOR, %eax     C eax = d without twos
                    242:
                    243:        mull    %esi                    C carry limb in edx
                    244:
                    245:        movq    -4(%ebx,%ecx,4), %mm0   C next src limb together with the limb above it
                    246:        psrlq   %mm7, %mm0              C shift the pair right by twos
                    247:
                    248:        movd    %mm0, %esi              C esi = low 32 bits of the shifted pair
                    249:        subl    %ebp, %edx              C apply carry bit
                    250:
                    251:        subl    %edx, %esi              C apply carry limb
                    252:        movl    VAR_INVERSE, %edx       C reload inverse, movl leaves the borrow from subl intact
                    253:
                    254:        sbbl    %ebp, %ebp              C 0 or -1
                    255:
                    256:        incl    %ecx
                    257:        jnz     L(even_top)             C loop until the negative counter reaches zero
                    258:
                    259:
                    260: L(even_two):
                    261:        movd    -4(%ebx), %mm0          C src high limb
                    262:        psrlq   %mm7, %mm0              C shift the high limb alone, movd zeroed the upper half so zeros come in
                    263:
                    264:        imull   %edx, %esi              C second highest quotient limb
                    265:
                    266:        movl    %esi, -8(%edi)          C store it two limbs below dst end
                    267:        movl    PARAM_DIVISOR, %eax     C eax = d without twos
                    268:
                    269:        mull    %esi                    C carry limb in edx
                    270:
                    271:        movd    %mm0, %esi              C esi = shifted high limb
                    272:        subl    %ebp, %edx              C apply carry bit
                    273:
                    274:        movl    VAR_INVERSE, %eax       C eax = inverse
                    275:        subl    %edx, %esi              C apply carry limb
                    276:
                    277:        imull   %eax, %esi              C highest quotient limb
                    278:
                    279:        movl    %esi, -4(%edi)          C store it just below dst end
                    280:
                    281:        popl    %edi                    C restore in reverse order of the pushes
                    282:        popl    %esi
                    283:
                    284:        popl    %ebp
                    285:        popl    %ebx
                    286:
                    287:        emms_or_femms                   C presumably expands to emms or femms to leave MMX state after the movq/psrlq use - confirm in the macro definitions
                    288:
                    289:        ret
                    290:
                    291: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>