[BACK]Return to diveby3.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium

Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/diveby3.asm, Revision 1.1

1.1     ! maekawa     1: dnl  Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
        !             2: dnl
        !             3: dnl  P5: 15.0 cycles/limb
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            30: C                              mp_limb_t carry);
        !            31:
        !            32: defframe(PARAM_CARRY,16)
        !            33: defframe(PARAM_SIZE, 12)
        !            34: defframe(PARAM_SRC,   8)
        !            35: defframe(PARAM_DST,   4)
        !            36:
        !            37: dnl  multiplicative inverse of 3, modulo 2^32
        !            38: deflit(INVERSE_3,        0xAAAAAAAB)
        !            39:
        !            40: dnl  ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
        !            41: deflit(ONE_THIRD_CEIL,   0x55555556)
        !            42: deflit(TWO_THIRDS_CEIL,  0xAAAAAAAB)
        !            43: deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
        !            44:
        !            45:        .text
        !            46:        ALIGN(8)
        !            47:
        !            48: PROLOGUE(mpn_divexact_by3c)
        !            49: deflit(`FRAME',0)
        !            50:
        !            51:        movl    PARAM_SRC, %ecx
        !            52:        movl    PARAM_SIZE, %edx
        !            53:
        !            54:        decl    %edx
        !            55:        jnz     L(two_or_more)
        !            56:
        !            57:        movl    (%ecx), %edx
        !            58:        movl    PARAM_CARRY, %eax       C risk of cache bank clash here
        !            59:
        !            60:        movl    PARAM_DST, %ecx
        !            61:        subl    %eax, %edx
        !            62:
        !            63:        sbbl    %eax, %eax              C 0 or -1
        !            64:
        !            65:        imull   $INVERSE_3, %edx, %edx
        !            66:
        !            67:        negl    %eax                    C 0 or 1
        !            68:        cmpl    $ONE_THIRD_CEIL, %edx
        !            69:
        !            70:        sbbl    $-1, %eax               C +1 if edx>=ceil(b/3)
        !            71:        cmpl    $TWO_THIRDS_CEIL, %edx
        !            72:
        !            73:        sbbl    $-1, %eax               C +1 if edx>=ceil(b*2/3)
        !            74:        movl    %edx, (%ecx)
        !            75:
        !            76:        ret
        !            77:
        !            78:
        !            79: L(two_or_more):
        !            80:        C eax
        !            81:        C ebx
        !            82:        C ecx   src
        !            83:        C edx   size-1
        !            84:        C esi
        !            85:        C edi
        !            86:        C ebp
        !            87:
        !            88:        pushl   %ebx    FRAME_pushl()
        !            89:        pushl   %esi    FRAME_pushl()
        !            90:
        !            91:        pushl   %edi    FRAME_pushl()
        !            92:        pushl   %ebp    FRAME_pushl()
        !            93:
        !            94:        movl    PARAM_DST, %edi
        !            95:        movl    PARAM_CARRY, %esi
        !            96:
        !            97:        movl    (%ecx), %eax            C src low limb
        !            98:        xorl    %ebx, %ebx
        !            99:
        !           100:        sub     %esi, %eax
        !           101:        movl    $TWO_THIRDS_FLOOR, %esi
        !           102:
        !           103:        leal    (%ecx,%edx,4), %ecx     C &src[size-1]
        !           104:        leal    (%edi,%edx,4), %edi     C &dst[size-1]
        !           105:
        !           106:        adcl    $0, %ebx                C carry, 0 or 1
        !           107:        negl    %edx                    C -(size-1)
        !           108:
        !           109:
        !           110: C The loop needs a source limb ready at the top, which leads to one limb
        !           111: C handled separately at the end, and the special case above for size==1.
        !           112: C There doesn't seem to be any scheduling that would keep the speed but move
        !           113: C the source load and carry subtract up to the top.
        !           114: C
        !           115: C The destination cache line prefetching adds 1 cycle to the loop but is
        !           116: C considered worthwhile.  The slowdown is a factor of 1.07, but will prevent
        !           117: C repeated write-throughs if the destination isn't in L1.  A version using
        !           118: C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
        !           119: C no faster, due to unavoidable branch mispreditions in the inner loop.
        !           120: C
        !           121: C setc is 2 cycles on P54, so an adcl is used instead.  If the movl $0,%ebx
        !           122: C could be avoided then the src limb fetch could pair up and save a cycle.
        !           123: C This would probably mean going to a two limb loop with the carry limb
        !           124: C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
        !           125: C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
        !           126: C
        !           127: C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
        !           128: C "cmpl %edx, $n" with the immediate as the second operand.
        !           129: C
        !           130: C The "4" source displacement is in the loop rather than the setup because
        !           131: C this gets L(top) aligned to 8 bytes at no cost.
        !           132:
        !           133:        ALIGN(8)
        !           134: L(top):
        !           135:        C eax   source limb, carry subtracted
        !           136:        C ebx   carry (0 or 1)
        !           137:        C ecx   &src[size-1]
        !           138:        C edx   counter, limbs, negative
        !           139:        C esi   TWO_THIRDS_FLOOR
        !           140:        C edi   &dst[size-1]
        !           141:        C ebp   scratch (result limb)
        !           142:
        !           143:        imull   $INVERSE_3, %eax, %ebp
        !           144:
        !           145:        cmpl    $ONE_THIRD_CEIL, %ebp
        !           146:        movl    (%edi,%edx,4), %eax     C dst cache line prefetch
        !           147:
        !           148:        sbbl    $-1, %ebx               C +1 if ebp>=ceil(b/3)
        !           149:        cmpl    %ebp, %esi
        !           150:
        !           151:        movl    4(%ecx,%edx,4), %eax    C next src limb
        !           152:
        !           153:        sbbl    %ebx, %eax              C and further -1 if ebp>=ceil(b*2/3)
        !           154:        movl    $0, %ebx
        !           155:
        !           156:        adcl    $0, %ebx                C new carry
        !           157:        movl    %ebp, (%edi,%edx,4)
        !           158:
        !           159:        incl    %edx
        !           160:        jnz     L(top)
        !           161:
        !           162:
        !           163:
        !           164:        imull   $INVERSE_3, %eax, %edx
        !           165:
        !           166:        cmpl    $ONE_THIRD_CEIL, %edx
        !           167:        movl    %edx, (%edi)
        !           168:
        !           169:        sbbl    $-1, %ebx       C +1 if edx>=ceil(b/3)
        !           170:        cmpl    $TWO_THIRDS_CEIL, %edx
        !           171:
        !           172:        sbbl    $-1, %ebx       C +1 if edx>=ceil(b*2/3)
        !           173:        popl    %ebp
        !           174:
        !           175:        movl    %ebx, %eax
        !           176:        popl    %edi
        !           177:
        !           178:        popl    %esi
        !           179:        popl    %ebx
        !           180:
        !           181:        ret
        !           182:
        !           183: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>