OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/dive_1.asm - annotate

Return to dive_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium4 / sse2
Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/dive_1.asm, Revision 1.1

1.1     ! ohara       1: dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
        !             2:
        !             3: dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C P4: 19.0 cycles/limb
        !            26:
        !            27:
        !            28: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            29: C                      mp_limb_t divisor);
        !            30: C
        !            31: C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
        !            32: C being on the dependent chain and there being plenty of cycles available,
        !            33: C using an unaligned movq on every second iteration measured about 23 c/l.
        !            34: C
        !            35: C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
        !            36: C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
        !            37: C that might be hidden by out-of-order execution, whereas divl is around 60.
        !            38: C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
        !            39: C faster.
        !            40:
        !            41: defframe(PARAM_DIVISOR,16)
        !            42: defframe(PARAM_SIZE,   12)
        !            43: defframe(PARAM_SRC,    8)
        !            44: defframe(PARAM_DST,    4)
        !            45:
        !            46:        TEXT
        !            47:
        !            48:        ALIGN(16)
        !            49: PROLOGUE(mpn_divexact_1)
        !            50: deflit(`FRAME',0)
        !            51:
        !            52:        movl    PARAM_SIZE, %edx
        !            53:
        !            54:        movl    PARAM_SRC, %eax
        !            55:
        !            56:        movl    PARAM_DIVISOR, %ecx
        !            57:        subl    $1, %edx
        !            58:        jnz     L(two_or_more)
        !            59:
        !            60:        movl    (%eax), %eax
        !            61:        xorl    %edx, %edx
        !            62:
        !            63:        divl    %ecx
        !            64:        movl    PARAM_DST, %ecx
        !            65:
        !            66:        movl    %eax, (%ecx)
        !            67:        ret
        !            68:
        !            69:
        !            70: L(two_or_more):
        !            71:        C eax   src
        !            72:        C ebx
        !            73:        C ecx   divisor
        !            74:        C edx   size-1
        !            75:
        !            76:        movl    %ecx, %eax
        !            77:        bsfl    %ecx, %ecx              C trailing twos
        !            78:
        !            79:        shrl    %cl, %eax               C d = divisor without twos
        !            80:        movd    %eax, %mm6
        !            81:        movd    %ecx, %mm7              C shift
        !            82:
        !            83:        shrl    %eax                    C d/2
        !            84:
        !            85:        andl    $127, %eax              C d/2, 7 bits
        !            86:
        !            87: ifdef(`PIC',`
        !            88:        call    L(movl_eip_ecx)
        !            89:
        !            90:        addl    $_GLOBAL_OFFSET_TABLE_, %ecx
        !            91:
        !            92:        movl    modlimb_invert_table@GOT(%ecx), %ecx
        !            93:        C
        !            94:        movzbl  (%eax,%ecx), %eax                       C inv 8 bits
        !            95: ',`
        !            96: dnl non-PIC
        !            97:        movzbl  modlimb_invert_table(%eax), %eax        C inv 8 bits
        !            98: ')
        !            99:
        !           100:        C
        !           101:
        !           102:        movd    %eax, %mm5              C inv
        !           103:
        !           104:        movd    %eax, %mm0              C inv
        !           105:
        !           106:        pmuludq %mm5, %mm5              C inv*inv
        !           107:
        !           108:        C
        !           109:
        !           110:        pmuludq %mm6, %mm5              C inv*inv*d
        !           111:        paddd   %mm0, %mm0              C 2*inv
        !           112:
        !           113:        C
        !           114:
        !           115:        psubd   %mm5, %mm0              C inv = 2*inv - inv*inv*d
        !           116:        pxor    %mm5, %mm5
        !           117:
        !           118:        paddd   %mm0, %mm5
        !           119:        pmuludq %mm0, %mm0              C inv*inv
        !           120:
        !           121:        pcmpeqd %mm4, %mm4
        !           122:        psrlq   $32, %mm4               C 0x00000000FFFFFFFF
        !           123:
        !           124:        C
        !           125:
        !           126:        pmuludq %mm6, %mm0              C inv*inv*d
        !           127:        paddd   %mm5, %mm5              C 2*inv
        !           128:
        !           129:        movl    PARAM_SRC, %eax
        !           130:        movl    PARAM_DST, %ecx
        !           131:        pxor    %mm1, %mm1              C initial carry limb
        !           132:
        !           133:        C
        !           134:
        !           135:        psubd   %mm0, %mm5              C inv = 2*inv - inv*inv*d
        !           136:
        !           137:        ASSERT(e,`      C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
        !           138:        pushl   %eax    FRAME_pushl()
        !           139:        movq    %mm6, %mm0
        !           140:        pmuludq %mm5, %mm0
        !           141:        movd    %mm0, %eax
        !           142:        cmpl    $1, %eax
        !           143:        popl    %eax    FRAME_popl()')
        !           144:
        !           145:        pxor    %mm0, %mm0              C initial carry bit
        !           146:
        !           147:
        !           148: C The dependent chain here is as follows.
        !           149: C
        !           150: C                                      latency
        !           151: C      psubq    s = (src-cbit) - climb    2
        !           152: C      pmuludq  q = s*inverse             8
        !           153: C      pmuludq  prod = q*divisor          8
        !           154: C      psrlq    climb = high(prod)        2
        !           155: C                                        --
        !           156: C                                        20
        !           157: C
        !           158: C Yet the loop measures 19.0 c/l, so obviously there's something gained
        !           159: C there over a straight reading of the chip documentation.
        !           160:
        !           161: L(top):
        !           162:        C eax   src, incrementing
        !           163:        C ebx
        !           164:        C ecx   dst, incrementing
        !           165:        C edx   counter, size-1 iterations
        !           166:        C
        !           167:        C mm0   carry bit
        !           168:        C mm1   carry limb
        !           169:        C mm4   0x00000000FFFFFFFF
        !           170:        C mm5   inverse
        !           171:        C mm6   divisor
        !           172:        C mm7   shift
        !           173:
        !           174:        movd    (%eax), %mm2
        !           175:        movd    4(%eax), %mm3
        !           176:        addl    $4, %eax
        !           177:        punpckldq %mm3, %mm2
        !           178:
        !           179:        psrlq   %mm7, %mm2
        !           180:        pand    %mm4, %mm2              C src
        !           181:        psubq   %mm0, %mm2              C src - cbit
        !           182:
        !           183:        psubq   %mm1, %mm2              C src - cbit - climb
        !           184:        movq    %mm2, %mm0
        !           185:        psrlq   $63, %mm0               C new cbit
        !           186:
        !           187:        pmuludq %mm5, %mm2              C s*inverse
        !           188:        movd    %mm2, (%ecx)            C q
        !           189:        addl    $4, %ecx
        !           190:
        !           191:        movq    %mm6, %mm1
        !           192:        pmuludq %mm2, %mm1              C q*divisor
        !           193:        psrlq   $32, %mm1               C new climb
        !           194:
        !           195:        subl    $1, %edx
        !           196:        jnz     L(top)
        !           197:
        !           198:
        !           199: L(done):
        !           200:        movd    (%eax), %mm2
        !           201:        psrlq   %mm7, %mm2              C src
        !           202:        psubq   %mm0, %mm2              C src - cbit
        !           203:
        !           204:        psubq   %mm1, %mm2              C src - cbit - climb
        !           205:
        !           206:        pmuludq %mm5, %mm2              C s*inverse
        !           207:        movd    %mm2, (%ecx)            C q
        !           208:
        !           209:        emms
        !           210:        ret
        !           211:
        !           212:
        !           213: ifdef(`PIC',`
        !           214: L(movl_eip_ecx):
        !           215:        movl    (%esp), %ecx
        !           216:        ret
        !           217: ')
        !           218:
        !           219: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>