[BACK]Return to mod_34lsub1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium4 / sse2

Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm, Revision 1.1

1.1     ! ohara       1: dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
        !             2:
        !             3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C Pentium4: 1.0 cycles/limb
        !            26:
        !            27:
        !            28: C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
        !            29: C
        !            30: C Enhancements:
        !            31: C
        !            32: C There might be a couple of cycles to save by using plain integer code for
        !            33: C more small sizes.  2 limbs measures about 20 cycles, but 3 limbs jumps to
        !            34: C about 46 (inclusive of some function call overheads).
        !            35:
        !            36: defframe(PARAM_SIZE, 8)
        !            37: defframe(PARAM_SRC,  4)
        !            38:
        !            39: dnl  re-use parameter space (SAVE_EBX and SAVE_ESI are not referenced by the routine below)
        !            40: define(SAVE_EBX, `PARAM_SRC')
        !            41: define(SAVE_ESI, `PARAM_SIZE')
        !            42:
        !            43:        TEXT
        !            44:        ALIGN(16)
        !            45: PROLOGUE(mpn_mod_34lsub1)
        !            46: deflit(`FRAME',0)
        !            47: C size 1 and 2 use plain integer code, size >= 3 uses the MMX registers
        !            48:        movl    PARAM_SIZE, %ecx
        !            49:        movl    PARAM_SRC, %edx
        !            50:        movl    (%edx), %eax
        !            51:
        !            52:        subl    $2, %ecx
        !            53:        ja      L(three_or_more)
        !            54:        jne     L(one)
        !            55: C size == 2: add up the pieces of src[0] and src[1] at their offsets mod 24
        !            56:        movl    4(%edx), %edx
        !            57:        movl    %eax, %ecx
        !            58:        shrl    $24, %eax               C src[0] high 8 bits
        !            59:
        !            60:        andl    $0x00FFFFFF, %ecx       C src[0] low 24 bits
        !            61:        addl    %ecx, %eax
        !            62:
        !            63:        movl    %edx, %ecx
        !            64:        shll    $8, %edx
        !            65:
        !            66:        shrl    $16, %ecx               C src[1] high 16 bits (2^48 == 1)
        !            67:        addl    %ecx, %eax
        !            68:
        !            69:        andl    $0x00FFFF00, %edx       C src[1] low 16 bits, at bit 8 (2^32 == 2^8)
        !            70:        addl    %edx, %eax
        !            71:
        !            72: L(one):
        !            73:        ret                             C size == 1 arrives here with eax = src[0]
        !            74:
        !            75:
        !            76: L(three_or_more):
        !            77:        pxor    %mm0, %mm0
        !            78:        pxor    %mm1, %mm1
        !            79:        pxor    %mm2, %mm2
        !            80:
        !            81:        pcmpeqd %mm7, %mm7
        !            82:        psrlq   $32, %mm7       C 0x00000000FFFFFFFF, low 32 bits
        !            83:
        !            84:        pcmpeqd %mm6, %mm6
        !            85:        psrlq   $40, %mm6       C 0x0000000000FFFFFF, low 24 bits
        !            86:
        !            87: L(top):
        !            88:        C eax
        !            89:        C ebx
        !            90:        C ecx   counter, size-2 to 0, -1 or -2
        !            91:        C edx   src, incrementing
        !            92:        C
        !            93:        C mm0   sum 0mod3
        !            94:        C mm1   sum 1mod3
        !            95:        C mm2   sum 2mod3
        !            96:        C mm3
        !            97:        C mm4
        !            98:        C mm5
        !            99:        C mm6   0x0000000000FFFFFF
        !           100:        C mm7   0x00000000FFFFFFFF
        !           101:
        !           102:        movd    (%edx), %mm3
        !           103:        paddq   %mm3, %mm0
        !           104:
        !           105:        movd    4(%edx), %mm3
        !           106:        paddq   %mm3, %mm1
        !           107:
        !           108:        movd    8(%edx), %mm3
        !           109:        paddq   %mm3, %mm2
        !           110:
        !           111:        addl    $12, %edx
        !           112:        subl    $3, %ecx
        !           113:        ja      L(top)
        !           114:
        !           115:
        !           116:        C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
        !           117:
        !           118:        addl    $1, %ecx
        !           119:        js      L(combine)              C 0 more
        !           120: C movd and paddq do not modify EFLAGS, so the jz below still tests the addl
        !           121:        movd    (%edx), %mm3
        !           122:        paddq   %mm3, %mm0
        !           123:
        !           124:        jz      L(combine)              C 1 more
        !           125:
        !           126:        movd    4(%edx), %mm3
        !           127:        paddq   %mm3, %mm1
        !           128:
        !           129: L(combine):
        !           130:        movq    %mm7, %mm3              C low halves
        !           131:        pand    %mm0, %mm3
        !           132:
        !           133:        movq    %mm7, %mm4
        !           134:        pand    %mm1, %mm4
        !           135:
        !           136:        movq    %mm7, %mm5
        !           137:        pand    %mm2, %mm5
        !           138:
        !           139:        psrlq   $32, %mm0               C high halves
        !           140:        psrlq   $32, %mm1
        !           141:        psrlq   $32, %mm2
        !           142:
        !           143:        paddq   %mm0, %mm4              C fold high halves to give 33 bits each
        !           144:        paddq   %mm1, %mm5              C high of sum 1mod3 weighs the same as low of sum 2mod3
        !           145:        paddq   %mm2, %mm3              C high of sum 2mod3 wraps to 0mod3 (2^96 == 1)
        !           146:
        !           147:        psllq   $8, %mm4                C combine at respective offsets
        !           148:        psllq   $16, %mm5               C 2^32 == 2^8 and 2^64 == 2^16 mod 2^24-1
        !           149:        paddq   %mm4, %mm3
        !           150:        paddq   %mm5, %mm3              C 0x000cxxxxxxxxxxxx, 50 bits
        !           151:
        !           152:        pand    %mm3, %mm6              C fold at 24 bits
        !           153:        psrlq   $24, %mm3
        !           154:
        !           155:        paddq   %mm6, %mm3
        !           156:        movd    %mm3, %eax              C low dword is the return value
        !           157:
        !           158:        ASSERT(z,       C nothing left in high dword
        !           159:        `psrlq  $32, %mm3
        !           160:        movd    %mm3, %ecx
        !           161:        orl     %ecx, %ecx')
        !           162:
        !           163:        emms
        !           164:        ret
        !           165:
        !           166: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>