[BACK]Return to mod_34lsub1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mod_34lsub1.asm, Revision 1.1

1.1     ! ohara       1: dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
        !             2:
        !             3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C K6: 2.66 cycles/limb
        !            26:
        !            27:
        !            28: C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
        !            29: C
        !            30: C An attempt was made to use a loop like
        !            31: C
        !            32: C L(top):
        !            33: C      adcl    (%edx), %eax
        !            34: C      adcl    4(%edx), %ebx
        !            35: C      adcl    8(%edx), %esi
        !            36: C      leal    12(%edx), %edx
        !            37: C      loop    L(top)
        !            38: C
        !            39: C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
        !            40: C The form used instead can save about 6 cycles by not dividing by 3.
        !            41: C
        !            42: C In the code used, putting the "leal"s at the top of the loop is necessary
        !            43: C for the claimed speed; anywhere else costs an extra cycle per loop.
        !            44: C Perhaps a tight loop like this needs short decode instructions at the
        !            45: C branch target, which would explain the leal/loop form above taking 8
        !            46: C cycles instead of 7 too.
        !            47:
        !            48: defframe(PARAM_SIZE, 8)
        !            49: defframe(PARAM_SRC,  4)
        !            50:
        !            51: dnl  re-use parameter space: once both parameters are in registers, their stack slots hold the saved ebx and esi
        !            52: define(SAVE_EBX, `PARAM_SIZE')
        !            53: define(SAVE_ESI, `PARAM_SRC')
        !            54:
        !            55:        TEXT
        !            56:        ALIGN(16)
        !            57: PROLOGUE(mpn_mod_34lsub1)
        !            58: deflit(`FRAME',0)
        !            59: C Returns a value congruent to {src,size} mod 2^24-1, not fully reduced.
        !            60:        movl    PARAM_SIZE, %eax
        !            61:        movl    PARAM_SRC, %edx
        !            62: C NOTE(review): size 0 would still load src[0] below; presumably size >= 1, confirm
        !            63:        subl    $2, %eax
        !            64:        ja      L(three_or_more)        C size >= 3 (unsigned compare)
        !            65:
        !            66: Zdisp( movl,   0,(%edx), %eax)         C avoid code cache line boundary
        !            67:        jne     L(one)                  C size 1, flags still from subl; return src[0] as is
        !            68:
        !            69:        movl    %eax, %ecx
        !            70:        movl    4(%edx), %edx
        !            71:
        !            72:        shrl    $24, %eax               C src[0] high 8 bits
        !            73:        andl    $0x00FFFFFF, %ecx       C src[0] low 24 bits
        !            74:
        !            75:        addl    %ecx, %eax
        !            76:        movl    %edx, %ecx
        !            77:
        !            78:        shll    $8, %edx
        !            79:        andl    $0x00FFFF00, %edx       C src[1] low 16 bits, times 2^8
        !            80:
        !            81:        shrl    $16, %ecx               C src[1] high 16 bits
        !            82:        addl    %ecx, %eax
        !            83:
        !            84:        addl    %edx, %eax
        !            85:
        !            86: L(one):
        !            87:        ret
        !            88:
        !            89:
        !            90: L(three_or_more):
        !            91:        C eax   size-2
        !            92:        C ebx
        !            93:        C ecx
        !            94:        C edx   src
        !            95:
        !            96:        movl    %ebx, SAVE_EBX
        !            97:        xorl    %ebx, %ebx
        !            98:
        !            99:        movl    %esi, SAVE_ESI
        !           100:        pushl   %edi    FRAME_pushl()
        !           101:
        !           102:        xorl    %esi, %esi
        !           103:        xorl    %edi, %edi              C and clear carry flag
        !           104:
        !           105: L(top):
        !           106:        C eax   counter, limbs
        !           107:        C ebx   acc 0mod3
        !           108:        C ecx
        !           109:        C edx   src, incrementing
        !           110:        C esi   acc 1mod3
        !           111:        C edi   acc 2mod3
        !           112:        C ebp
        !           113:
        !           114:        leal    -2(%eax), %eax          C leal leaves the carry flag alone
        !           115:        leal    12(%edx), %edx
        !           116:
        !           117:        adcl    -12(%edx), %ebx
        !           118:        adcl    -8(%edx), %esi
        !           119:        adcl    -4(%edx), %edi
        !           120:
        !           121:        decl    %eax                    C decl keeps the carry for the next adcl
        !           122:        jg      L(top)
        !           123:
        !           124:
        !           125:        C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
        !           126:
        !           127:        movb    $0, %cl                 C pending carry is worth 2^0 here
        !           128:        incl    %eax
        !           129:
        !           130:        js      L(combine)              C 0 more
        !           131:
        !           132: Zdisp( adcl,   0,(%edx), %ebx)         C avoid code cache line crossings
        !           133:
        !           134:        movb    $8, %cl                 C carry now worth 2^8 (mod 2^24-1)
        !           135:        decl    %eax
        !           136:
        !           137:        js      L(combine)              C 1 more
        !           138:
        !           139:        adcl    4(%edx), %esi
        !           140:
        !           141:        movb    $16, %cl                C carry now worth 2^16 (mod 2^24-1)
        !           142:
        !           143:
        !           144: L(combine):
        !           145:        sbbl    %edx, %edx              C 0 or -1 from the last carry
        !           146:
        !           147:        shll    %cl, %edx               C carry
        !           148:        movl    %ebx, %eax              C 0mod3
        !           149:
        !           150:        shrl    $24, %eax               C 0mod3 high
        !           151:        andl    $0x00FFFFFF, %ebx       C 0mod3 low
        !           152:
        !           153:        subl    %edx, %eax              C apply carry
        !           154:        movl    %esi, %ecx              C 1mod3
        !           155:
        !           156:        shrl    $16, %esi               C 1mod3 high
        !           157:        addl    %ebx, %eax              C apply 0mod3 low
        !           158:
        !           159:        andl    $0x0000FFFF, %ecx
        !           160:        addl    %esi, %eax              C apply 1mod3 high
        !           161:
        !           162:        shll    $8, %ecx                C 1mod3 low
        !           163:        movl    %edi, %edx              C 2mod3
        !           164:
        !           165:        shrl    $8, %edx                C 2mod3 high
        !           166:        addl    %ecx, %eax              C apply 1mod3 low
        !           167:
        !           168:        addl    %edx, %eax              C apply 2mod3 high
        !           169:        andl    $0x000000FF, %edi
        !           170:
        !           171:        shll    $16, %edi               C 2mod3 low
        !           172:        movl    SAVE_EBX, %ebx
        !           173:
        !           174:        addl    %edi, %eax              C apply 2mod3 low
        !           175:        movl    SAVE_ESI, %esi
        !           176:
        !           177:        popl    %edi
        !           178:
        !           179:        ret
        !           180:
        !           181: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>