OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm - annotate

Return to logops_n.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
        !             2: dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
        !             3: dnl
        !             4: dnl          alignment dst/src1/src2, A=0mod8, N=4mod8
        !             5: dnl       A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
        !             6: dnl
        !             7: dnl  K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
        !             8: dnl  K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
        !             9: dnl  K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
        !            10: dnl
        !            11: dnl  K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
        !            12: dnl  K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
        !            13: dnl  K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
        !            14:
        !            15:
        !            16: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !            17: dnl
        !            18: dnl  This file is part of the GNU MP Library.
        !            19: dnl
        !            20: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            21: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            22: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            23: dnl  License, or (at your option) any later version.
        !            24: dnl
        !            25: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            26: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            27: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            28: dnl  Lesser General Public License for more details.
        !            29: dnl
        !            30: dnl  You should have received a copy of the GNU Lesser General Public
        !            31: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            32: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            33: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            34:
        !            35:
        !            36: include(`../config.m4')
        !            37:
        !            38:
        !            39: dnl  M4_choose_op(name, p,p_neg_dst,p_neg_src2, i,i_neg_dst,i_neg_src2)
        !            40: dnl  If OPERATION_name is defined, select mpn_name as M4_function, with M4_p
        !            41: dnl  and M4_i the MMX and integer instructions, M4_*_neg_dst whether to negate
        !            42: dnl  the final result before writing, M4_*_neg_src2 whether to negate src2 first.
        !            43: define(M4_choose_op,
        !            44: m4_assert_numargs(7)
        !            45: `ifdef(`OPERATION_$1',`
        !            46: define(`M4_function',  `mpn_$1')
        !            47: define(`M4_operation', `$1')
        !            48: define(`M4_p',         `$2')
        !            49: define(`M4_p_neg_dst', `$3')
        !            50: define(`M4_p_neg_src2',`$4')
        !            51: define(`M4_i',         `$5')
        !            52: define(`M4_i_neg_dst', `$6')
        !            53: define(`M4_i_neg_src2',`$7')
        !            54: ')')
        !            55:
        !            56: dnl  xnor negates src2 first ("iorn" style) rather than negating the result
        !            57: dnl  ("nior" style); both give the same answer for xor, the former is a touch faster.
        !            58:
        !            59: M4_choose_op( and_n,  pand,0,0,  andl,0,0)     C src1 & src2
        !            60: M4_choose_op( andn_n, pandn,0,0, andl,0,1)     C src1 & ~src2 (pandn complements mm0, which holds src2)
        !            61: M4_choose_op( nand_n, pand,1,0,  andl,1,0)     C ~(src1 & src2)
        !            62: M4_choose_op( ior_n,  por,0,0,   orl,0,0)      C src1 | src2
        !            63: M4_choose_op( iorn_n, por,0,1,   orl,0,1)      C src1 | ~src2
        !            64: M4_choose_op( nior_n, por,1,0,   orl,1,0)      C ~(src1 | src2)
        !            65: M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)     C src1 ^ src2
        !            66: M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)     C src1 ^ ~src2
        !            67: dnl  Report an error if none of the OPERATION_xxx symbols above defined M4_function.
        !            68: ifdef(`M4_function',,
        !            69: `m4_error(`Unrecognised or undefined OPERATION symbol
        !            70: ')')
        !            71: dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it declares the functions this source can build -- confirm.
        !            72: MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
        !            73:
        !            74:
        !            75: C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
        !            76: C                   mp_size_t size);
        !            77: C
        !            78: C Do src1,size M4_operation src2,size, storing the result in dst,size.
        !            79: C
        !            80: C Unaligned movq loads and stores are a bit slower than aligned ones.  The
        !            81: C test at the start of the routine checks the alignment of src1 and if
        !            82: C necessary processes one limb separately at the low end to make it aligned.
        !            83: C
        !            84: C The raw speeds without this alignment switch are as follows.
        !            85: C
        !            86: C           alignment dst/src1/src2, A=0mod8, N=4mod8
        !            87: C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
        !            88: C
        !            89: C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
        !            90: C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
        !            91: C K6                 2.0    2.25                2.35   2.28   nand,nior
        !            92: C
        !            93: C
        !            94: C Future:
        !            95: C
        !            96: C K6 can do one 64-bit load per cycle so each of these routines should be
        !            97: C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
        !            98: C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
        !            99: C The others are 4 instructions per 2 limbs, and so can only approach 1.0
        !           100: C because there's nowhere to hide some loop control.
        !           101:
        !           102: defframe(PARAM_SIZE,16)
        !           103: defframe(PARAM_SRC2,12)
        !           104: defframe(PARAM_SRC1,8)
        !           105: defframe(PARAM_DST, 4)
        !           106: deflit(`FRAME',0)
        !           107: C Entry: size <= 1 is handled inline below, anything larger goes to L(two_or_more).
        !           108:        .text
        !           109:        ALIGN(32)
        !           110: PROLOGUE(M4_function)
        !           111:                        movl    PARAM_SIZE, %ecx        C ecx = size
        !           112:                        pushl   %ebx                    C ebx will hold src2; it is popped again before every ret
        !           113:                FRAME_pushl()
        !           114:                        movl    PARAM_SRC1, %eax        C eax = src1
        !           115:                        movl    PARAM_SRC2, %ebx        C ebx = src2
        !           116:                        cmpl    $1, %ecx                C sets flags for the ja below (the movl in between leaves flags alone)
        !           117:                        movl    PARAM_DST, %edx         C edx = dst
        !           118:                        ja      L(two_or_more)          C unsigned size > 1
        !           119:                        C size <= 1 lands here: exactly one limb is processed, integer only, no MMX.
        !           120:
        !           121:                        movl    (%ebx), %ecx            C ecx = src2[0]
        !           122:                        popl    %ebx                    C src2 pointer no longer needed, restore ebx
        !           123: ifelse(M4_i_neg_src2,1,`notl   %ecx')
        !           124:                        M4_i    (%eax), %ecx            C combine with src1[0]
        !           125: ifelse(M4_i_neg_dst,1,`        notl    %ecx')
        !           126:                        movl    %ecx, (%edx)            C dst[0] = result
        !           127:
        !           128:                        ret
        !           129:
        !           130:
        !           131: L(two_or_more):
        !           132:                        C eax   src1
        !           133:                        C ebx   src2
        !           134:                        C ecx   size
        !           135:                        C edx   dst
        !           136:                        C esi
        !           137:                        C edi
        !           138:                        C ebp
        !           139:                        C
        !           140:                        C flags here are still those of the cmpl; carry is only set up by the shrl below
        !           141:
        !           142:                        pushl   %esi                    C esi is used as scratch below; popped before every ret
        !           143:                FRAME_pushl()
        !           144:                        testl   $4, %eax                C is src1 at 4 mod 8?
        !           145:                        jz      L(alignment_ok)         C no, bit 2 clear, src1 already 0 mod 8
        !           146:                        C Process the lowest limb separately and step all three pointers past it, making src1 0 mod 8.
        !           147:                        movl    (%ebx), %esi            C esi = src2[0]
        !           148:                        addl    $4, %ebx
        !           149: ifelse(M4_i_neg_src2,1,`notl   %esi')
        !           150:                        M4_i    (%eax), %esi            C combine with src1[0]
        !           151:                        addl    $4, %eax
        !           152: ifelse(M4_i_neg_dst,1,`        notl    %esi')
        !           153:                        movl    %esi, (%edx)            C dst[0] = result
        !           154:                        addl    $4, %edx
        !           155:                        decl    %ecx                    C one limb fewer remains
        !           156:
        !           157: L(alignment_ok):
        !           158:                        movl    %ecx, %esi              C esi = limbs remaining, used after the loop to address an odd last limb
        !           159:                        shrl    %ecx                    C ecx = number of limb pairs, carry = low bit of remaining size
        !           160:                        jnz     L(still_two_or_more)    C at least one pair remains, use the MMX loop
        !           161:                        C No pair remains: finish the single remaining limb with integer instructions.
        !           162:                        movl    (%ebx), %ecx
        !           163:                        popl    %esi
        !           164: ifelse(M4_i_neg_src2,1,`notl   %ecx')
        !           165:                        M4_i    (%eax), %ecx
        !           166: ifelse(M4_i_neg_dst,1,`        notl    %ecx')
        !           167:                        popl    %ebx
        !           168:                        movl    %ecx, (%edx)
        !           169:                        ret                             C no MMX instruction ran on this path, hence no emms_or_femms
        !           170:
        !           171:
        !           172: L(still_two_or_more):
        !           173: ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
        !           174:                        pcmpeqd %mm7, %mm7      C all ones
        !           175: ')
        !           176:                        C mm7, when set above, is the all-ones mask that the pxor lines in the loop use to complement mm0.
        !           177:                        ALIGN(16)
        !           178: L(top):
        !           179:                        C eax   src1
        !           180:                        C ebx   src2
        !           181:                        C ecx   counter (limb pairs still to do)
        !           182:                        C edx   dst
        !           183:                        C esi
        !           184:                        C edi
        !           185:                        C ebp
        !           186:                        C
        !           187:                        C carry bit is low of size (from the shrl; it is tested by the jnc after the loop)
        !           188:                        C Two limbs per iteration, working from the high end down to the lowest pair.
        !           189:                        movq    -8(%ebx,%ecx,8), %mm0   C mm0 = two limbs of src2
        !           190: ifelse(M4_p_neg_src2,1,`pxor   %mm7, %mm0')
        !           191:                        M4_p    -8(%eax,%ecx,8), %mm0   C combine with the matching two limbs of src1
        !           192: ifelse(M4_p_neg_dst,1,`        pxor    %mm7, %mm0')
        !           193:                        movq    %mm0, -8(%edx,%ecx,8)   C store two limbs of dst
        !           194:
        !           195:                        loop    L(top)                  C decrement ecx, repeat while non-zero; flags are not modified
        !           196:
        !           197:
        !           198:                        jnc     L(no_extra)             C remaining size was even, nothing left over
        !           199:                        C Odd remaining size: handle the last limb, at index esi-1.
        !           200:                        movl    -4(%ebx,%esi,4), %ebx   C ebx reused as scratch; restored by the popl below
        !           201: ifelse(M4_i_neg_src2,1,`notl   %ebx')
        !           202:                        M4_i    -4(%eax,%esi,4), %ebx
        !           203: ifelse(M4_i_neg_dst,1,`        notl    %ebx')
        !           204:                        movl    %ebx, -4(%edx,%esi,4)
        !           205: L(no_extra):
        !           206:
        !           207:                        popl    %esi
        !           208:                        popl    %ebx
        !           209:                        emms_or_femms                   C NOTE(review): macro defined outside this file; presumably leaves MMX state -- confirm
        !           210:                        ret
        !           211:
        !           212: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>