OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm - annotate

Return to logops_n.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
                      2: dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
                      3: dnl
                      4: dnl          alignment dst/src1/src2, A=0mod8, N=4mod8
                      5: dnl       A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
                      6: dnl
                      7: dnl  K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
                      8: dnl  K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
                      9: dnl  K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
                     10: dnl
                     11: dnl  K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
                     12: dnl  K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
                     13: dnl  K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
                     14:
                     15:
                     16: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                     17: dnl
                     18: dnl  This file is part of the GNU MP Library.
                     19: dnl
                     20: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     21: dnl  modify it under the terms of the GNU Lesser General Public License as
                     22: dnl  published by the Free Software Foundation; either version 2.1 of the
                     23: dnl  License, or (at your option) any later version.
                     24: dnl
                     25: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     26: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     27: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     28: dnl  Lesser General Public License for more details.
                     29: dnl
                     30: dnl  You should have received a copy of the GNU Lesser General Public
                     31: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     32: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     33: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     34:
                     35:
                     36: include(`../config.m4')
                     37:
                     38:
                     39: dnl  Usage: M4_choose_op(op, p,p_neg_dst,p_neg_src2, i,i_neg_dst,i_neg_src2)
                     40: dnl  Only if OPERATION_op is defined: M4_function is mpn_op, M4_operation is op,
                     41: dnl  M4_p and M4_i are the MMX and integer instructions, M4_*_neg_dst means whether
                     42: dnl  to negate the final result before writing, M4_*_neg_src2 whether to negate src2 first.
                     43: define(M4_choose_op,
                     44: m4_assert_numargs(7)
                     45: `ifdef(`OPERATION_$1',`
                     46: define(`M4_function',  `mpn_$1')
                     47: define(`M4_operation', `$1')
                     48: define(`M4_p',         `$2')
                     49: define(`M4_p_neg_dst', `$3')
                     50: define(`M4_p_neg_src2',`$4')
                     51: define(`M4_i',         `$5')
                     52: define(`M4_i_neg_dst', `$6')
                     53: define(`M4_i_neg_src2',`$7')
                     54: ')')
                     55:
                     56: dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
                     57: dnl  style (the two are equivalent for xor).
                     58:
                     59: M4_choose_op( and_n,  pand,0,0,  andl,0,0) dnl  src1 & src2
                     60: M4_choose_op( andn_n, pandn,0,0, andl,0,1) dnl  src1 & ~src2 (pandn itself inverts mm0, which holds src2)
                     61: M4_choose_op( nand_n, pand,1,0,  andl,1,0) dnl  ~(src1 & src2)
                     62: M4_choose_op( ior_n,  por,0,0,   orl,0,0) dnl  src1 | src2
                     63: M4_choose_op( iorn_n, por,0,1,   orl,0,1) dnl  src1 | ~src2
                     64: M4_choose_op( nior_n, por,1,0,   orl,1,0) dnl  ~(src1 | src2)
                     65: M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0) dnl  src1 ^ src2
                     66: M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1) dnl  src1 ^ ~src2
                     67: dnl  If no OPERATION_xxx symbol matched above, M4_function is still undefined: report it.
                     68: ifdef(`M4_function',,
                     69: `m4_error(`Unrecognised or undefined OPERATION symbol
                     70: ')')
                     71:
                     72: MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
                     73:
                     74:
                     75: C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
                     76: C                   mp_size_t size);
                     77: C
                     78: C Do src1,size M4_operation src2,size, storing the result in dst,size.
                     79: C
                     80: C Unaligned movq loads and stores are a bit slower than aligned ones.  The
                     81: C test at the start of the routine checks the alignment of src1 and if
                     82: C necessary processes one limb separately at the low end to make it aligned.
                     83: C
                     84: C The raw speeds without this alignment switch are as follows.
                     85: C
                     86: C           alignment dst/src1/src2, A=0mod8, N=4mod8
                     87: C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
                     88: C
                     89: C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
                     90: C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
                     91: C K6                 2.0    2.25                2.35   2.28   nand,nior
                     92: C
                     93: C
                     94: C Future:
                     95: C
                     96: C K6 can do one 64-bit load per cycle so each of these routines should be
                     97: C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
                     98: C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
                     99: C The others are 4 instructions per 2 limbs, and so can only approach 1.0
                    100: C because there's nowhere to hide some loop control.
                    101:
                    102: defframe(PARAM_SIZE,16)
                    103: defframe(PARAM_SRC2,12)
                    104: defframe(PARAM_SRC1,8)
                    105: defframe(PARAM_DST, 4)
                    106: deflit(`FRAME',0)
                    107:
                    108:        .text
                    109:        ALIGN(32)
                     110: PROLOGUE(M4_function)
                     111:                        movl    PARAM_SIZE, %ecx        C ecx = size
                     112:                        pushl   %ebx                    C ebx is used for src2, popped again before every ret
                     113:                FRAME_pushl()
                     114:                        movl    PARAM_SRC1, %eax
                     115:                        movl    PARAM_SRC2, %ebx
                     116:                        cmpl    $1, %ecx
                     117:                        movl    PARAM_DST, %edx         C movl leaves the cmpl flags intact for the ja
                     118:                        ja      L(two_or_more)          C unsigned size > 1
                     119:
                     120:                        C Here size <= 1: one limb is processed with the integer instruction.
                     121:                        movl    (%ebx), %ecx            C ecx = src2[0], size no longer needed
                     122:                        popl    %ebx
                     123: ifelse(M4_i_neg_src2,1,`notl   %ecx')
                     124:                        M4_i    (%eax), %ecx            C combine with src1[0]
                     125: ifelse(M4_i_neg_dst,1,`        notl    %ecx')
                     126:                        movl    %ecx, (%edx)            C store dst[0]
                     127:
                     128:                        ret
                     129:
                     130:
                     131: L(two_or_more):
                     132:                        C eax   src1
                     133:                        C ebx   src2
                     134:                        C ecx   size
                     135:                        C edx   dst
                     136:                        C esi
                     137:                        C edi
                     138:                        C ebp
                     139:                        C
                     140:                        C flags are still those of the cmpl, the carry only becomes low of size at the shrl below
                     141:
                     142:                        pushl   %esi                    C esi is scratch here and later keeps the limb count
                     143:                FRAME_pushl()
                     144:                        testl   $4, %eax                C is src1 4mod8, ie. not 8-byte aligned
                     145:                        jz      L(alignment_ok)
                     146:                        C src1 is 4mod8: do the lowest limb alone so the rest of src1 is 0mod8
                     147:                        movl    (%ebx), %esi
                     148:                        addl    $4, %ebx
                     149: ifelse(M4_i_neg_src2,1,`notl   %esi')
                     150:                        M4_i    (%eax), %esi
                     151:                        addl    $4, %eax
                     152: ifelse(M4_i_neg_dst,1,`        notl    %esi')
                     153:                        movl    %esi, (%edx)
                     154:                        addl    $4, %edx
                     155:                        decl    %ecx                    C one limb done, all three pointers advanced
                     156:
                     157: L(alignment_ok):
                     158:                        movl    %ecx, %esi              C esi = remaining size in limbs
                     159:                        shrl    %ecx                    C ecx = number of limb pairs, carry = low bit of size
                     160:                        jnz     L(still_two_or_more)
                     161:                        C No pairs, so exactly one limb remains: do it with integer code, no MMX used.
                     162:                        movl    (%ebx), %ecx
                     163:                        popl    %esi
                     164: ifelse(M4_i_neg_src2,1,`notl   %ecx')
                     165:                        M4_i    (%eax), %ecx
                     166: ifelse(M4_i_neg_dst,1,`        notl    %ecx')
                     167:                        popl    %ebx
                     168:                        movl    %ecx, (%edx)
                     169:                        ret
                     170:
                     171:
                     172: L(still_two_or_more):
                     173: ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
                     174:                        pcmpeqd %mm7, %mm7      C all ones
                     175: ')
                     176:
                     177:                        ALIGN(16)
                     178: L(top):
                     179:                        C eax   src1
                     180:                        C ebx   src2
                     181:                        C ecx   counter
                     182:                        C edx   dst
                     183:                        C esi
                     184:                        C edi
                     185:                        C ebp
                     186:                        C
                     187:                        C carry bit is low of size
                     188:                        C Each pass handles limbs 2*ecx-2 and 2*ecx-1, so pairs are done from the top down.
                     189:                        movq    -8(%ebx,%ecx,8), %mm0   C mm0 = two limbs of src2
                     190: ifelse(M4_p_neg_src2,1,`pxor   %mm7, %mm0')
                     191:                        M4_p    -8(%eax,%ecx,8), %mm0   C combine with two limbs of src1
                     192: ifelse(M4_p_neg_dst,1,`        pxor    %mm7, %mm0')
                     193:                        movq    %mm0, -8(%edx,%ecx,8)
                     194:
                     195:                        loop    L(top)                  C decrement ecx and repeat while nonzero
                     196:
                     197:                        C Nothing since the shrl has written the flags, so carry is still low of size.
                     198:                        jnc     L(no_extra)             C even count, every limb is done
                     199:                        C Odd count: limb esi-1 at the top is left.  ebx is reused as scratch from here.
                     200:                        movl    -4(%ebx,%esi,4), %ebx
                     201: ifelse(M4_i_neg_src2,1,`notl   %ebx')
                     202:                        M4_i    -4(%eax,%esi,4), %ebx
                     203: ifelse(M4_i_neg_dst,1,`        notl    %ebx')
                     204:                        movl    %ebx, -4(%edx,%esi,4)
                     205: L(no_extra):
                     206:
                     207:                        popl    %esi
                     208:                        popl    %ebx
                     209:                        emms_or_femms                   C macro defined elsewhere, presumably leaves MMX state - confirm
                     210:                        ret
                     211:
                     212: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>