OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm - annotate

Return to logops_n.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
                      2: dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
                      3:
1.1.1.2 ! ohara       4: dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     5: dnl
                      6: dnl  This file is part of the GNU MP Library.
                      7: dnl
                      8: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      9: dnl  modify it under the terms of the GNU Lesser General Public License as
                     10: dnl  published by the Free Software Foundation; either version 2.1 of the
                     11: dnl  License, or (at your option) any later version.
                     12: dnl
                     13: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     14: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     15: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     16: dnl  Lesser General Public License for more details.
                     17: dnl
                     18: dnl  You should have received a copy of the GNU Lesser General Public
                     19: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     20: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     21: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     22:
                     23: include(`../config.m4')
                     24:
1.1.1.2 ! ohara      25: NAILS_SUPPORT(0-31)
        !            26:
        !            27:
        !            28: C         alignment dst/src1/src2, A=0mod8, N=4mod8
        !            29: C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
        !            30: C
        !            31: C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
        !            32: C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
        !            33: C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
        !            34: C
        !            35: C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
        !            36: C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
        !            37: C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
        !            38:
1.1       maekawa    39:
                     40: dnl  M4_p and M4_i are the MMX and integer instructions respectively.
                     41: dnl  M4_*_neg_dst is 1 to complement the final result before writing it.
                     42: dnl  M4_*_neg_src2 is 1 to complement the src2 values before using them.
                     43: dnl  The definitions are made only when OPERATION_<first argument> is defined.
                     44: define(M4_choose_op,
                     45: m4_assert_numargs(7)
                     46: `ifdef(`OPERATION_$1',`
                     47: define(`M4_function',  `mpn_$1')
                     48: define(`M4_operation', `$1')
                     49: define(`M4_p',         `$2')
                     50: define(`M4_p_neg_dst', `$3')
                     51: define(`M4_p_neg_src2',`$4')
                     52: define(`M4_i',         `$5')
                     53: define(`M4_i_neg_dst', `$6')
                     54: define(`M4_i_neg_src2',`$7')
                     55: ')')
                     56:
                     57: dnl  xnor is done in "iorn" style (src2 complemented) because it's a touch
                     58: dnl  faster than "nior" style (result complemented); both give xnor.
1.1.1.2 ! ohara      59: dnl
        !            60: dnl  pandn can't be used with nails.
1.1       maekawa    61: dnl  Arguments: operation, then insn,neg_dst,neg_src2 for MMX and for integer.
                     62: M4_choose_op( and_n,  pand,0,0,  andl,0,0)
1.1.1.2 ! ohara      63: ifelse(GMP_NAIL_BITS,0,
        !            64: `M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
        !            65: `M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
1.1       maekawa    66: M4_choose_op( nand_n, pand,1,0,  andl,1,0)
                     67: M4_choose_op( ior_n,  por,0,0,   orl,0,0)
                     68: M4_choose_op( iorn_n, por,0,1,   orl,0,1)
                     69: M4_choose_op( nior_n, por,1,0,   orl,1,0)
                     70: M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
                     71: M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
                     72: dnl  Give an error if no OPERATION_xxx symbol selected one of the above.
                     73: ifdef(`M4_function',,
                     74: `m4_error(`Unrecognised or undefined OPERATION symbol
                     75: ')')
                     76:
                     77: MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
                     78:
                     79:
                     80: C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
                     81: C                   mp_size_t size);
                     82: C
                     83: C Do src1,size M4_operation src2,size, storing the result in dst,size.
                     84: C
                     85: C Unaligned movq loads and stores are a bit slower than aligned ones.  The
                     86: C test at the start of the routine checks the alignment of src1 and if
                     87: C necessary processes one limb separately at the low end to make it aligned.
                     88: C
                     89: C The raw speeds without this alignment switch are as follows.
                     90: C
                     91: C           alignment dst/src1/src2, A=0mod8, N=4mod8
                     92: C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
                     93: C
                     94: C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
                     95: C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
                     96: C K6                 2.0    2.25                2.35   2.28   nand,nior
                     97: C
                     98: C
                     99: C Future:
                    100: C
                    101: C K6 can do one 64-bit load per cycle so each of these routines should be
                    102: C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
                    103: C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
                    104: C The others are 4 instructions per 2 limbs, and so can only approach 1.0
                    105: C because there's nowhere to hide some loop control.
                    106:
                    107: defframe(PARAM_SIZE,16)
                    108: defframe(PARAM_SRC2,12)
                    109: defframe(PARAM_SRC1,8)
                    110: defframe(PARAM_DST, 4)
                    111: deflit(`FRAME',0)
                    112:
1.1.1.2 ! ohara     113:        TEXT
1.1       maekawa   114:        ALIGN(32)
                    115: PROLOGUE(M4_function)
                    116:                        movl    PARAM_SIZE, %ecx
1.1.1.2 ! ohara     117:                        pushl   %ebx            FRAME_pushl()
        !           118:
1.1       maekawa   119:                        movl    PARAM_SRC1, %eax
1.1.1.2 ! ohara     120:
1.1       maekawa   121:                        movl    PARAM_SRC2, %ebx
                    122:                        cmpl    $1, %ecx
1.1.1.2 ! ohara     123:
1.1       maekawa   124:                        movl    PARAM_DST, %edx
                    125:                        ja      L(two_or_more)
                    126:                        C not above one limb: do a single limb with M4_i, no MMX used
                    127:
                    128:                        movl    (%ebx), %ecx
                    129:                        popl    %ebx
1.1.1.2 ! ohara     130: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(    %ecx)')
1.1       maekawa   131:                        M4_i    (%eax), %ecx
1.1.1.2 ! ohara     132: ifelse(M4_i_neg_dst,1,`        notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
1.1       maekawa   133:                        movl    %ecx, (%edx)
                    134:
                    135:                        ret
                    136:
                    137:
                    138: L(two_or_more):
                    139:                        C eax   src1
                    140:                        C ebx   src2
                    141:                        C ecx   size
                    142:                        C edx   dst
                    143:                        C esi
                    144:                        C edi
                    145:                        C ebp
                    146:
1.1.1.2 ! ohara     147:                        pushl   %esi            FRAME_pushl()
1.1       maekawa   148:                        testl   $4, %eax
                    149:                        jz      L(alignment_ok)
                    150:                        C src1 has bit 2 set: do one limb with M4_i and step past it
                    151:                        movl    (%ebx), %esi
                    152:                        addl    $4, %ebx
1.1.1.2 ! ohara     153: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(    %esi)')
1.1       maekawa   154:                        M4_i    (%eax), %esi
                    155:                        addl    $4, %eax
1.1.1.2 ! ohara     156: ifelse(M4_i_neg_dst,1,`        notl_or_xorl_GMP_NUMB_MASK(     %esi)')
1.1       maekawa   157:                        movl    %esi, (%edx)
                    158:                        addl    $4, %edx
                    159:                        decl    %ecx
                    160:
                    161: L(alignment_ok):
                    162:                        movl    %ecx, %esi
                    163:                        shrl    %ecx
                    164:                        jnz     L(still_two_or_more)
                    165:                        C size is now 1: do the single remaining limb with M4_i
                    166:                        movl    (%ebx), %ecx
                    167:                        popl    %esi
1.1.1.2 ! ohara     168: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(    %ecx)')
1.1       maekawa   169:                        M4_i    (%eax), %ecx
1.1.1.2 ! ohara     170: ifelse(M4_i_neg_dst,1,`        notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
1.1       maekawa   171:                        popl    %ebx
                    172:                        movl    %ecx, (%edx)
                    173:                        ret
                    174:
                    175:
                    176: L(still_two_or_more):
                    177: ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
1.1.1.2 ! ohara     178:                        pcmpeqd %mm7, %mm7              C all ones
        !           179: ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7')  C clear nails
1.1       maekawa   180: ')
                    181:
                    182:                        ALIGN(16)
                    183: L(top):
                    184:                        C eax   src1
                    185:                        C ebx   src2
                    186:                        C ecx   counter, limb pairs still to do
                    187:                        C edx   dst
                    188:                        C esi   size as at L(alignment_ok), for the odd limb below
                    189:                        C edi
                    190:                        C ebp
                    191:                        C
                    192:                        C carry flag is the low bit of size from the shrl, tested by the jnc below
                    193:
                    194:                        movq    -8(%ebx,%ecx,8), %mm0
                    195: ifelse(M4_p_neg_src2,1,`pxor   %mm7, %mm0')
                    196:                        M4_p    -8(%eax,%ecx,8), %mm0
                    197: ifelse(M4_p_neg_dst,1,`        pxor    %mm7, %mm0')
                    198:                        movq    %mm0, -8(%edx,%ecx,8)
                    199:
                    200:                        loop    L(top)
                    201:
                    202:                        C odd size: one more limb, the top one at index esi-1
                    203:                        jnc     L(no_extra)
                    204:
                    205:                        movl    -4(%ebx,%esi,4), %ebx
1.1.1.2 ! ohara     206: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(    %ebx)')
1.1       maekawa   207:                        M4_i    -4(%eax,%esi,4), %ebx
1.1.1.2 ! ohara     208: ifelse(M4_i_neg_dst,1,`        notl_or_xorl_GMP_NUMB_MASK(     %ebx)')
1.1       maekawa   209:                        movl    %ebx, -4(%edx,%esi,4)
                    210: L(no_extra):
                    211:
                    212:                        popl    %esi
                    213:                        popl    %ebx
                    214:                        emms_or_femms
                    215:                        ret
                    216:
                    217: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>