Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
! 2: dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
! 3: dnl
! 4: dnl alignment dst/src1/src2, A=0mod8, N=4mod8
! 5: dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
! 6: dnl
! 7: dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
! 8: dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
! 9: dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
! 10: dnl
! 11: dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
! 12: dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
! 13: dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
! 14:
! 15:
! 16: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 17: dnl
! 18: dnl This file is part of the GNU MP Library.
! 19: dnl
! 20: dnl The GNU MP Library is free software; you can redistribute it and/or
! 21: dnl modify it under the terms of the GNU Lesser General Public License as
! 22: dnl published by the Free Software Foundation; either version 2.1 of the
! 23: dnl License, or (at your option) any later version.
! 24: dnl
! 25: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 26: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 27: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 28: dnl Lesser General Public License for more details.
! 29: dnl
! 30: dnl You should have received a copy of the GNU Lesser General Public
! 31: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 32: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 33: dnl Suite 330, Boston, MA 02111-1307, USA.
! 34:
! 35:
! 36: include(`../config.m4')
! 37:
! 38:
! 39: dnl M4_p and M4_i are the MMX and integer instructions applied to each 64-bit pair of limbs or each single 32-bit limb
! 40: dnl M4_*_neg_dst means whether to complement (bitwise NOT) the final result before writing
! 41: dnl M4_*_neg_src2 means whether to complement (bitwise NOT) the src2 values before using them
! 42: dnl Usage: M4_choose_op(name, M4_p,M4_p_neg_dst,M4_p_neg_src2, M4_i,M4_i_neg_dst,M4_i_neg_src2). Nothing is defined unless OPERATION_name is defined, in which case M4_function becomes mpn_name and M4_operation becomes name.
! 43: define(M4_choose_op,
! 44: m4_assert_numargs(7)
! 45: `ifdef(`OPERATION_$1',`
! 46: define(`M4_function', `mpn_$1')
! 47: define(`M4_operation', `$1')
! 48: define(`M4_p', `$2')
! 49: define(`M4_p_neg_dst', `$3')
! 50: define(`M4_p_neg_src2',`$4')
! 51: define(`M4_i', `$5')
! 52: define(`M4_i_neg_dst', `$6')
! 53: define(`M4_i_neg_src2',`$7')
! 54: ')')
! 55:
! 56: dnl xnor is done in "iorn" style because it's a touch faster than "nior"
! 57: dnl style (the two are equivalent for xor).
! 58:
! 59: M4_choose_op( and_n, pand,0,0, andl,0,0) dnl dst = src1 & src2
! 60: M4_choose_op( andn_n, pandn,0,0, andl,0,1) dnl dst = src1 & ~src2; pandn complements its destination register, which holds the loaded src2, so only the integer side needs neg_src2
! 61: M4_choose_op( nand_n, pand,1,0, andl,1,0) dnl dst = ~(src1 & src2)
! 62: M4_choose_op( ior_n, por,0,0, orl,0,0) dnl dst = src1 | src2
! 63: M4_choose_op( iorn_n, por,0,1, orl,0,1) dnl dst = src1 | ~src2
! 64: M4_choose_op( nior_n, por,1,0, orl,1,0) dnl dst = ~(src1 | src2)
! 65: M4_choose_op( xor_n, pxor,0,0, xorl,0,0) dnl dst = src1 ^ src2
! 66: M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) dnl dst = src1 ^ ~src2, which equals ~(src1 ^ src2)
! 67:
! 68: ifdef(`M4_function',,
! 69: `m4_error(`Unrecognised or undefined OPERATION symbol
! 70: ')')
! 71: dnl The check above fires only when none of the M4_choose_op lines found its OPERATION_ symbol defined. MULFUNC_PROLOGUE below is a macro defined elsewhere that is handed the eight function names this file can be built as. NOTE(review): that line is kept free of trailing text in case build scripts scan it; confirm.
! 72: MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
! 73:
! 74:
! 75: C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
! 76: C mp_size_t size);
! 77: C
! 78: C Do src1,size M4_operation src2,size, storing the result in dst,size.
! 79: C
! 80: C Unaligned movq loads and stores are a bit slower than aligned ones. The
! 81: C test at the start of the routine checks the alignment of src1 and if
! 82: C necessary processes one limb separately at the low end to make it aligned.
! 83: C
! 84: C The raw speeds without this alignment switch are as follows.
! 85: C
! 86: C alignment dst/src1/src2, A=0mod8, N=4mod8
! 87: C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
! 88: C
! 89: C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
! 90: C K6 1.75 2.2 2.0 2.28 iorn,xnor
! 91: C K6 2.0 2.25 2.35 2.28 nand,nior
! 92: C
! 93: C
! 94: C Future:
! 95: C
! 96: C K6 can do one 64-bit load per cycle so each of these routines should be
! 97: C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
! 98: C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
! 99: C The others are 4 instructions per 2 limbs, and so can only approach 1.0
! 100: C because there's nowhere to hide some loop control.
! 101:
! 102: defframe(PARAM_SIZE,16) C size, the fourth argument (defframe is defined elsewhere; the number is presumably the byte offset from the stack pointer at entry, confirm)
! 103: defframe(PARAM_SRC2,12) C src2, the third argument
! 104: defframe(PARAM_SRC1,8) C src1, the second argument
! 105: defframe(PARAM_DST, 4) C dst, the first argument
! 106: deflit(`FRAME',0) C FRAME starts at zero; the code calls FRAME_pushl() after each pushl, presumably so PARAM_* keep resolving correctly (macros defined elsewhere, confirm)
! 107:
! 108: .text
! 109: ALIGN(32)
! 110: PROLOGUE(M4_function)
! 111: movl PARAM_SIZE, %ecx C ecx = size
! 112: pushl %ebx C save ebx, it carries the src2 pointer below
! 113: FRAME_pushl() C macro from elsewhere, presumably accounts for the push so PARAM_* still resolve (confirm)
! 114: movl PARAM_SRC1, %eax C eax = src1
! 115: movl PARAM_SRC2, %ebx C ebx = src2
! 116: cmpl $1, %ecx C flags from this compare are consumed by the ja, the movl in between leaves them alone
! 117: movl PARAM_DST, %edx C edx = dst
! 118: ja L(two_or_more) C taken when size > 1 (unsigned)
! 119:
! 120: C Single limb path. A size of 0 would also fall through to here and be handled as one limb. NOTE(review): callers are assumed to pass size >= 1; confirm.
! 121: movl (%ebx), %ecx C ecx = src2[0]
! 122: popl %ebx C the pointer is no longer needed, restore ebx
! 123: ifelse(M4_i_neg_src2,1,`notl %ecx') C complement the src2 limb for operations that ask for it
! 124: M4_i (%eax), %ecx C combine with src1[0]
! 125: ifelse(M4_i_neg_dst,1,` notl %ecx') C complement the result for operations that ask for it
! 126: movl %ecx, (%edx) C dst[0] = result
! 127:
! 128: ret C no MMX instruction ran on this path
! 129:
! 130:
! 131: L(two_or_more):
! 132: C eax src1
! 133: C ebx src2
! 134: C ecx size
! 135: C edx dst
! 136: C esi (still the caller value, saved just below before use)
! 137: C edi (not used by this routine)
! 138: C ebp (not used by this routine)
! 139: C
! 140: C carry is clear on arrival (the ja was taken); it becomes the low bit of the limb count only at the shrl below
! 141:
! 142: pushl %esi C save esi, used as scratch and then as the limb count
! 143: FRAME_pushl()
! 144: testl $4, %eax C bit 2 of the src1 address, set when src1 is 4 mod 8
! 145: jz L(alignment_ok) C src1 is already 0 mod 8
! 146:
! 147: movl (%ebx), %esi C do the lowest limb alone so that src1 becomes 0 mod 8
! 148: addl $4, %ebx C src2 advances one limb
! 149: ifelse(M4_i_neg_src2,1,`notl %esi')
! 150: M4_i (%eax), %esi
! 151: addl $4, %eax C src1 advances one limb
! 152: ifelse(M4_i_neg_dst,1,` notl %esi')
! 153: movl %esi, (%edx)
! 154: addl $4, %edx C dst advances one limb
! 155: decl %ecx C one limb fewer, at least one remains since size was >= 2
! 156:
! 157: L(alignment_ok):
! 158: movl %ecx, %esi C esi = limbs still to do, kept for the odd limb after the loop
! 159: shrl %ecx C ecx = number of limb pairs, carry = low bit of the limb count
! 160: jnz L(still_two_or_more) C at least one pair remains
! 161:
! 162: movl (%ebx), %ecx C exactly one limb left (size was 2 and the alignment step took one)
! 163: popl %esi C restore esi
! 164: ifelse(M4_i_neg_src2,1,`notl %ecx')
! 165: M4_i (%eax), %ecx
! 166: ifelse(M4_i_neg_dst,1,` notl %ecx')
! 167: popl %ebx C restore ebx
! 168: movl %ecx, (%edx)
! 169: ret C MMX has not been used yet on this path
! 170:
! 171:
! 172: L(still_two_or_more):
! 173: ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
! 174: pcmpeqd %mm7, %mm7 C all ones
! 175: ') C emitted only for operations that complement; mm7 is then the xor mask used inside the loop
! 176:
! 177: ALIGN(16)
! 178: L(top):
! 179: C eax src1
! 180: C ebx src2
! 181: C ecx counter, limb pairs left, run down to zero by the loop instruction
! 182: C edx dst
! 183: C esi limb count including any odd limb, read after the loop
! 184: C edi
! 185: C ebp
! 186: C
! 187: C carry bit is low of size (the post-alignment limb count, set by the shrl; the loop body leaves the flags alone)
! 188:
! 189: movq -8(%ebx,%ecx,8), %mm0 C two src2 limbs, highest pair first
! 190: ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') C complement src2 for operations that ask for it
! 191: M4_p -8(%eax,%ecx,8), %mm0 C combine with the matching src1 pair
! 192: ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') C complement the result for operations that ask for it
! 193: movq %mm0, -8(%edx,%ecx,8) C store the pair to dst
! 194:
! 195: loop L(top) C decrement ecx and repeat while it is non-zero, flags untouched
! 196:
! 197:
! 198: jnc L(no_extra) C carry clear means the limb count was even, all done
! 199:
! 200: movl -4(%ebx,%esi,4), %ebx C odd count: top src2 limb (ebx is free, the pointer is not needed again)
! 201: ifelse(M4_i_neg_src2,1,`notl %ebx')
! 202: M4_i -4(%eax,%esi,4), %ebx C combine with the top src1 limb
! 203: ifelse(M4_i_neg_dst,1,` notl %ebx')
! 204: movl %ebx, -4(%edx,%esi,4) C store the top dst limb
! 205: L(no_extra):
! 206:
! 207: popl %esi C restore saved registers in reverse push order
! 208: popl %ebx
! 209: emms_or_femms C macro from elsewhere, by its name it ends MMX use with emms or femms (confirm)
! 210: ret
! 211:
! 212: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>