Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
2: dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
3:
1.1.1.2 ! ohara 4: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 5: dnl
6: dnl This file is part of the GNU MP Library.
7: dnl
8: dnl The GNU MP Library is free software; you can redistribute it and/or
9: dnl modify it under the terms of the GNU Lesser General Public License as
10: dnl published by the Free Software Foundation; either version 2.1 of the
11: dnl License, or (at your option) any later version.
12: dnl
13: dnl The GNU MP Library is distributed in the hope that it will be useful,
14: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16: dnl Lesser General Public License for more details.
17: dnl
18: dnl You should have received a copy of the GNU Lesser General Public
19: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
20: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
21: dnl Suite 330, Boston, MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
dnl NOTE(review): NAILS_SUPPORT is defined elsewhere.  The argument reads as
dnl the range of nail bit counts, 0 to 31, that this file is meant to work
dnl with - confirm against the macro definition.
1.1.1.2 ! ohara 25: NAILS_SUPPORT(0-31)
! 26:
! 27:
! 28: C alignment dst/src1/src2, A=0mod8, N=4mod8
! 29: C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
! 30: C
! 31: C K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
! 32: C K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
! 33: C K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
! 34: C
! 35: C K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
! 36: C K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
! 37: C K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
! 38:
1.1 maekawa 39:
40: dnl M4_p and M4_i are the MMX and integer instructions
41: dnl M4_*_neg_dst means whether to negate the final result before writing
42: dnl M4_*_neg_src2 means whether to negate the src2 values before using them
43:
dnl Usage: M4_choose_op(name, p, p_neg_dst, p_neg_src2, i, i_neg_dst, i_neg_src2)
dnl
dnl Takes exactly seven arguments.  Expands to nothing unless the symbol
dnl OPERATION_name is defined.  When it is defined the arguments are recorded
dnl in these macros for use by the code further down.
dnl
dnl   M4_function      mpn_ followed by name
dnl   M4_operation     name
dnl   M4_p             argument 2, the MMX instruction
dnl   M4_p_neg_dst     argument 3
dnl   M4_p_neg_src2    argument 4
dnl   M4_i             argument 5, the integer instruction
dnl   M4_i_neg_dst     argument 6
dnl   M4_i_neg_src2    argument 7
44: define(M4_choose_op,
45: m4_assert_numargs(7)
46: `ifdef(`OPERATION_$1',`
47: define(`M4_function', `mpn_$1')
48: define(`M4_operation', `$1')
49: define(`M4_p', `$2')
50: define(`M4_p_neg_dst', `$3')
51: define(`M4_p_neg_src2',`$4')
52: define(`M4_i', `$5')
53: define(`M4_i_neg_dst', `$6')
54: define(`M4_i_neg_src2',`$7')
55: ')')
56:
57: dnl xnor is done in "iorn" style because it's a touch faster than "nior"
58: dnl style (the two are equivalent for xor).
1.1.1.2 ! ohara 59: dnl
! 60: dnl pandn can't be used with nails.
1.1 maekawa 61:
dnl One M4_choose_op call per supported operation.  After the name come the
dnl MMX instruction with its neg_dst and neg_src2 flags, then the integer
dnl instruction with its neg_dst and neg_src2 flags.  Only a call whose
dnl OPERATION symbol is defined has any effect.
dnl
dnl For andn_n the GMP_NAIL_BITS test picks pandn when there are no nails,
dnl and otherwise pand with src2 negated first.
62: M4_choose_op( and_n, pand,0,0, andl,0,0)
1.1.1.2 ! ohara 63: ifelse(GMP_NAIL_BITS,0,
! 64: `M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
! 65: `M4_choose_op(andn_n, pand,0,1, andl,0,1)')
1.1 maekawa 66: M4_choose_op( nand_n, pand,1,0, andl,1,0)
67: M4_choose_op( ior_n, por,0,0, orl,0,0)
68: M4_choose_op( iorn_n, por,0,1, orl,0,1)
69: M4_choose_op( nior_n, por,1,0, orl,1,0)
70: M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
71: M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
72:
dnl M4_function is only defined when one of the calls above matched an
dnl OPERATION symbol, so a missing M4_function is reported as an error.
73: ifdef(`M4_function',,
74: `m4_error(`Unrecognised or undefined OPERATION symbol
75: ')')
76:
dnl NOTE(review): MULFUNC_PROLOGUE is defined elsewhere.  It is handed the
dnl names of all eight entry points this file can produce - confirm what the
dnl build does with the list.
77: MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
78:
79:
80: C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
81: C mp_size_t size);
82: C
83: C Do src1,size M4_operation src2,size, storing the result in dst,size.
84: C
85: C Unaligned movq loads and stores are a bit slower than aligned ones. The
86: C test at the start of the routine checks the alignment of src1 and if
87: C necessary processes one limb separately at the low end to make it aligned.
88: C
89: C The raw speeds without this alignment switch are as follows.
90: C
91: C alignment dst/src1/src2, A=0mod8, N=4mod8
92: C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
93: C
94: C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
95: C K6 1.75 2.2 2.0 2.28 iorn,xnor
96: C K6 2.0 2.25 2.35 2.28 nand,nior
97: C
98: C
99: C Future:
100: C
101: C K6 can do one 64-bit load per cycle so each of these routines should be
102: C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
103: C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
104: C The others are 4 instructions per 2 limbs, and so can only approach 1.0
105: C because there's nowhere to hide some loop control.
106:
C Names for the four arguments, read with movl below.
C NOTE(review): the numbers look like byte offsets from the stack pointer at
C entry, with FRAME tracking later pushes - confirm against defframe.
107: defframe(PARAM_SIZE,16)
108: defframe(PARAM_SRC2,12)
109: defframe(PARAM_SRC1,8)
110: defframe(PARAM_DST, 4)
111: deflit(`FRAME',0)
112:
1.1.1.2 ! ohara 113: TEXT
1.1 maekawa 114: ALIGN(32)
115: PROLOGUE(M4_function)
C Fetch the arguments: ecx = size, eax = src1, ebx = src2, edx = dst.
C ebx is pushed first because it is overwritten with src2, and it is popped
C again on each of the three return paths.
C The flags from cmpl are consumed by the ja.  The movl between the two does
C not change flags.
116: movl PARAM_SIZE, %ecx
1.1.1.2 ! ohara 117: pushl %ebx FRAME_pushl()
! 118:
1.1 maekawa 119: movl PARAM_SRC1, %eax
1.1.1.2 ! ohara 120:
1.1 maekawa 121: movl PARAM_SRC2, %ebx
122: cmpl $1, %ecx
1.1.1.2 ! ohara 123:
1.1 maekawa 124: movl PARAM_DST, %edx
125: ja L(two_or_more)
126:
127:
C Size is not above 1.  One limb is processed with the integer instruction.
C ecx is reused for the data, and ebx is restored as soon as the src2 limb
C has been read through it.
C NOTE(review): notl_or_xorl_GMP_NUMB_MASK is defined elsewhere, presumably
C it complements the numb bits of its register operand - confirm.
128: movl (%ebx), %ecx
129: popl %ebx
1.1.1.2 ! ohara 130: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
1.1 maekawa 131: M4_i (%eax), %ecx
1.1.1.2 ! ohara 132: ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)')
1.1 maekawa 133: movl %ecx, (%edx)
134:
135: ret
136:
137:
138: L(two_or_more):
139: C eax src1
140: C ebx src2
141: C ecx size
142: C edx dst
143: C esi
144: C edi
145: C ebp
146:
C esi is saved because it is used as scratch here and as the limb count
C later.  If bit 2 of the src1 address is set, one limb is done with integer
C instructions and all three pointers advance by 4 bytes, which clears that
C bit in src1 before the movq loop.  The size in ecx drops by one to match.
1.1.1.2 ! ohara 147: pushl %esi FRAME_pushl()
1.1 maekawa 148: testl $4, %eax
149: jz L(alignment_ok)
150:
151: movl (%ebx), %esi
152: addl $4, %ebx
1.1.1.2 ! ohara 153: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)')
1.1 maekawa 154: M4_i (%eax), %esi
155: addl $4, %eax
1.1.1.2 ! ohara 156: ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %esi)')
1.1 maekawa 157: movl %esi, (%edx)
158: addl $4, %edx
159: decl %ecx
160:
161: L(alignment_ok):
C esi = limbs still to do.  The shrl turns ecx into the number of limb pairs
C and leaves the low bit of the count in the carry flag.  If the pair count
C is zero, one limb is processed here with integer instructions and the
C function returns without having used any MMX register.
162: movl %ecx, %esi
163: shrl %ecx
164: jnz L(still_two_or_more)
165:
166: movl (%ebx), %ecx
167: popl %esi
1.1.1.2 ! ohara 168: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
1.1 maekawa 169: M4_i (%eax), %ecx
1.1.1.2 ! ohara 170: ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)')
1.1 maekawa 171: popl %ebx
172: movl %ecx, (%edx)
173: ret
174:
175:
176: L(still_two_or_more):
C mm7 is set up only when the MMX path negates src2 or the result.  pcmpeqd
C makes it all ones, and with nails psrld clears the top GMP_NAIL_BITS bits
C of each 32-bit half, so a pxor with mm7 complements only the other bits.
177: ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
1.1.1.2 ! ohara 178: pcmpeqd %mm7, %mm7 C all ones
! 179: ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails
1.1 maekawa 180: ')
181:
182: ALIGN(16)
183: L(top):
184: C eax src1
185: C ebx src2
186: C ecx counter
187: C edx dst
188: C esi
189: C edi
190: C ebp
191: C
192: C carry bit is low of size
193:
C Each pass handles two limbs, going from the highest pair down to the
C lowest as ecx counts down to zero.  The MMX instructions and the loop
C instruction leave the carry flag from the earlier shrl untouched, which
C the jnc after the loop relies on.
194: movq -8(%ebx,%ecx,8), %mm0
195: ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
196: M4_p -8(%eax,%ecx,8), %mm0
197: ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
198: movq %mm0, -8(%edx,%ecx,8)
199:
200: loop L(top)
201:
202:
C Carry set means the limb count in esi was odd.  The limb left over is the
C highest one, at index esi-1.  ebx is overwritten with the data since src2
C is not needed after this load.
203: jnc L(no_extra)
204:
205: movl -4(%ebx,%esi,4), %ebx
1.1.1.2 ! ohara 206: ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)')
1.1 maekawa 207: M4_i -4(%eax,%esi,4), %ebx
1.1.1.2 ! ohara 208: ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ebx)')
1.1 maekawa 209: movl %ebx, -4(%edx,%esi,4)
210: L(no_extra):
211:
C Restore the saved registers in reverse order of the pushes.
C NOTE(review): emms_or_femms is defined elsewhere, presumably it expands to
C emms or femms to leave MMX state before returning - confirm.
212: popl %esi
213: popl %ebx
214: emms_or_femms
215: ret
216:
217: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>