Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/logops_n.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
2: dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
3: dnl
4: dnl alignment dst/src1/src2, A=0mod8, N=4mod8
5: dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
6: dnl
7: dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
8: dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
9: dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
10: dnl
11: dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
12: dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
13: dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
14:
15:
16: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
17: dnl
18: dnl This file is part of the GNU MP Library.
19: dnl
20: dnl The GNU MP Library is free software; you can redistribute it and/or
21: dnl modify it under the terms of the GNU Lesser General Public License as
22: dnl published by the Free Software Foundation; either version 2.1 of the
23: dnl License, or (at your option) any later version.
24: dnl
25: dnl The GNU MP Library is distributed in the hope that it will be useful,
26: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
27: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28: dnl Lesser General Public License for more details.
29: dnl
30: dnl You should have received a copy of the GNU Lesser General Public
31: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
32: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
33: dnl Suite 330, Boston, MA 02111-1307, USA.
34:
35:
36: include(`../config.m4')
37:
38:
39: dnl M4_p and M4_i are the MMX and integer instructions
40: dnl M4_*_neg_dst means whether to negate the final result before writing
41: dnl M4_*_neg_src2 means whether to negate the src2 values before using them
42:
dnl Usage: M4_choose_op(name, p, p_neg_dst, p_neg_src2, i, i_neg_dst, i_neg_src2)
dnl
dnl If OPERATION_<name> is defined, set M4_function to mpn_<name>,
dnl M4_operation to <name>, and M4_p, M4_p_neg_dst, M4_p_neg_src2, M4_i,
dnl M4_i_neg_dst, M4_i_neg_src2 from arguments 2 to 7 in that order.
dnl If OPERATION_<name> is not defined the ifdef has no else branch, so
dnl the call leaves all of those symbols untouched.
dnl
dnl NOTE(review): m4_assert_numargs is defined outside this file; presumably
dnl it rejects calls that do not pass exactly 7 arguments -- confirm.
43: define(M4_choose_op,
44: m4_assert_numargs(7)
45: `ifdef(`OPERATION_$1',`
46: define(`M4_function', `mpn_$1')
47: define(`M4_operation', `$1')
48: define(`M4_p', `$2')
49: define(`M4_p_neg_dst', `$3')
50: define(`M4_p_neg_src2',`$4')
51: define(`M4_i', `$5')
52: define(`M4_i_neg_dst', `$6')
53: define(`M4_i_neg_src2',`$7')
54: ')')
55:
dnl Operation table, one row per supported function.  Columns after the
dnl name are: MMX instruction, MMX neg_dst, MMX neg_src2, integer
dnl instruction, integer neg_dst, integer neg_src2.
dnl
dnl Results selected by the rows below:
dnl   and_n   src1 & src2        nand_n  ~(src1 & src2)
dnl   andn_n  src1 & ~src2       ior_n   src1 | src2
dnl   iorn_n  src1 | ~src2       nior_n  ~(src1 | src2)
dnl   xor_n   src1 ^ src2        xnor_n  src1 ^ ~src2
dnl
dnl andn_n is the one row where the two paths differ in shape: the MMX
dnl path uses pandn with no separate negation, the integer path uses andl
dnl with neg_src2 set.
dnl
56: dnl xnor is done in "iorn" style because it's a touch faster than "nior"
57: dnl style (the two are equivalent for xor).
58:
59: M4_choose_op( and_n, pand,0,0, andl,0,0)
60: M4_choose_op( andn_n, pandn,0,0, andl,0,1)
61: M4_choose_op( nand_n, pand,1,0, andl,1,0)
62: M4_choose_op( ior_n, por,0,0, orl,0,0)
63: M4_choose_op( iorn_n, por,0,1, orl,0,1)
64: M4_choose_op( nior_n, por,1,0, orl,1,0)
65: M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
66: M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
67:
dnl Guard: M4_function is only defined by an M4_choose_op row whose
dnl OPERATION_ symbol is set, so if it is still undefined here no row
dnl matched and m4_error is invoked with the message below.
68: ifdef(`M4_function',,
69: `m4_error(`Unrecognised or undefined OPERATION symbol
70: ')')
71:
dnl The eight names listed here are the mpn_ forms of the eight rows in the
dnl M4_choose_op table.  NOTE(review): MULFUNC_PROLOGUE is defined outside
dnl this file; presumably it tells the build which entry points this one
dnl source can be assembled as -- confirm against the macro definition.
72: MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
73:
74:
75: C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
76: C mp_size_t size);
77: C
78: C Do src1,size M4_operation src2,size, storing the result in dst,size.
79: C
80: C Unaligned movq loads and stores are a bit slower than aligned ones. The
81: C test at the start of the routine checks the alignment of src1 and if
82: C necessary processes one limb separately at the low end to make it aligned.
83: C
84: C The raw speeds without this alignment switch are as follows.
85: C
86: C alignment dst/src1/src2, A=0mod8, N=4mod8
87: C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
88: C
89: C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
90: C K6 1.75 2.2 2.0 2.28 iorn,xnor
91: C K6 2.0 2.25 2.35 2.28 nand,nior
92: C
93: C
94: C Future:
95: C
96: C K6 can do one 64-bit load per cycle so each of these routines should be
97: C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
98: C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
99: C The others are 4 instructions per 2 limbs, and so can only approach 1.0
100: C because there's nowhere to hide some loop control.
101:
C Stack parameters, 4 bytes apart in the prototype order dst, src1, src2,
C size.  NOTE(review): defframe and deflit are defined outside this file;
C presumably the numbers are byte offsets from the stack pointer at entry
C and FRAME records how far later pushes have moved it -- confirm.
102: defframe(PARAM_SIZE,16)
103: defframe(PARAM_SRC2,12)
104: defframe(PARAM_SRC1,8)
105: defframe(PARAM_DST, 4)
106: deflit(`FRAME',0)
107:
C Body of M4_function.  There are three return points: size of one limb,
C exactly one limb left once src1 has been aligned, and the end of the MMX
C loop (with or without an odd trailing limb).  Each return pops exactly
C the registers pushed on its path (ebx always, esi from L(two_or_more) on).
C
C M4_i / M4_p are the integer / MMX instruction picked by M4_choose_op; the
C ifelse lines emit the optional negation of src2 or of the result.
C
C NOTE(review): FRAME_pushl is defined outside this file; presumably it
C keeps the PARAM_ offsets correct after each pushl -- confirm.
108: .text
109: ALIGN(32)
110: PROLOGUE(M4_function)
111: movl PARAM_SIZE, %ecx C ecx = size
112: pushl %ebx
113: FRAME_pushl()
114: movl PARAM_SRC1, %eax C eax = src1
115: movl PARAM_SRC2, %ebx C ebx = src2
116: cmpl $1, %ecx
117: movl PARAM_DST, %edx C edx = dst
118: ja L(two_or_more) C unsigned size > 1
119:
120:
C Single limb: integer instruction only, no MMX register is used on this
C path.
121: movl (%ebx), %ecx C ecx = src2[0], size no longer needed
122: popl %ebx
123: ifelse(M4_i_neg_src2,1,`notl %ecx')
124: M4_i (%eax), %ecx C combine with src1[0]
125: ifelse(M4_i_neg_dst,1,` notl %ecx')
126: movl %ecx, (%edx)
127:
128: ret
129:
130:
131: L(two_or_more):
132: C eax src1
133: C ebx src2
134: C ecx size
135: C edx dst
136: C esi (not yet saved, pushed just below)
137: C edi
138: C ebp
139: C
140: C carry does not hold the low bit of size yet, shrl at L(alignment_ok) sets it
141:
142: pushl %esi
143: FRAME_pushl()
144: testl $4, %eax C is src1 at 4 mod 8 ?
145: jz L(alignment_ok)
146:
C src1 is 4 mod 8: do the lowest limb with the integer instruction and
C step all three pointers on by one limb so the movq loads from src1 are
C 8-byte aligned.
147: movl (%ebx), %esi
148: addl $4, %ebx
149: ifelse(M4_i_neg_src2,1,`notl %esi')
150: M4_i (%eax), %esi
151: addl $4, %eax
152: ifelse(M4_i_neg_dst,1,` notl %esi')
153: movl %esi, (%edx)
154: addl $4, %edx
155: decl %ecx
156:
157: L(alignment_ok):
158: movl %ecx, %esi C esi = limbs remaining, used for the odd tail
159: shrl %ecx C ecx = number of limb pairs, carry = odd limb flag
160: jnz L(still_two_or_more)
161:
C No complete pair remains (size was 2 and one limb went on alignment):
C finish the last limb with the integer instruction and return.
162: movl (%ebx), %ecx
163: popl %esi
164: ifelse(M4_i_neg_src2,1,`notl %ecx')
165: M4_i (%eax), %ecx
166: ifelse(M4_i_neg_dst,1,` notl %ecx')
167: popl %ebx
168: movl %ecx, (%edx)
169: ret
170:
171:
172: L(still_two_or_more):
C mm7 is set to all ones only when the selected operation needs a
C negation in the MMX path; the pxor lines in the loop use it as the mask.
173: ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
174: pcmpeqd %mm7, %mm7 C all ones
175: ')
176:
177: ALIGN(16)
178: L(top):
179: C eax src1
180: C ebx src2
181: C ecx counter, limb pairs still to do
182: C edx dst
183: C esi limbs remaining as counted at L(alignment_ok)
184: C edi
185: C ebp
186: C
187: C carry bit is low of size (from the shrl), the jnc after the loop tests it
188:
C Index ecx with displacement -8 addresses pair number ecx-1, so the pairs
C are processed from the high end of the operands down to the low end.
189: movq -8(%ebx,%ecx,8), %mm0 C mm0 = two limbs of src2
190: ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
191: M4_p -8(%eax,%ecx,8), %mm0 C combine with two limbs of src1
192: ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
193: movq %mm0, -8(%edx,%ecx,8)
194:
195: loop L(top)
196:
197:
198: jnc L(no_extra) C even limb count, nothing left
199:
C Odd limb count: the pairs covered limbs 0 to esi-2, so the remaining limb
C is number esi-1.  ebx is reused as scratch since src2 is not needed again
C and the saved ebx is restored by the popl below.
200: movl -4(%ebx,%esi,4), %ebx
201: ifelse(M4_i_neg_src2,1,`notl %ebx')
202: M4_i -4(%eax,%esi,4), %ebx
203: ifelse(M4_i_neg_dst,1,` notl %ebx')
204: movl %ebx, -4(%edx,%esi,4)
205: L(no_extra):
206:
207: popl %esi
208: popl %ebx
C NOTE(review): emms_or_femms is defined outside this file; presumably it
C emits emms or femms to leave MMX state before returning -- confirm.
209: emms_or_femms
210: ret
211:
212: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>