Annotation of OpenXM_contrib/gmp/mpn/x86/k6/gcd_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K6 mpn_mod_1 -- mpn by 1 gcd.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K6: 9.5 cycles/bit (approx) 1x1 gcd
26: C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
27:
28:
29: C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
30: C
31: C This code is nothing very special, but offers a speedup over what gcc 2.95
32: C can do with mpn/generic/gcd_1.c.
33: C
34: C Future:
35: C
36: C Using a lookup table to count trailing zeros seems a touch quicker, but
37: C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
38: C it too.
39:
40:
41: dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
42: dnl bigger than y, then a division x%y is done to reduce it.
43: dnl
44: dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
45: dnl there should be an advantage in the divl at about 4 or 5 bits, which is
46: dnl what's found.
47:
48: deflit(DIV_THRESHOLD, 5)
49:
50:
51: defframe(PARAM_LIMB, 12)
52: defframe(PARAM_SIZE, 8)
53: defframe(PARAM_SRC, 4)
54:
55: TEXT
56: ALIGN(16)
57:
58: PROLOGUE(mpn_gcd_1)
59: deflit(`FRAME',0)
60:
61: ASSERT(ne, `cmpl $0, PARAM_LIMB')
62: ASSERT(ae, `cmpl $1, PARAM_SIZE')
63:
64:
65: movl PARAM_SRC, %eax
66: pushl %ebx FRAME_pushl()
67:
68: movl PARAM_LIMB, %edx
69: movl $-1, %ecx
70:
71: movl (%eax), %ebx C src low limb
72:
73: movl %ebx, %eax C src low limb
74: orl %edx, %ebx
75:
76: L(common_twos):
77: shrl %ebx
78: incl %ecx
79:
80: jnc L(common_twos) C 1/4 chance on random data
81: shrl %cl, %edx C y
82:
83: cmpl $1, PARAM_SIZE
84: ja L(size_two_or_more)
85:
86:
87: ASSERT(nz, `orl %eax, %eax') C should have src limb != 0
88:
89: shrl %cl, %eax C x
90:
91:
92: C Swap if necessary to make x>=y. Measures a touch quicker as a
93: C jump than a branch free calculation.
94: C
95: C eax x
96: C ebx
97: C ecx common twos
98: C edx y
99:
100: movl %eax, %ebx
101: cmpl %eax, %edx
102:
103: jb L(noswap)
104: movl %edx, %eax
105:
106: movl %ebx, %edx
107: movl %eax, %ebx
108: L(noswap):
109:
110:
111: C See if it's worth reducing x with a divl.
112: C
113: C eax x
114: C ebx x
115: C ecx common twos
116: C edx y
117:
118: shrl $DIV_THRESHOLD, %ebx
119:
120: cmpl %ebx, %edx
121: ja L(nodiv)
122:
123:
124: C Reduce x to x%y.
125: C
126: C eax x
127: C ebx
128: C ecx common twos
129: C edx y
130:
131: movl %edx, %ebx
132: xorl %edx, %edx
133:
134: divl %ebx
135:
136: orl %edx, %edx C y
137: nop C code alignment
138:
139: movl %ebx, %eax C x
140: jz L(done_shll)
141: L(nodiv):
142:
143:
144: C eax x
145: C ebx
146: C ecx common twos
147: C edx y
148: C esi
149: C edi
150: C ebp
151:
152: L(strip_y):
153: shrl %edx
154: jnc L(strip_y)
155:
156: leal 1(%edx,%edx), %edx
157: movl %ecx, %ebx C common twos
158:
159: leal 1(%eax), %ecx
160: jmp L(strip_x_and)
161:
162:
163: C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
164: C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
165: C only 1/4.
166: C
167: C A second computed %cl shift was tried, but that measured a touch slower
168: C than branching back.
169: C
170: C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
171: C measured about 1 cycle/bit slower.
172:
173: C eax x
174: C ebx common twos
175: C ecx scratch
176: C edx y
177:
178: ALIGN(4)
179: L(swap):
180: addl %eax, %edx C x-y+y = x
181: negl %eax C -(x-y) = y-x
182:
183: L(strip_x):
184: shrl %eax C odd-odd = even, so always one to strip
185: ASSERT(nz)
186:
187: L(strip_x_leal):
188: leal 1(%eax), %ecx
189:
190: L(strip_x_and):
191: andl $1, %ecx C (x^1)&1
192:
193: shrl %cl, %eax C shift if x even
194:
195: testb $1, %al
196: jz L(strip_x)
197:
198: ASSERT(nz,`testl $1, %eax') C x, y odd
199: ASSERT(nz,`testl $1, %edx')
200:
201: subl %edx, %eax
202: jb L(swap)
203: ja L(strip_x)
204:
205:
206: movl %edx, %eax
207: movl %ebx, %ecx
208:
209: L(done_shll):
210: shll %cl, %eax
211: popl %ebx
212:
213: ret
214:
215:
216: C -----------------------------------------------------------------------------
217: C Two or more limbs.
218: C
219: C x={src,size} is reduced modulo y using either a plain mod_1 style
220: C remainder, or a modexact_1 style exact division.
221:
222: deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
223:
224: ALIGN(8)
225: L(size_two_or_more):
226: C eax
227: C ebx
228: C ecx common twos
229: C edx y, without common twos
230: C esi
231: C edi
232: C ebp
233:
234: deflit(FRAME_TWO_OR_MORE, FRAME)
235:
236: pushl %edi defframe_pushl(SAVE_EDI)
237: movl PARAM_SRC, %ebx
238:
239: L(y_twos):
240: shrl %edx
241: jnc L(y_twos)
242:
243: movl %ecx, %edi C common twos
244: movl PARAM_SIZE, %ecx
245:
246: pushl %esi defframe_pushl(SAVE_ESI)
247: leal 1(%edx,%edx), %esi C y (odd)
248:
249: movl -4(%ebx,%ecx,4), %eax C src high limb
250:
251: cmpl %edx, %eax C carry if high<divisor
252:
253: sbbl %edx, %edx C -1 if high<divisor
254:
255: addl %edx, %ecx C skip one limb if high<divisor
256: andl %eax, %edx
257:
258: cmpl $MODEXACT_THRESHOLD, %ecx
259: jae L(modexact)
260:
261:
262: L(divide_top):
263: C eax scratch (quotient)
264: C ebx src
265: C ecx counter, size-1 to 1
266: C edx carry (remainder)
267: C esi divisor (odd)
268: C edi
269: C ebp
270:
271: movl -4(%ebx,%ecx,4), %eax
272: divl %esi
273: loop L(divide_top)
274:
275:
276: movl %edx, %eax C x
277: movl %esi, %edx C y (odd)
278:
279: movl %edi, %ebx C common twos
280: popl %esi
281:
282: popl %edi
283: leal 1(%eax), %ecx
284:
285: orl %eax, %eax
286: jnz L(strip_x_and)
287:
288:
289: movl %ebx, %ecx
290: movl %edx, %eax
291:
292: shll %cl, %eax
293: popl %ebx
294:
295: ret
296:
297:
298: ALIGN(8)
299: L(modexact):
300: C eax
301: C ebx src ptr
302: C ecx size or size-1
303: C edx
304: C esi y odd
305: C edi common twos
306: C ebp
307:
308: movl PARAM_SIZE, %eax
309: pushl %esi FRAME_pushl()
310:
311: pushl %eax FRAME_pushl()
312:
313: pushl %ebx FRAME_pushl()
314:
315: ifdef(`PIC',`
316: nop C code alignment
317: call L(movl_eip_ebx)
318: L(here):
319: addl $_GLOBAL_OFFSET_TABLE_, %ebx
320: call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
321: ',`
322: call GSYM_PREFIX`'mpn_modexact_1_odd
323: ')
324:
325: movl %esi, %edx C y odd
326: movl SAVE_ESI, %esi
327:
328: movl %edi, %ebx C common twos
329: movl SAVE_EDI, %edi
330:
331: addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp
332: orl %eax, %eax
333:
334: leal 1(%eax), %ecx
335: jnz L(strip_x_and)
336:
337:
338: movl %ebx, %ecx
339: movl %edx, %eax
340:
341: shll %cl, %eax
342: popl %ebx
343:
344: ret
345:
346:
347: ifdef(`PIC',`
348: L(movl_eip_ebx):
349: movl (%esp), %ebx
350: ret
351: ')
352:
353: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>