Annotation of OpenXM_contrib/gmp/mpn/x86/k7/gcd_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K7: 6.75 cycles/bit (approx) 1x1 gcd
26: C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
27:
28:
29: dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
30: dnl where x is the larger of the two. See tune/README for more.
31: dnl
32: dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
33: dnl suggests 40/7*2=11.4 but 7 seems to be about right.
34: dnl The 1x1 code uses this as a shift count: the divide is skipped when y is above x>>DIV_THRESHOLD.
35: deflit(DIV_THRESHOLD, 7)
36:
37:
38: C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
39: C
40: C This is mixed in with the code, but as per the k7 optimization manual it's
41: C a full cache line and suitably aligned so it won't get swapped between
42: C code and data. Having it in TEXT rather than RODATA saves needing a GOT
43: C entry when PIC.
44: C
45: C Actually, there doesn't seem to be a measurable difference between this in
46: C its own cache line or plonked in the middle of the code. Presumably
47: C since TEXT is read-only there's no worries about coherency.
48:
49: deflit(MASK, 63) C mask selecting the low bits of x used as the table index
50: deflit(MAXSHIFT, 6) C value stored in table[0], returned when all the indexed bits are zero
51: C MASK+1 single-byte entries: table[0] is MAXSHIFT, then the trailing zero count of i for i = 1 to MASK.
52: TEXT
53: ALIGN(64)
54: L(table):
55: .byte MAXSHIFT
56: forloop(i,1,MASK,
57: ` .byte m4_count_trailing_zeros(i)
58: ')
59:
60:
61: C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
62: C Requires limb != 0 and size >= 1, as checked by the ASSERTs at entry. The result is returned in eax.
63: C PARAM_* name the three incoming stack arguments, at positive offsets.
64: defframe(PARAM_LIMB, 12)
65: defframe(PARAM_SIZE, 8)
66: defframe(PARAM_SRC, 4)
67: C SAVE_* hold the caller values of ebx, esi, edi and ebp. CALL_* are the arguments stored for the mpn_modexact_1_odd call.
68: defframe(SAVE_EBX, -4)
69: defframe(SAVE_ESI, -8)
70: defframe(SAVE_EDI, -12)
71: defframe(SAVE_EBP, -16)
72: defframe(CALL_DIVISOR,-20)
73: defframe(CALL_SIZE, -24)
74: defframe(CALL_SRC, -28)
75: C Bytes subtracted from esp on entry: the seven 4-byte slots at negative offsets above.
76: deflit(STACK_SPACE, 28)
77:
78: TEXT
79: ALIGN(16)
80: C Result is returned in eax. ebx and esi are saved on entry, edi only when PIC, ebp only on the two-or-more-limb path.
81: PROLOGUE(mpn_gcd_1)
82: deflit(`FRAME',0)
83:
84: ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0
85: ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1
86:
87: movl PARAM_SRC, %eax
88: movl PARAM_LIMB, %edx C y
89: subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
90:
91: movl %esi, SAVE_ESI
92: movl %ebx, SAVE_EBX
93:
94: movl (%eax), %esi C src low limb
95:
96: ifdef(`PIC',`
97: movl %edi, SAVE_EDI
98: call L(movl_eip_to_edi)
99: L(here):
100: addl $L(table)-L(here), %edi C edi = address of L(table)
101: ')
102: C Count the trailing zero bits shared by x and y, by shifting x|y right until a 1 bit drops out.
103: movl %esi, %ebx C x
104: orl %edx, %esi C x|y
105: movl $-1, %ecx C becomes 0 on the first pass of the loop
106:
107: L(twos):
108: incl %ecx
109: shrl %esi
110: jnc L(twos) C 3/4 chance of x or y odd already
111: C Remove the common twos from both operands and remember the count in esi.
112: shrl %cl, %ebx
113: shrl %cl, %edx
114: movl %ecx, %esi C common twos
115: C A single limb falls through to the 1x1 gcd, two or more limbs go to L(divide).
116: movl PARAM_SIZE, %ecx
117: cmpl $1, %ecx
118: ja L(divide)
119:
120:
121: C eax
122: C ebx x
123: C ecx
124: C edx y
125: C esi common twos
126: C edi [PIC] L(table)
127: C ebp
128:
129: movl %edx, %eax
130: cmpl %ebx, %edx
131:
132: cmovb( %ebx, %eax) C swap to make x bigger than y
133: cmovb( %edx, %ebx)
134:
135:
136: L(strip_y):
137: C eax x
138: C ebx y
139: C ecx
140: C edx
141: C esi common twos
142: C edi [PIC] L(table)
143: C ebp
144:
145: ASSERT(nz,`orl %ebx,%ebx')
146: shrl %ebx
147: jnc L(strip_y) C loop until a 1 bit is shifted out
148: rcll %ebx C rotate that 1 bit back in, y now odd
149:
150:
151: C eax x
152: C ebx y (odd)
153: C ecx
154: C edx
155: C esi common twos
156: C edi [PIC] L(table)
157: C ebp
158: C Divide only when y is not above x>>DIV_THRESHOLD, otherwise go straight to the binary loop.
159: movl %eax, %ecx
160: movl %ebx, %edx
161: shrl $DIV_THRESHOLD, %eax
162:
163: cmpl %eax, %ebx
164: movl %ecx, %eax C restore x, movl leaves the cmpl flags intact
165: ja L(strip_x_entry) C skip the x%y unless x is much bigger than y
166:
167: C x%y. The remainder must go to ecx, since L(strip_x) reloads eax from ecx.
168: xorl %edx, %edx C zero the high half of the dividend
169:
170: divl %ebx
171:
172: orl %edx, %edx C set ZF if the remainder is zero
173: movl %edx, %ecx C remainder -> x, picked up by L(strip_x)
174: movl %ebx, %edx C y
175:
176: jz L(done_ebx) C y divides x, so y is the odd part of the gcd
177: jmp L(strip_x)
178:
179:
180: C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by
181: C ensuring the end of the jnz at the end of this loop doesn't cross
182: C into the next cache line at 0xC0.
183: C
184: C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
185: C it crosses but doesn't suffer any measurable slowdown.
186:
187: L(top):
188: C eax x
189: C ebx y-x
190: C ecx x-y
191: C edx y
192: C esi twos, for use at end
193: C edi [PIC] L(table)
194: C Keep the positive difference in ecx and the smaller of x and y in edx.
195: cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x
196: cmovc( %eax, %edx)
197: C Strip trailing zero bits from the value in ecx, at most MAXSHIFT bits per table lookup.
198: L(strip_x):
199: movl %ecx, %eax
200: L(strip_x_entry):
201: andl $MASK, %ecx C low bits of x index L(table)
202:
203: ASSERT(nz, `orl %eax, %eax')
204:
205: ifdef(`PIC',`
206: movb (%ecx,%edi), %cl
207: ',`
208: movb L(table) (%ecx), %cl
209: ')
210:
211: shrl %cl, %eax
212: cmpb $MAXSHIFT, %cl C MAXSHIFT is only returned when the indexed bits were all zero
213:
214: movl %eax, %ecx
215: movl %edx, %ebx
216: je L(strip_x) C there may be more zero bits to strip
217:
218: ASSERT(nz, `testl $1, %eax') C both odd
219: ASSERT(nz, `testl $1, %edx')
220:
221: subl %eax, %ebx C y-x
222: subl %edx, %ecx C x-y, carry set if x<y
223: jnz L(top) C fall through when x==y, eax is then the odd part of the gcd
224:
225: C Shift the common twos back into the result, restore registers and return.
226: L(done):
227: movl %esi, %ecx
228: movl SAVE_ESI, %esi
229: ifdef(`PIC',`
230: movl SAVE_EDI, %edi
231: ')
232:
233: shll %cl, %eax
234: movl SAVE_EBX, %ebx
235: addl $FRAME, %esp
236:
237: ret
238:
239:
240:
241: C -----------------------------------------------------------------------------
242: C two or more limbs
243:
244: dnl MODEXACT_THRESHOLD is the size at which it's better to call
245: dnl mpn_modexact_1_odd than do an inline loop.
246:
247: deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
248:
249: L(divide):
250: C eax src
251: C ebx
252: C ecx size
253: C edx y
254: C esi common twos
255: C edi [PIC] L(table)
256: C ebp
257: C Make y odd before reducing src by it.
258: L(divide_strip_y):
259: ASSERT(nz,`orl %edx,%edx')
260: shrl %edx
261: jnc L(divide_strip_y)
262: leal 1(%edx,%edx), %ebx C y now odd
263:
264: movl %ebp, SAVE_EBP
265: movl %eax, %ebp C src
266: movl -4(%eax,%ecx,4), %eax C src high limb
267:
268: cmp $MODEXACT_THRESHOLD, %ecx
269: jae L(modexact)
270: C Inline remainder loop, dividing from the high limb down to the low limb.
271: cmpl %ebx, %eax C high cmp divisor
272: movl $0, %edx C movl rather than xorl, to keep the flags from the cmpl
273:
274: cmovc( %eax, %edx) C skip a div if high<divisor
275: sbbl $0, %ecx C one limb fewer to divide in that case
276:
277:
278: L(divide_top):
279: C eax scratch (quotient)
280: C ebx y
281: C ecx counter (size to 1, inclusive)
282: C edx carry (remainder)
283: C esi common twos
284: C edi [PIC] L(table)
285: C ebp src
286:
287: movl -4(%ebp,%ecx,4), %eax
288:
289: divl %ebx
290:
291: decl %ecx
292: jnz L(divide_top)
293:
294:
295: C eax
296: C ebx y (odd)
297: C ecx
298: C edx x
299: C esi common twos
300: C edi [PIC] L(table)
301: C ebp
302: C Set up eax = ecx = x and edx = y as L(strip_x_entry) expects, the movl instructions keep ZF from the orl.
303: orl %edx, %edx
304: movl SAVE_EBP, %ebp
305: movl %edx, %eax
306:
307: movl %edx, %ecx
308: movl %ebx, %edx
309: jnz L(strip_x_entry)
310:
311: C Zero remainder: y in ebx is the odd part of the gcd.
312: L(done_ebx):
313: movl %ebx, %eax
314: jmp L(done)
315:
316:
317:
318: L(modexact):
319: C eax
320: C ebx y
321: C ecx size
322: C edx
323: C esi common twos
324: C edi [PIC] L(table)
325: C ebp src
326: C Store src, size and y in the CALL_* slots and call mpn_modexact_1_odd, with y also kept in ebx (non-PIC) or ebp (PIC).
327: ifdef(`PIC',`
328: movl %ebp, CALL_SRC
329: movl %ebx, %ebp C y
330: movl %edi, %ebx C L(table)
331:
332: addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
333: movl %ebp, CALL_DIVISOR
334: movl %ecx, CALL_SIZE
335:
336: call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
337: ',`
338: dnl non-PIC
339: movl %ebx, CALL_DIVISOR
340: movl %ebp, CALL_SRC
341: movl %ecx, CALL_SIZE
342:
343: call GSYM_PREFIX`'mpn_modexact_1_odd
344: ')
345:
346: C eax x
347: C ebx [non-PIC] y
348: C ecx
349: C edx
350: C esi common twos
351: C edi [PIC] L(table)
352: C ebp [PIC] y
353: C A nonzero x continues in the binary loop, a zero x finishes with y as the odd part of the gcd.
354: orl %eax, %eax
355: movl ifdef(`PIC',`%ebp',`%ebx'), %edx
356: movl SAVE_EBP, %ebp
357:
358: movl %eax, %ecx
359: jnz L(strip_x_entry)
360:
361: movl %edx, %eax
362: jmp L(done)
363:
364:
365: ifdef(`PIC', `
366: L(movl_eip_to_edi):
367: movl (%esp), %edi C return address of the call
368: ret
369: ')
370:
371: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>