Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/mod_1.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
2: dnl
3: dnl K7: 17.0 cycles/limb.
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
30: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
31: C mp_limb_t carry);
32: C
33: C The code here is the same as mpn_divrem_1, but with the quotient
34: C discarded. See mpn/x86/k7/mmx/divrem_1.c for some comments.
35:
36:
37: dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
38: dnl used, rather than plain "divl"s. Minimum value 2.
39: dnl
40: dnl The inverse takes about 50 cycles to calculate, but after that the
41: dnl multiply is 17 c/l versus division at 41 c/l.
42: dnl
43: dnl Using mul or div is about the same speed at 3 limbs, so the threshold
44: dnl is set to 4 to get the smaller div code used at 3.
45:
46: deflit(MUL_THRESHOLD, 4)
47:
48:
49: defframe(PARAM_CARRY, 16)
50: defframe(PARAM_DIVISOR,12)
51: defframe(PARAM_SIZE, 8)
52: defframe(PARAM_SRC, 4)
53:
54: defframe(SAVE_EBX, -4)
55: defframe(SAVE_ESI, -8)
56: defframe(SAVE_EDI, -12)
57: defframe(SAVE_EBP, -16)
58:
59: defframe(VAR_NORM, -20)
60: defframe(VAR_INVERSE, -24)
61: defframe(VAR_SRC_STOP,-28)
62:
63: deflit(STACK_SPACE, 28)
64:
65: .text
66: ALIGN(32)
67:
68: PROLOGUE(mpn_mod_1c)
69: deflit(`FRAME',0)
70: movl PARAM_CARRY, %edx
71: movl PARAM_SIZE, %ecx
72: subl $STACK_SPACE, %esp
73: deflit(`FRAME',STACK_SPACE)
74:
75: movl %ebp, SAVE_EBP
76: movl PARAM_DIVISOR, %ebp
77:
78: movl %esi, SAVE_ESI
79: movl PARAM_SRC, %esi
80: jmp LF(mpn_mod_1,start_1c)
81:
82: EPILOGUE()
83:
84:
85: ALIGN(32)
86: PROLOGUE(mpn_mod_1)
87: deflit(`FRAME',0)
88:
89: movl PARAM_SIZE, %ecx
90: movl $0, %edx C initial carry (if can't skip a div)
91: subl $STACK_SPACE, %esp
92: deflit(`FRAME',STACK_SPACE)
93:
94: movl %esi, SAVE_ESI
95: movl PARAM_SRC, %esi
96:
97: movl %ebp, SAVE_EBP
98: movl PARAM_DIVISOR, %ebp
99:
100: orl %ecx, %ecx
101: jz L(divide_done)
102:
103: movl -4(%esi,%ecx,4), %eax C src high limb
104:
105: cmpl %ebp, %eax C carry flag if high<divisor
106:
107: cmovc( %eax, %edx) C src high limb as initial carry
108: sbbl $0, %ecx C size-1 to skip one div
109: jz L(divide_done)
110:
111:
112: ALIGN(16)
113: L(start_1c):
114: C eax
115: C ebx
116: C ecx size
117: C edx carry
118: C esi src
119: C edi
120: C ebp divisor
121:
122: cmpl $MUL_THRESHOLD, %ecx
123: jae L(mul_by_inverse)
124:
125:
126:
127: C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
128: C but it's already fast and compact, and there's nothing to gain by
129: C expanding it out.
130: C
131: C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
132:
133: orl %ecx, %ecx
134: jz L(divide_done)
135:
136:
137: L(divide_top):
138: C eax scratch (quotient)
139: C ebx
140: C ecx counter, limbs, decrementing
141: C edx scratch (remainder)
142: C esi src
143: C edi
144: C ebp
145:
146: movl -4(%esi,%ecx,4), %eax
147:
148: divl PARAM_DIVISOR
149:
150: decl %ecx
151: jnz L(divide_top)
152:
153:
154: L(divide_done):
155: movl SAVE_ESI, %esi
156: movl SAVE_EBP, %ebp
157: addl $STACK_SPACE, %esp
158:
159: movl %edx, %eax
160:
161: ret
162:
163:
164:
165: C -----------------------------------------------------------------------------
166:
167: L(mul_by_inverse):
168: C eax
169: C ebx
170: C ecx size
171: C edx carry
172: C esi src
173: C edi
174: C ebp divisor
175:
176: bsrl %ebp, %eax C 31-l
177:
178: movl %ebx, SAVE_EBX
179: leal -4(%esi), %ebx
180:
181: movl %ebx, VAR_SRC_STOP
182: movl %edi, SAVE_EDI
183:
184: movl %ecx, %ebx C size
185: movl $31, %ecx
186:
187: movl %edx, %edi C carry
188: movl $-1, %edx
189:
190: C
191:
192: xorl %eax, %ecx C l
193: incl %eax C 32-l
194:
195: shll %cl, %ebp C d normalized
196: movl %ecx, VAR_NORM
197:
198: movd %eax, %mm7
199:
200: movl $-1, %eax
201: subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
202:
203: divl %ebp C floor (b*(b-d)-1) / d
204:
205: C
206:
207: movl %eax, VAR_INVERSE
208: leal -12(%esi,%ebx,4), %eax C &src[size-3]
209:
210: movl 8(%eax), %esi C src high limb
211: movl 4(%eax), %edx C src second highest limb
212:
213: shldl( %cl, %esi, %edi) C n2 = carry,high << l
214:
215: shldl( %cl, %edx, %esi) C n10 = high,second << l
216:
217: movl %eax, %ecx C &src[size-3]
218:
219:
220: ifelse(MUL_THRESHOLD,2,`
221: cmpl $2, %ebx
222: je L(inverse_two_left)
223: ')
224:
225:
226: C The dependent chain here is the same as in mpn_divrem_1, but a few
227: C instructions are saved by not needing to store the quotient limbs.
228: C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
229: C
230: C There's four dummy instructions in the loop, all of which are necessary
231: C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
232: C or changed from load to store or vice versa. They're not completely
233: C random, since they correspond to what mpn_divrem_1 has, but there's no
234: C obvious reason why they're necessary. Presumably they induce something
235: C good in the out of order execution, perhaps through some load/store
236: C ordering and/or decoding effects.
237: C
238: C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
239: C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at
240: C about 13.5 c/l.
241:
242: ALIGN(32)
243: L(inverse_top):
244: C eax scratch
245: C ebx scratch (nadj, q1)
246: C ecx src pointer, decrementing
247: C edx scratch
248: C esi n10
249: C edi n2
250: C ebp divisor
251: C
252: C mm0 scratch (src qword)
253: C mm7 rshift for normalization
254:
255: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
256: movl %edi, %eax C n2
257: movl PARAM_SIZE, %ebx C dummy
258:
259: leal (%ebp,%esi), %ebx
260: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
261: sbbl $-1, %eax C n2+n1
262:
263: mull VAR_INVERSE C m*(n2+n1)
264:
265: movq (%ecx), %mm0 C next src limb and the one below it
266: subl $4, %ecx
267:
268: movl %ecx, PARAM_SIZE C dummy
269:
270: C
271:
272: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
273: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
274: movl %ebp, %eax C d
275:
276: C
277:
278: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
279: jz L(q1_ff)
280: nop C dummy
281:
282: mull %ebx C (q1+1)*d
283:
284: psrlq %mm7, %mm0
285: leal 0(%ecx), %ecx C dummy
286:
287: C
288:
289: C
290:
291: subl %eax, %esi
292: movl VAR_SRC_STOP, %eax
293:
294: C
295:
296: sbbl %edx, %edi C n - (q1+1)*d
297: movl %esi, %edi C remainder -> n2
298: leal (%ebp,%esi), %edx
299:
300: movd %mm0, %esi
301:
302: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
303: cmpl %eax, %ecx
304: jne L(inverse_top)
305:
306:
307: L(inverse_loop_done):
308:
309:
310: C -----------------------------------------------------------------------------
311:
312: L(inverse_two_left):
313: C eax scratch
314: C ebx scratch (nadj, q1)
315: C ecx &src[-1]
316: C edx scratch
317: C esi n10
318: C edi n2
319: C ebp divisor
320: C
321: C mm0 scratch (src dword)
322: C mm7 rshift
323:
324: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
325: movl %edi, %eax C n2
326:
327: leal (%ebp,%esi), %ebx
328: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
329: sbbl $-1, %eax C n2+n1
330:
331: mull VAR_INVERSE C m*(n2+n1)
332:
333: movd 4(%ecx), %mm0 C src low limb
334:
335: C
336:
337: C
338:
339: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
340: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
341: movl %ebp, %eax C d
342:
343: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
344:
345: sbbl $0, %ebx
346:
347: mull %ebx C (q1+1)*d
348:
349: psllq $32, %mm0
350:
351: psrlq %mm7, %mm0
352:
353: C
354:
355: subl %eax, %esi
356:
357: C
358:
359: sbbl %edx, %edi C n - (q1+1)*d
360: movl %esi, %edi C remainder -> n2
361: leal (%ebp,%esi), %edx
362:
363: movd %mm0, %esi
364:
365: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
366:
367:
368: C One limb left
369:
370: C eax scratch
371: C ebx scratch (nadj, q1)
372: C ecx
373: C edx scratch
374: C esi n10
375: C edi n2
376: C ebp divisor
377: C
378: C mm0 src limb, shifted
379: C mm7 rshift
380:
381: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
382: movl %edi, %eax C n2
383:
384: leal (%ebp,%esi), %ebx
385: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
386: sbbl $-1, %eax C n2+n1
387:
388: mull VAR_INVERSE C m*(n2+n1)
389:
390: movl VAR_NORM, %ecx C for final denorm
391:
392: C
393:
394: C
395:
396: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
397: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
398: movl %ebp, %eax C d
399:
400: C
401:
402: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
403:
404: sbbl $0, %ebx
405:
406: mull %ebx C (q1+1)*d
407:
408: movl SAVE_EBX, %ebx
409:
410: C
411:
412: C
413:
414: subl %eax, %esi
415:
416: movl %esi, %eax C remainder
417: movl SAVE_ESI, %esi
418:
419: sbbl %edx, %edi C n - (q1+1)*d
420: leal (%ebp,%eax), %edx
421: movl SAVE_EBP, %ebp
422:
423: cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
424: movl SAVE_EDI, %edi
425:
426: shrl %cl, %eax C denorm remainder
427: addl $STACK_SPACE, %esp
428: emms
429:
430: ret
431:
432:
433: C -----------------------------------------------------------------------------
434: C
435: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
436: C of q*d is simply -d and the remainder n-q*d = n10+d
437:
438: L(q1_ff):
439: C eax (divisor)
440: C ebx (q1+1 == 0)
441: C ecx src pointer
442: C edx
443: C esi n10
444: C edi (n2)
445: C ebp divisor
446:
447: movl VAR_SRC_STOP, %edx
448: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
449: psrlq %mm7, %mm0
450:
451: movd %mm0, %esi C next n10
452:
453: cmpl %ecx, %edx
454: jne L(inverse_top)
455: jmp L(inverse_loop_done)
456:
457: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>