Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/mod_1.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K7: 17.0 cycles/limb.
! 26:
! 27:
1.1 maekawa 28: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
29: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
30: C mp_limb_t carry);
1.1.1.2 ! ohara 31: C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 32: C mp_limb_t inverse);
1.1 maekawa 33: C
34: C The code here is the same as mpn_divrem_1, but with the quotient
35: C discarded. See mpn/x86/k7/mmx/divrem_1.c for some comments.
36:
37:
38: dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
39: dnl used, rather than plain "divl"s. Minimum value 2.
40: dnl
41: dnl The inverse takes about 50 cycles to calculate, but after that the
42: dnl multiply is 17 c/l versus division at 41 c/l.
43: dnl
44: dnl Using mul or div is about the same speed at 3 limbs, so the threshold
45: dnl is set to 4 to get the smaller div code used at 3.
46:
47: deflit(MUL_THRESHOLD, 4)
48:
49:
1.1.1.2 ! ohara 50: defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1
! 51: defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
1.1 maekawa 52: defframe(PARAM_DIVISOR,12)
53: defframe(PARAM_SIZE, 8)
54: defframe(PARAM_SRC, 4)
55:
56: defframe(SAVE_EBX, -4)
57: defframe(SAVE_ESI, -8)
58: defframe(SAVE_EDI, -12)
59: defframe(SAVE_EBP, -16)
60:
61: defframe(VAR_NORM, -20)
62: defframe(VAR_INVERSE, -24)
63: defframe(VAR_SRC_STOP,-28)
64:
65: deflit(STACK_SPACE, 28)
66:
1.1.1.2 ! ohara 67: TEXT
! 68:
1.1 maekawa 69: ALIGN(32)
1.1.1.2 ! ohara 70: PROLOGUE(mpn_preinv_mod_1)
! 71: deflit(`FRAME',0)
! 72: movl PARAM_SRC, %ecx
! 73: movl PARAM_SIZE, %eax
! 74: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
! 75:
! 76: movl %ebp, SAVE_EBP
! 77: movl PARAM_DIVISOR, %ebp
! 78:
! 79: movl %edi, SAVE_EDI
! 80: movl PARAM_INVERSE, %edx
! 81:
! 82: movl %esi, SAVE_ESI
! 83: movl -4(%ecx,%eax,4), %edi C src high limb
! 84: leal -16(%ecx,%eax,4), %ecx C &src[size-4]
! 85:
! 86: movl %ebx, SAVE_EBX
! 87: movl PARAM_INVERSE, %edx
! 88:
! 89: movl $0, VAR_NORM C l==0
! 90:
! 91: movl %edi, %esi
! 92: subl %ebp, %edi C high-divisor
! 93:
! 94: cmovc( %esi, %edi) C restore if underflow
! 95: decl %eax
! 96: jz L(done_edi) C size==1, high-divisor only
! 97:
! 98: movl 8(%ecx), %esi C src second high limb
! 99: movl %edx, VAR_INVERSE
! 100:
! 101: movl $32, %ebx C 32-l
! 102: decl %eax
! 103: jz L(inverse_one_left) C size==2, one divide
1.1 maekawa 104:
1.1.1.2 ! ohara 105: movd %ebx, %mm7 C 32-l
! 106: decl %eax
! 107: jz L(inverse_two_left) C size==3, two divides
! 108:
! 109: jmp L(inverse_top) C size>=4
! 110:
! 111:
! 112: L(done_edi):
! 113: movl SAVE_ESI, %esi
! 114: movl SAVE_EBP, %ebp
! 115: movl %edi, %eax
! 116:
! 117: movl SAVE_EDI, %edi
! 118: addl $STACK_SPACE, %esp
! 119:
! 120: ret
! 121:
! 122: EPILOGUE()
! 123:
! 124:
! 125: ALIGN(32)
1.1 maekawa 126: PROLOGUE(mpn_mod_1c)
127: deflit(`FRAME',0)
128: movl PARAM_CARRY, %edx
129: movl PARAM_SIZE, %ecx
130: subl $STACK_SPACE, %esp
131: deflit(`FRAME',STACK_SPACE)
132:
133: movl %ebp, SAVE_EBP
134: movl PARAM_DIVISOR, %ebp
135:
136: movl %esi, SAVE_ESI
137: movl PARAM_SRC, %esi
1.1.1.2 ! ohara 138: jmp L(start_1c)
1.1 maekawa 139:
140: EPILOGUE()
141:
142:
143: ALIGN(32)
144: PROLOGUE(mpn_mod_1)
145: deflit(`FRAME',0)
146:
147: movl PARAM_SIZE, %ecx
148: movl $0, %edx C initial carry (if can't skip a div)
149: subl $STACK_SPACE, %esp
150: deflit(`FRAME',STACK_SPACE)
151:
152: movl %esi, SAVE_ESI
153: movl PARAM_SRC, %esi
154:
155: movl %ebp, SAVE_EBP
156: movl PARAM_DIVISOR, %ebp
157:
158: orl %ecx, %ecx
159: jz L(divide_done)
160:
161: movl -4(%esi,%ecx,4), %eax C src high limb
162:
163: cmpl %ebp, %eax C carry flag if high<divisor
164:
165: cmovc( %eax, %edx) C src high limb as initial carry
166: sbbl $0, %ecx C size-1 to skip one div
167: jz L(divide_done)
168:
169:
170: ALIGN(16)
171: L(start_1c):
172: C eax
173: C ebx
174: C ecx size
175: C edx carry
176: C esi src
177: C edi
178: C ebp divisor
179:
180: cmpl $MUL_THRESHOLD, %ecx
181: jae L(mul_by_inverse)
182:
183:
184:
185: C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
186: C but it's already fast and compact, and there's nothing to gain by
187: C expanding it out.
188: C
189: C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
190:
191: orl %ecx, %ecx
192: jz L(divide_done)
193:
194:
195: L(divide_top):
196: C eax scratch (quotient)
197: C ebx
198: C ecx counter, limbs, decrementing
199: C edx scratch (remainder)
200: C esi src
201: C edi
202: C ebp
203:
204: movl -4(%esi,%ecx,4), %eax
205:
206: divl PARAM_DIVISOR
207:
208: decl %ecx
209: jnz L(divide_top)
210:
211:
212: L(divide_done):
213: movl SAVE_ESI, %esi
214: movl SAVE_EBP, %ebp
215: addl $STACK_SPACE, %esp
216:
217: movl %edx, %eax
218:
219: ret
220:
221:
222:
223: C -----------------------------------------------------------------------------
224:
225: L(mul_by_inverse):
226: C eax
227: C ebx
228: C ecx size
229: C edx carry
230: C esi src
231: C edi
232: C ebp divisor
233:
234: bsrl %ebp, %eax C 31-l
235:
236: movl %ebx, SAVE_EBX
1.1.1.2 ! ohara 237: movl %ecx, %ebx C size
1.1 maekawa 238:
239: movl %edi, SAVE_EDI
240: movl $31, %ecx
241:
242: movl %edx, %edi C carry
243: movl $-1, %edx
244:
245: C
246:
247: xorl %eax, %ecx C l
248: incl %eax C 32-l
249:
250: shll %cl, %ebp C d normalized
251: movl %ecx, VAR_NORM
252:
1.1.1.2 ! ohara 253: movd %eax, %mm7 C 32-l
1.1 maekawa 254:
255: movl $-1, %eax
256: subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
257:
258: divl %ebp C floor (b*(b-d)-1) / d
259:
260: C
261:
262: movl %eax, VAR_INVERSE
263: leal -12(%esi,%ebx,4), %eax C &src[size-3]
264:
265: movl 8(%eax), %esi C src high limb
266: movl 4(%eax), %edx C src second highest limb
267:
268: shldl( %cl, %esi, %edi) C n2 = carry,high << l
269:
270: shldl( %cl, %edx, %esi) C n10 = high,second << l
271:
272: movl %eax, %ecx C &src[size-3]
273:
274:
275: ifelse(MUL_THRESHOLD,2,`
276: cmpl $2, %ebx
277: je L(inverse_two_left)
278: ')
279:
280:
281: C The dependent chain here is the same as in mpn_divrem_1, but a few
282: C instructions are saved by not needing to store the quotient limbs.
283: C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
284: C
285: C There's four dummy instructions in the loop, all of which are necessary
286: C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
287: C or changed from load to store or vice versa. They're not completely
288: C random, since they correspond to what mpn_divrem_1 has, but there's no
289: C obvious reason why they're necessary. Presumably they induce something
290: C good in the out of order execution, perhaps through some load/store
291: C ordering and/or decoding effects.
292: C
293: C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
294: C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at
295: C about 13.5 c/l.
296:
297: ALIGN(32)
298: L(inverse_top):
299: C eax scratch
300: C ebx scratch (nadj, q1)
301: C ecx src pointer, decrementing
302: C edx scratch
303: C esi n10
304: C edi n2
305: C ebp divisor
306: C
307: C mm0 scratch (src qword)
308: C mm7 rshift for normalization
309:
310: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
311: movl %edi, %eax C n2
312: movl PARAM_SIZE, %ebx C dummy
313:
314: leal (%ebp,%esi), %ebx
315: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
316: sbbl $-1, %eax C n2+n1
317:
318: mull VAR_INVERSE C m*(n2+n1)
319:
320: movq (%ecx), %mm0 C next src limb and the one below it
321: subl $4, %ecx
322:
323: movl %ecx, PARAM_SIZE C dummy
324:
325: C
326:
327: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara 328: leal 1(%edi), %ebx C n2+1
1.1 maekawa 329: movl %ebp, %eax C d
330:
331: C
332:
333: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
334: jz L(q1_ff)
335: nop C dummy
336:
337: mull %ebx C (q1+1)*d
338:
339: psrlq %mm7, %mm0
1.1.1.2 ! ohara 340: leal (%ecx), %ecx C dummy
1.1 maekawa 341:
342: C
343:
344: C
345:
1.1.1.2 ! ohara 346: subl %eax, %esi C low n - (q1+1)*d
! 347: movl PARAM_SRC, %eax
1.1 maekawa 348:
349: C
350:
1.1.1.2 ! ohara 351: sbbl %edx, %edi C high n - (q1+1)*d, 0 or -1
1.1 maekawa 352: movl %esi, %edi C remainder -> n2
353: leal (%ebp,%esi), %edx
354:
355: movd %mm0, %esi
356:
357: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
358: cmpl %eax, %ecx
1.1.1.2 ! ohara 359: jae L(inverse_top)
1.1 maekawa 360:
361:
362: L(inverse_loop_done):
363:
364:
365: C -----------------------------------------------------------------------------
366:
367: L(inverse_two_left):
368: C eax scratch
369: C ebx scratch (nadj, q1)
370: C ecx &src[-1]
371: C edx scratch
372: C esi n10
373: C edi n2
374: C ebp divisor
375: C
376: C mm0 scratch (src dword)
377: C mm7 rshift
378:
379: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
380: movl %edi, %eax C n2
381:
382: leal (%ebp,%esi), %ebx
383: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
384: sbbl $-1, %eax C n2+n1
385:
386: mull VAR_INVERSE C m*(n2+n1)
387:
388: movd 4(%ecx), %mm0 C src low limb
389:
390: C
391:
392: C
393:
394: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara 395: leal 1(%edi), %ebx C n2+1
1.1 maekawa 396: movl %ebp, %eax C d
397:
398: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
399:
400: sbbl $0, %ebx
401:
402: mull %ebx C (q1+1)*d
403:
404: psllq $32, %mm0
405:
406: psrlq %mm7, %mm0
407:
408: C
409:
410: subl %eax, %esi
411:
412: C
413:
414: sbbl %edx, %edi C n - (q1+1)*d
415: movl %esi, %edi C remainder -> n2
416: leal (%ebp,%esi), %edx
417:
418: movd %mm0, %esi
419:
420: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
421:
422:
1.1.1.2 ! ohara 423: L(inverse_one_left):
1.1 maekawa 424: C eax scratch
425: C ebx scratch (nadj, q1)
426: C ecx
427: C edx scratch
428: C esi n10
429: C edi n2
430: C ebp divisor
431: C
432: C mm0 src limb, shifted
433: C mm7 rshift
434:
435: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
436: movl %edi, %eax C n2
437:
438: leal (%ebp,%esi), %ebx
439: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
440: sbbl $-1, %eax C n2+n1
441:
442: mull VAR_INVERSE C m*(n2+n1)
443:
444: movl VAR_NORM, %ecx C for final denorm
445:
446: C
447:
448: C
449:
450: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara 451: leal 1(%edi), %ebx C n2+1
1.1 maekawa 452: movl %ebp, %eax C d
453:
454: C
455:
456: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
457:
458: sbbl $0, %ebx
459:
460: mull %ebx C (q1+1)*d
461:
462: movl SAVE_EBX, %ebx
463:
464: C
465:
466: C
467:
468: subl %eax, %esi
469:
470: movl %esi, %eax C remainder
471: movl SAVE_ESI, %esi
472:
473: sbbl %edx, %edi C n - (q1+1)*d
474: leal (%ebp,%eax), %edx
475: movl SAVE_EBP, %ebp
476:
477: cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
478: movl SAVE_EDI, %edi
479:
480: shrl %cl, %eax C denorm remainder
481: addl $STACK_SPACE, %esp
482: emms
483:
484: ret
485:
486:
487: C -----------------------------------------------------------------------------
488: C
489: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
490: C of q*d is simply -d and the remainder n-q*d = n10+d
491:
492: L(q1_ff):
493: C eax (divisor)
494: C ebx (q1+1 == 0)
495: C ecx src pointer
496: C edx
497: C esi n10
498: C edi (n2)
499: C ebp divisor
500:
1.1.1.2 ! ohara 501: movl PARAM_SRC, %edx
1.1 maekawa 502: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
503: psrlq %mm7, %mm0
504:
505: movd %mm0, %esi C next n10
506:
1.1.1.2 ! ohara 507: cmpl %edx, %ecx
! 508: jae L(inverse_top)
1.1 maekawa 509: jmp L(inverse_loop_done)
510:
511: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>