Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/divrem_1.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
2: dnl
3: dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
30: C mp_srcptr src, mp_size_t size,
31: C mp_limb_t divisor);
32: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
33: C mp_srcptr src, mp_size_t size,
34: C mp_limb_t divisor, mp_limb_t carry);
35: C
36: C The method and nomenclature follow part 8 of "Division by Invariant
37: C Integers using Multiplication" by Granlund and Montgomery, reference in
38: C gmp.texi.
39: C
40: C The "and"s shown in the paper are done here with "cmov"s. "m" is written
41: C for m', and "d" for d_norm, which won't cause any confusion since it's
42: C only the normalized divisor that's of any use in the code. "b" is written
43: C for 2^N, the size of a limb, N being 32 here.
44: C
45: C mpn_divrem_1 avoids one division if the src high limb is less than the
46: C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
47: C circumstances that will be a very rare event.
48: C
49: C There's a small bias towards expecting xsize==0, by having code for
50: C xsize==0 in a straight line and xsize!=0 under forward jumps.
51:
52:
53: dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
54: dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
55: dnl
56: dnl The inverse takes about 50 cycles to calculate, but after that the
57: dnl multiply is 17 c/l versus division at 42 c/l.
58: dnl
59: dnl At 3 limbs the mul is a touch faster than div on the integer part, and
60: dnl even more so on the fractional part.
61:
62: deflit(MUL_THRESHOLD, 3)
63:
64:
C Stack frame layout.  Positive offsets are the cdecl stack parameters
C (relative to the return address, adjusted by FRAME); negative offsets
C are locals inside the STACK_SPACE area allocated below it: first the
C four saved callee-saved registers, then the divide-by-inverse working
C variables.
65: defframe(PARAM_CARRY, 24)
66: defframe(PARAM_DIVISOR,20)
67: defframe(PARAM_SIZE, 16)
68: defframe(PARAM_SRC, 12)
69: defframe(PARAM_XSIZE, 8)
70: defframe(PARAM_DST, 4)
71:
72: defframe(SAVE_EBX, -4)
73: defframe(SAVE_ESI, -8)
74: defframe(SAVE_EDI, -12)
75: defframe(SAVE_EBP, -16)
76:
C VAR_NORM is the normalization shift count l, VAR_INVERSE the m' value,
C VAR_SRC/VAR_DST the moving source/destination pointers in the main loop,
C VAR_DST_STOP the loop termination sentinel.
77: defframe(VAR_NORM, -20)
78: defframe(VAR_INVERSE, -24)
79: defframe(VAR_SRC, -28)
80: defframe(VAR_DST, -32)
81: defframe(VAR_DST_STOP,-36)
82:
C STACK_SPACE covers the four register saves plus the five variables above.
83: deflit(STACK_SPACE, 36)
84:
85: .text
86: ALIGN(32)
87:
C mpn_divrem_1c: same as mpn_divrem_1 but with a caller-supplied initial
C carry.  Loads the parameters into the register assignment expected at
C start_1c (carry in edx, size in ecx, xsize in ebx, divisor in ebp,
C src in esi, edi = &dst[xsize-1]) and tail-jumps into the shared body.
C Note no zero-carry check is done here, per the comments above.
88: PROLOGUE(mpn_divrem_1c)
89: deflit(`FRAME',0)
90: movl PARAM_CARRY, %edx
91: movl PARAM_SIZE, %ecx
92: subl $STACK_SPACE, %esp
93: deflit(`FRAME',STACK_SPACE)
94:
95: movl %ebx, SAVE_EBX
96: movl PARAM_XSIZE, %ebx
97:
98: movl %edi, SAVE_EDI
99: movl PARAM_DST, %edi
100:
101: movl %ebp, SAVE_EBP
102: movl PARAM_DIVISOR, %ebp
103:
104: movl %esi, SAVE_ESI
105: movl PARAM_SRC, %esi
106:
C edi = &dst[xsize-1], the top of the fraction part of the destination.
107: leal -4(%edi,%ebx,4), %edi
C LF() references the start_1c label inside mpn_divrem_1; the two
C routines share everything from there on, including the epilogue.
108: jmp LF(mpn_divrem_1,start_1c)
109:
110: EPILOGUE()
111:
112:
113: C offset 0x31, close enough to aligned
C mpn_divrem_1: divide {src,size} (plus xsize implicit zero fraction
C limbs) by divisor, storing size+xsize quotient limbs at dst and
C returning the remainder in eax.  Same register assignment as the
C mpn_divrem_1c stub above, but with an initial carry of zero and an
C optimization that skips one division when the src high limb is
C already less than the divisor.
114: PROLOGUE(mpn_divrem_1)
115: deflit(`FRAME',0)
116:
117: movl PARAM_SIZE, %ecx
118: movl $0, %edx C initial carry (if can't skip a div)
119: subl $STACK_SPACE, %esp
120: deflit(`FRAME',STACK_SPACE)
121:
122: movl %ebp, SAVE_EBP
123: movl PARAM_DIVISOR, %ebp
124:
125: movl %ebx, SAVE_EBX
126: movl PARAM_XSIZE, %ebx
127:
128: movl %esi, SAVE_ESI
129: movl PARAM_SRC, %esi
130: orl %ecx, %ecx
131:
132: movl %edi, SAVE_EDI
133: movl PARAM_DST, %edi
134: leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
135:
136: jz L(no_skip_div)
137: movl -4(%esi,%ecx,4), %eax C src high limb
138:
139: cmpl %ebp, %eax C one less div if high<divisor
140: jnb L(no_skip_div)
141:
C high < divisor: the top quotient limb is 0 and the high src limb
C becomes the initial carry, saving one full division step.
142: movl $0, (%edi,%ecx,4) C dst high limb
143: decl %ecx C size-1
144: movl %eax, %edx C src high limb as initial carry
145: L(no_skip_div):
146:
147:
148: L(start_1c):
149: C eax
150: C ebx xsize
151: C ecx size
152: C edx carry
153: C esi src
154: C edi &dst[xsize-1]
155: C ebp divisor
156:
157: leal (%ebx,%ecx), %eax C size+xsize
158: cmpl $MUL_THRESHOLD, %eax
159: jae L(mul_by_inverse)
160:
161:
162: C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
163: C It'd be possible to write them out without the looping, but no speedup
164: C would be expected.
165: C
166: C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
167: C integer part, but curiously not on the fractional part, where %ebp is a
168: C (fixed) couple of cycles faster.
169:
170: orl %ecx, %ecx
171: jz L(divide_no_integer)
172:
C Small-operand path: plain division.  divl computes edx:eax / divisor,
C leaving the quotient in eax and the new remainder (carry) in edx,
C which flows directly into the next iteration's high word.
173: L(divide_integer):
174: C eax scratch (quotient)
175: C ebx xsize
176: C ecx counter
177: C edx scratch (remainder)
178: C esi src
179: C edi &dst[xsize-1]
180: C ebp divisor
181:
182: movl -4(%esi,%ecx,4), %eax
183:
184: divl PARAM_DIVISOR
185:
186: movl %eax, (%edi,%ecx,4)
187: decl %ecx
188: jnz L(divide_integer)
189:
190:
191: L(divide_no_integer):
192: movl PARAM_DST, %edi
193: orl %ebx, %ebx
194: jnz L(divide_fraction)
195:
196: L(divide_done):
197: movl SAVE_ESI, %esi
198: movl SAVE_EDI, %edi
199: movl %edx, %eax
200:
201: movl SAVE_EBX, %ebx
202: movl SAVE_EBP, %ebp
203: addl $STACK_SPACE, %esp
204:
205: ret
206:
207:
C Fraction limbs: each dividend low limb is zero, so only the running
C remainder in edx carries between iterations.
208: L(divide_fraction):
209: C eax scratch (quotient)
210: C ebx counter
211: C ecx
212: C edx scratch (remainder)
213: C esi
214: C edi dst
215: C ebp divisor
216:
217: movl $0, %eax
218:
219: divl %ebp
220:
221: movl %eax, -4(%edi,%ebx,4)
222: decl %ebx
223: jnz L(divide_fraction)
224:
225: jmp L(divide_done)
226:
227:
228:
229: C -----------------------------------------------------------------------------
230:
C Large-operand path: compute the normalized divisor d = divisor<<l and
C the inverse m = floor((b*(b-d)-1)/d), per the Granlund/Montgomery
C method cited in the header, then dispatch on size (0, 1, 2-or-more).
231: L(mul_by_inverse):
232: C eax
233: C ebx xsize
234: C ecx size
235: C edx carry
236: C esi src
237: C edi &dst[xsize-1]
238: C ebp divisor
239:
240: bsrl %ebp, %eax C 31-l
241:
242: leal 12(%edi), %ebx
243: leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
244:
245: movl %edi, VAR_DST
246: movl %ebx, VAR_DST_STOP
247:
248: movl %ecx, %ebx C size
249: movl $31, %ecx
250:
251: movl %edx, %edi C carry
252: movl $-1, %edx
253:
254: C
255:
256: xorl %eax, %ecx C l
257: incl %eax C 32-l
258:
259: shll %cl, %ebp C d normalized
260: movl %ecx, VAR_NORM
261:
C mm7 holds the right-shift count 32-l, used by psrlq to align each
C source qword read in the main loop.
262: movd %eax, %mm7
263:
264: movl $-1, %eax
265: subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
266:
267: divl %ebp C floor (b*(b-d)-1) / d
268:
269: orl %ebx, %ebx C size
270: movl %eax, VAR_INVERSE
271: leal -12(%esi,%ebx,4), %eax C &src[size-3]
272:
273: jz L(start_zero)
274: movl %eax, VAR_SRC
275: cmpl $1, %ebx
276:
277: movl 8(%eax), %esi C src high limb
278: jz L(start_one)
279:
280: L(start_two_or_more):
281: movl 4(%eax), %edx C src second highest limb
282:
283: shldl( %cl, %esi, %edi) C n2 = carry,high << l
284:
285: shldl( %cl, %edx, %esi) C n10 = high,second << l
286:
287: cmpl $2, %ebx
288: je L(integer_two_left)
289: jmp L(integer_top)
290:
291:
292: L(start_one):
293: shldl( %cl, %esi, %edi) C n2 = carry,high << l
294:
295: shll %cl, %esi C n10 = high << l
296: movl %eax, VAR_SRC
297: jmp L(integer_one_left)
298:
299:
300: L(start_zero):
301: shll %cl, %edi C n2 = carry << l
302: movl $0, %esi C n10 = 0
303:
304: C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
305: C must have xsize!=0
306: jmp L(fraction_some)
307:
308:
309:
310: C -----------------------------------------------------------------------------
311: C
312: C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
313: C execution. The instruction scheduling is important, with various
314: C apparently equivalent forms running 1 to 5 cycles slower.
315: C
316: C A lower bound for the time would seem to be 16 cycles, based on the
317: C following successive dependencies.
318: C
319: C cycles
320: C n2+n1 1
321: C mul 6
322: C q1+1 1
323: C mul 6
324: C sub 1
325: C addback 1
326: C ---
327: C 16
328: C
329: C This chain is what the loop has already, but 16 cycles isn't achieved.
330: C K7 has enough decode, and probably enough execute (depending maybe on what
331: C a mul actually consumes), but nothing running under 17 has been found.
332: C
333: C In theory n2+n1 could be done in the sub and addback stages (by
334: C calculating both n2 and n2+n1 there), but lack of registers makes this an
335: C unlikely proposition.
336: C
337: C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
338: C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
339: C chain, and nothing better than 18 cycles has been found when using it.
340: C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
341: C be an extremely rare event.
342: C
343: C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but
344: C if some special data is coming out with this always, the q1_ff special
345: C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
346: C induce the q1_ff case, for speed measurements or testing. Note that
347: C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
348: C
349: C The instruction groupings and empty comments show the cycles for a naive
350: C in-order view of the code (conveniently ignoring the load latency on
351: C VAR_INVERSE). This shows some of where the time is going, but is nonsense
352: C to the extent that out-of-order execution rearranges it. In this case
353: C there's 19 cycles shown, but it executes at 17.
354:
355: ALIGN(16)
356: L(integer_top):
357: C eax scratch
358: C ebx scratch (nadj, q1)
359: C ecx scratch (src, dst)
360: C edx scratch
361: C esi n10
362: C edi n2
363: C ebp divisor
364: C
365: C mm0 scratch (src qword)
366: C mm7 rshift for normalization
367:
368: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
369: movl %edi, %eax C n2
370: movl VAR_SRC, %ecx
371:
372: leal (%ebp,%esi), %ebx
373: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
374: sbbl $-1, %eax C n2+n1
375:
376: mull VAR_INVERSE C m*(n2+n1)
377:
378: movq (%ecx), %mm0 C next limb and the one below it
379: subl $4, %ecx
380:
381: movl %ecx, VAR_SRC
382:
383: C
384:
385: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
386: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
387: movl %ebp, %eax C d
388:
389: C
390:
391: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
392: jz L(q1_ff)
393: movl VAR_DST, %ecx
394:
395: mull %ebx C (q1+1)*d
396:
397: psrlq %mm7, %mm0
398:
399: leal -4(%ecx), %ecx
400:
401: C
402:
403: subl %eax, %esi
404: movl VAR_DST_STOP, %eax
405:
406: C
407:
408: sbbl %edx, %edi C n - (q1+1)*d
409: movl %esi, %edi C remainder -> n2
410: leal (%ebp,%esi), %edx
411:
412: movd %mm0, %esi
413:
414: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
415: sbbl $0, %ebx C q
416: cmpl %eax, %ecx
417:
418: movl %ebx, (%ecx)
419: movl %ecx, VAR_DST
420: jne L(integer_top)
421:
422:
423: L(integer_loop_done):
424:
425:
426: C -----------------------------------------------------------------------------
427: C
428: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
429: C q1_ff special case. This make the code a bit smaller and simpler, and
430: C costs only 1 cycle (each).
431:
C Second-to-last integer limb, peeled out of the loop above: the next
C source read is a single dword (the low limb) rather than a qword.
432: L(integer_two_left):
433: C eax scratch
434: C ebx scratch (nadj, q1)
435: C ecx scratch (src, dst)
436: C edx scratch
437: C esi n10
438: C edi n2
439: C ebp divisor
440: C
441: C mm0 src limb, shifted
442: C mm7 rshift
443:
444: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
445: movl %edi, %eax C n2
446: movl PARAM_SRC, %ecx
447:
448: leal (%ebp,%esi), %ebx
449: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
450: sbbl $-1, %eax C n2+n1
451:
452: mull VAR_INVERSE C m*(n2+n1)
453:
454: movd (%ecx), %mm0 C src low limb
455:
456: movl VAR_DST_STOP, %ecx
457:
458: C
459:
460: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
461: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
462: movl %ebp, %eax C d
463:
464: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
465:
466: sbbl $0, %ebx
467:
468: mull %ebx C (q1+1)*d
469:
470: psllq $32, %mm0
471:
472: psrlq %mm7, %mm0
473:
474: C
475:
476: subl %eax, %esi
477:
478: C
479:
480: sbbl %edx, %edi C n - (q1+1)*d
481: movl %esi, %edi C remainder -> n2
482: leal (%ebp,%esi), %edx
483:
484: movd %mm0, %esi
485:
486: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
487: sbbl $0, %ebx C q
488:
489: movl %ebx, -4(%ecx)
490:
491:
492: C -----------------------------------------------------------------------------
C Last integer limb; falls through from integer_two_left, and is also
C the entry point for size==1 via start_one.
493: L(integer_one_left):
494: C eax scratch
495: C ebx scratch (nadj, q1)
496: C ecx dst
497: C edx scratch
498: C esi n10
499: C edi n2
500: C ebp divisor
501: C
502: C mm0 src limb, shifted
503: C mm7 rshift
504:
505: movl VAR_DST_STOP, %ecx
506: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
507: movl %edi, %eax C n2
508:
509: leal (%ebp,%esi), %ebx
510: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
511: sbbl $-1, %eax C n2+n1
512:
513: mull VAR_INVERSE C m*(n2+n1)
514:
515: C
516:
517: C
518:
519: C
520:
521: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
522: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
523: movl %ebp, %eax C d
524:
525: C
526:
527: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
528:
529: sbbl $0, %ebx C q1 if q1+1 overflowed
530:
531: mull %ebx
532:
533: C
534:
535: C
536:
537: C
538:
539: subl %eax, %esi
540:
541: C
542:
543: sbbl %edx, %edi C n - (q1+1)*d
544: movl %esi, %edi C remainder -> n2
545: leal (%ebp,%esi), %edx
546:
547: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
548: sbbl $0, %ebx C q
549:
550: movl %ebx, -8(%ecx)
551: subl $8, %ecx
552:
553:
554:
C Integer part done; edi holds the remainder n2, still scaled by the
C normalization shift (undone at fraction_done below).
555: L(integer_none):
556: cmpl $0, PARAM_XSIZE
557: jne L(fraction_some)
558:
559: movl %edi, %eax
560: L(fraction_done):
C The remainder was computed against the normalized divisor d<<l, so
C shift it back down by l before returning.
561: movl VAR_NORM, %ecx
562: movl SAVE_EBP, %ebp
563:
564: movl SAVE_EDI, %edi
565: movl SAVE_ESI, %esi
566:
567: movl SAVE_EBX, %ebx
568: addl $STACK_SPACE, %esp
569:
570: shrl %cl, %eax
571: emms
572:
573: ret
574:
575:
576: C -----------------------------------------------------------------------------
577: C
578: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
579: C of q*d is simply -d and the remainder n-q*d = n10+d
580:
581: L(q1_ff):
582: C eax (divisor)
583: C ebx (q1+1 == 0)
584: C ecx
585: C edx
586: C esi n10
587: C edi n2
588: C ebp divisor
589:
590: movl VAR_DST, %ecx
591: movl VAR_DST_STOP, %edx
592: subl $4, %ecx
593:
594: psrlq %mm7, %mm0
595: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
596: movl %ecx, VAR_DST
597:
598: movd %mm0, %esi C next n10
599:
600: movl $-1, (%ecx)
601: cmpl %ecx, %edx
602: jne L(integer_top)
603:
604: jmp L(integer_loop_done)
605:
606:
607:
608: C -----------------------------------------------------------------------------
609: C
610: C Being the fractional part, the "source" limbs are all zero, meaning
611: C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
612: C
613: C The loop runs at 15 cycles. The dependent chain is the same as the
614: C general case above, but without the n2+n1 stage (due to n1==0), so 15
615: C would seem to be the lower bound.
616: C
617: C A not entirely obvious simplification is that q1+1 never overflows a limb,
618: C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
619: C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
620: C rnd() means rounding down to a multiple of d.
621: C
622: C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
623: C = m*d + b*d - m - b
624: C = floor((b(b-d)-1)/d)*d + b*d - m - b
625: C = rnd(b(b-d)-1) + b*d - m - b
626: C = rnd(b(b-d)-1 + b*d) - m - b
627: C = rnd(b*b-1) - m - b
628: C <= (b-2)*b
629: C
630: C Unchanged from the general case is that the final quotient limb q can be
631: C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
632: C equation 8.4 of the paper which simplifies as follows when n1==0 and
633: C n0==0.
634: C
635: C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
636: C
637: C As before, the instruction groupings and empty comments show a naive
638: C in-order view of the code, which is made a nonsense by out of order
639: C execution. There's 17 cycles shown, but it executes at 15.
640: C
641: C Rotating the store q and remainder->n2 instructions up to the top of the
642: C loop gets the run time down from 16 to 15.
643:
644: ALIGN(16)
645: L(fraction_some):
646: C eax
647: C ebx
648: C ecx
649: C edx
650: C esi
651: C edi carry
652: C ebp divisor
653:
654: movl PARAM_DST, %esi
655: movl VAR_DST_STOP, %ecx
656: movl %edi, %eax
657:
658: subl $8, %ecx
659:
660: jmp L(fraction_entry)
661:
662:
663: ALIGN(16)
664: L(fraction_top):
665: C eax n2 carry, then scratch
666: C ebx scratch (nadj, q1)
667: C ecx dst, decrementing
668: C edx scratch
669: C esi dst stop point
670: C edi (will be n2)
671: C ebp divisor
672:
673: movl %ebx, (%ecx) C previous q
674: movl %eax, %edi C remainder->n2
675:
676: L(fraction_entry):
677: mull VAR_INVERSE C m*n2
678:
679: movl %ebp, %eax C d
680: subl $4, %ecx C dst
681: leal 1(%edi), %ebx
682:
683: C
684:
685: C
686:
687: C
688:
689: C
690:
691: addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
692:
693: mull %ebx C (q1+1)*d
694:
695: C
696:
697: C
698:
699: C
700:
701: negl %eax C low of n - (q1+1)*d
702:
703: C
704:
705: sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
706: leal (%ebp,%eax), %edx
707:
708: cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
709: sbbl $0, %ebx C q
710: cmpl %esi, %ecx
711:
712: jne L(fraction_top)
713:
714:
715: movl %ebx, (%ecx)
716: jmp L(fraction_done)
717:
718: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>