dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
dnl
dnl  P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.


dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.


include(`../config.m4')


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C
C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
C see that file for some comments.  It's likely what's here can be improved.
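C
C Roughly, in C terms, both routines compute the following, with carry==0
C for plain mpn_divrem_1, udiv_qrnnd being the division macro from
C longlong.h, and carry<divisor assumed (an illustrative sketch, not the
C actual code):
C
C	mp_size_t  i;
C	for (i = size-1; i >= 0; i--)    /* integer part, high limb down */
C	  udiv_qrnnd (dst[xsize+i], carry, carry, src[i], divisor);
C	for (i = xsize-1; i >= 0; i--)   /* fraction part, zero limbs in */
C	  udiv_qrnnd (dst[i], carry, carry, 0, divisor);
C	return carry;                    /* remainder */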


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts mean that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to suit the integer part.  At 4 limbs the div
dnl  and mul are about the same speed there, but on the fraction part the
dnl  mul is much faster.

deflit(MUL_THRESHOLD, 4)
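
dnl  Roughly, in C terms, the dispatch done at L(start_1c) below amounts to
dnl
dnl	if (size + xsize >= MUL_THRESHOLD)
dnl	  /* ... multiply by inverse ... */
dnl	else
dnl	  /* ... plain divl loop ... */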


defframe(PARAM_CARRY,  24)
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)

	.text
	ALIGN(16)

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	leal	-4(%edi,%ebx,4), %edi
	jmp	LF(mpn_divrem_1,start_1c)

EPILOGUE()


C offset 0x31, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	orl	%ecx, %ecx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jz	L(no_skip_div)

	movl	-4(%esi,%ecx,4), %eax	C src high limb
	cmpl	%ebp, %eax		C one less div if high<divisor
	jnb	L(no_skip_div)

	movl	$0, (%edi,%ecx,4)	C dst high limb
	decl	%ecx			C size-1
	movl	%eax, %edx		C src high limb as initial carry
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)

	orl	%ecx, %ecx
	jz	L(divide_no_integer)

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	scratch (remainder)
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax

	divl	%ebp

	movl	%eax, (%edi,%ecx,4)
	decl	%ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	movl	PARAM_DST, %edi
	orl	%ebx, %ebx
	jnz	L(divide_fraction)

L(divide_done):
	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	movl	%edx, %eax

	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	scratch (remainder)
	C esi
	C edi	dst
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp

	movl	%eax, -4(%edi,%ebx,4)
	decl	%ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

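C The divisor is normalized so its high bit is set, and VAR_INVERSE is then
C the usual multiply-by-inverse constant m = floor((b*(b-d)-1)/d), with
C b=2^32 and d the normalized divisor.  An illustrative C sketch of the
C setup done below, using the count_leading_zeros and udiv_qrnnd macros
C from longlong.h (l, d, m, dummy merely illustrative names):
C
C	count_leading_zeros (l, divisor);    /* l = VAR_NORM */
C	d = divisor << l;                    /* normalized divisor */
C	udiv_qrnnd (m, dummy, -d-1, -1, d);  /* m = VAR_INVERSE */
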
L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	12(%edi), %ebx

	movl	%ebx, VAR_DST_STOP
	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]

	movl	%edi, VAR_DST
	movl	%ecx, %ebx		C size

	bsrl	%ebp, %ecx		C 31-l
	movl	%edx, %edi		C carry

	leal	1(%ecx), %eax		C 32-l
	xorl	$31, %ecx		C l

	movl	%ecx, VAR_NORM
	movl	$-1, %edx

	shll	%cl, %ebp		C d normalized
	movd	%eax, %mm7

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d

	movl	%eax, VAR_INVERSE
	orl	%ebx, %ebx		C size
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	movl	%eax, VAR_SRC
	jz	L(start_zero)

	movl	8(%eax), %esi		C src high limb
	cmpl	$1, %ebx
	jz	L(start_one)

L(start_two_or_more):
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	cmpl	$2, %ebx
	je	L(integer_two_left)
	jmp	L(integer_top)


L(start_one):
	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shll	%cl, %esi		C n10 = high << l
	jmp	L(integer_one_left)


L(start_zero):
	shll	%cl, %edi		C n2 = carry << l
	movl	$0, %esi		C n10 = 0

	C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 we
	C must have xsize!=0
	jmp	L(fraction_some)



C -----------------------------------------------------------------------------
C
C This loop runs at about 25 cycles per limb, which is probably sub-optimal,
C and certainly more than the dependent chain would suggest.  A better loop,
C or a better rough analysis of what's possible, would be welcomed.
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C                      uops
C     n2+n1            1   (addl)
C     mul              5
C     q1+1             3   (addl/adcl)
C     mul              5
C     sub              3   (subl/sbbl)
C     addback          2   (cmov)
C                     ---
C                      19
C
C Lack of registers hinders explicit scheduling and it might be that the
C normal out of order execution isn't able to hide enough under the mul
C latencies.
C
C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
C combination was tried for the addback (despite the fact it would lengthen
C the dependent chain) but found to be no faster.
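C
C Illustratively, in C terms each iteration computes the following, with m
C the inverse VAR_INVERSE, d the normalized divisor, n2 and n10 as in the
C register comments, umul_ppmm from longlong.h, and hi/lo/thi/tlo/q1/r/q
C merely illustrative names:
C
C	mask = (mp_limb_signed_t) n10 >> 31;   /* -n1, all ones or zero */
C	nadj = n10 + (mask & d);               /* ignoring any overflow */
C	umul_ppmm (hi, lo, m, n2 - mask);      /* m*(n2+n1) */
C	q1 = n2 + hi + ((lo + nadj) < lo);     /* carry from the low add */
C	/* q1 == 0xFFFFFFFF is the L(q1_ff) special case below */
C	umul_ppmm (thi, tlo, q1 + 1, d);       /* (q1+1)*d */
C	r = n10 - tlo;                         /* low of n - (q1+1)*d */
C	if (thi + (tlo > n10) > n2)            /* borrow, q1+1 too big */
C	  { q = q1;  r += d; }                 /* addback */
C	else
C	  q = q1 + 1;
C	n2 = r;                                /* remainder -> next n2 */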


	ALIGN(16)
L(integer_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	d
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax		C -n1
	movl	VAR_SRC, %ecx

	andl	%eax, %ebx		C -n1 & d
	negl	%eax			C n1

	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax		C n2+n1
	movq	(%ecx), %mm0		C next src limb and the one below it

	mull	VAR_INVERSE		C m*(n2+n1)

	subl	$4, %ecx

	movl	%ecx, VAR_SRC

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	movl	%ebp, %eax		C d
	leal	1(%edi), %ebx		C n2+1

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)

	mull	%ebx			C (q1+1)*d

	movl	VAR_DST, %ecx
	psrlq	%mm7, %mm0

	C

	C

	C

	subl	%eax, %esi
	movl	VAR_DST_STOP, %eax

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	movd	%mm0, %esi

	sbbl	$0, %ebx		C q
	subl	$4, %ecx

	movl	%ebx, (%ecx)
	cmpl	%eax, %ecx

	movl	%ecx, VAR_DST
	jne	L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C to a q1_ff special case.  This makes the code a bit smaller and simpler,
C and costs only 2 cycles (each).

L(integer_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	src limb, shifted
	C mm7	rshift


	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax		C -n1
	movl	PARAM_SRC, %ecx

	andl	%eax, %ebx		C -n1 & d
	negl	%eax			C n1

	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movd	(%ecx), %mm0		C src low limb

	movl	VAR_DST_STOP, %ecx

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx

	mull	%ebx			C (q1+1)*d

	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	C

	C

	subl	%eax, %esi

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	movd	%mm0, %esi

	sbbl	$0, %ebx		C q

	movl	%ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	src limb, shifted
	C mm7	rshift


	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax		C -n1
	movl	VAR_DST_STOP, %ecx

	andl	%eax, %ebx		C -n1 & d
	negl	%eax			C n1

	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	C

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C q1 if q1+1 overflowed

	mull	%ebx

	C

	C

	C

	C

	subl	%eax, %esi
	movl	PARAM_XSIZE, %eax

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1

	sbbl	$0, %ebx		C q

	movl	%ebx, -8(%ecx)
	subl	$8, %ecx



	orl	%eax, %eax		C xsize
	jnz	L(fraction_some)

	movl	%edi, %eax
L(fraction_done):
	movl	VAR_NORM, %ecx
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi

	movl	SAVE_ESI, %esi

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	shrl	%cl, %eax
	emms

	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d
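C
C That is, q*d = (2^32-1)*d = (d<<32) - d, so subtracting its low dword
C from n10 amounts to adding d, which the leal below forms directly; the
C high dwords cancel because the remainder is known to fit in one limb.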

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx
	C edx
	C esi	n10
	C edi	n2
	C ebp	divisor

	movl	VAR_DST, %ecx
	movl	VAR_DST_STOP, %edx
	subl	$4, %ecx

	movl	%ecx, VAR_DST
	psrlq	%mm7, %mm0
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2

	movl	$-1, (%ecx)
	movd	%mm0, %esi		C next n10

	cmpl	%ecx, %edx
	jne	L(integer_top)

	jmp	L(integer_loop_done)



C -----------------------------------------------------------------------------
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C                      uops
C     mul              5
C     q1+1             1   (addl)
C     mul              5
C     sub              3   (negl/sbbl)
C     addback          2   (cmov)
C                     ---
C                      16
C
C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
C the addback was found to be a touch slower.
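C
C Since n10 is always zero in the fraction part, the C sketch of an
C iteration simplifies from the integer loop above (same illustrative
C names, umul_ppmm again from longlong.h):
C
C	umul_ppmm (hi, lo, m, n2);          /* n1 = 0 and nadj = 0 here */
C	q1 = n2 + hi;                       /* lo + nadj can't carry */
C	umul_ppmm (thi, tlo, q1 + 1, d);
C	r = -tlo;                           /* low of n - (q1+1)*d */
C	if (thi + (tlo != 0) > n2)          /* borrow, q1+1 too big */
C	  { q = q1;  r += d; }              /* addback */
C	else
C	  q = q1 + 1;
C	n2 = r;                             /* remainder -> next n2 */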


	ALIGN(16)
L(fraction_some):
	C eax
	C ebx
	C ecx
	C edx
	C esi
	C edi	carry
	C ebp	divisor

	movl	PARAM_DST, %esi
	movl	VAR_DST_STOP, %ecx
	movl	%edi, %eax

	subl	$8, %ecx


	ALIGN(16)
L(fraction_top):
	C eax	n2, then scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst, decrementing
	C edx	scratch
	C esi	dst stop point
	C edi	n2
	C ebp	divisor

	mull	VAR_INVERSE		C m*n2

	movl	%ebp, %eax		C d
	subl	$4, %ecx		C dst
	leal	1(%edi), %ebx

	C

	C

	C

	addl	%edx, %ebx		C 1 + high(n2<<32 + m*n2) = q1+1

	mull	%ebx			C (q1+1)*d

	C

	C

	C

	C

	negl	%eax			C low of n - (q1+1)*d

	sbbl	%edx, %edi		C high of n - (q1+1)*d, caring only about carry
	leal	(%ebp,%eax), %edx

	cmovc(	%edx, %eax)		C n - q1*d if underflow from using q1+1

	sbbl	$0, %ebx		C q
	movl	%eax, %edi		C remainder->n2
	cmpl	%esi, %ecx

	movl	%ebx, (%ecx)		C previous q
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()