Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mmx/mod_1.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
2: dnl
3: dnl P6MMX: 24.0 cycles/limb.
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
30: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
31: C mp_limb_t carry);
32: C
33: C The code here very similar to mpn_divrem_1, but with the quotient
34: C discarded. What's here probably isn't optimal.
35: C
36: C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some
37: C comments.
38:
39:
40: dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
41: dnl used, rather than plain "divl"s. Minimum value 2.
42:
43: deflit(MUL_THRESHOLD, 4)
44:
45:
46: defframe(PARAM_CARRY, 16)
47: defframe(PARAM_DIVISOR,12)
48: defframe(PARAM_SIZE, 8)
49: defframe(PARAM_SRC, 4)
50:
51: defframe(SAVE_EBX, -4)
52: defframe(SAVE_ESI, -8)
53: defframe(SAVE_EDI, -12)
54: defframe(SAVE_EBP, -16)
55:
56: defframe(VAR_NORM, -20)
57: defframe(VAR_INVERSE, -24)
58: defframe(VAR_SRC_STOP,-28)
59:
60: deflit(STACK_SPACE, 28)
61:
62: .text
63: ALIGN(16)
64:
65: PROLOGUE(mpn_mod_1c)
66: deflit(`FRAME',0)
67: movl PARAM_CARRY, %edx
68: movl PARAM_SIZE, %ecx
69: subl $STACK_SPACE, %esp
70: deflit(`FRAME',STACK_SPACE)
71:
72: movl %ebp, SAVE_EBP
73: movl PARAM_DIVISOR, %ebp
74:
75: movl %esi, SAVE_ESI
76: movl PARAM_SRC, %esi
77: jmp LF(mpn_mod_1,start_1c)
78:
79: EPILOGUE()
80:
81:
82: ALIGN(16)
83: PROLOGUE(mpn_mod_1)
84: deflit(`FRAME',0)
85:
86: movl $0, %edx C initial carry (if can't skip a div)
87: movl PARAM_SIZE, %ecx
88: subl $STACK_SPACE, %esp
89: deflit(`FRAME',STACK_SPACE)
90:
91: movl %esi, SAVE_ESI
92: movl PARAM_SRC, %esi
93:
94: movl %ebp, SAVE_EBP
95: movl PARAM_DIVISOR, %ebp
96:
97: orl %ecx, %ecx
98: jz L(divide_done)
99:
100: movl -4(%esi,%ecx,4), %eax C src high limb
101:
102: cmpl %ebp, %eax C carry flag if high<divisor
103:
104: cmovc( %eax, %edx) C src high limb as initial carry
105: sbbl $0, %ecx C size-1 to skip one div
106: jz L(divide_done)
107:
108:
109: ALIGN(16)
110: L(start_1c):
111: C eax
112: C ebx
113: C ecx size
114: C edx carry
115: C esi src
116: C edi
117: C ebp divisor
118:
119: cmpl $MUL_THRESHOLD, %ecx
120: jae L(mul_by_inverse)
121:
122:
123: orl %ecx, %ecx
124: jz L(divide_done)
125:
126:
127: L(divide_top):
128: C eax scratch (quotient)
129: C ebx
130: C ecx counter, limbs, decrementing
131: C edx scratch (remainder)
132: C esi src
133: C edi
134: C ebp
135:
136: movl -4(%esi,%ecx,4), %eax
137:
138: divl %ebp
139:
140: decl %ecx
141: jnz L(divide_top)
142:
143:
144: L(divide_done):
145: movl SAVE_ESI, %esi
146: movl %edx, %eax
147:
148: movl SAVE_EBP, %ebp
149: addl $STACK_SPACE, %esp
150:
151: ret
152:
153:
154:
155: C -----------------------------------------------------------------------------
156:
157: L(mul_by_inverse):
158: C eax
159: C ebx
160: C ecx size
161: C edx carry
162: C esi src
163: C edi
164: C ebp divisor
165:
166: movl %ebx, SAVE_EBX
167: leal -4(%esi), %ebx
168:
169: movl %ebx, VAR_SRC_STOP
170: movl %ecx, %ebx C size
171:
172: movl %edi, SAVE_EDI
173: movl %edx, %edi C carry
174:
175: bsrl %ebp, %ecx C 31-l
176: movl $-1, %edx
177:
178: leal 1(%ecx), %eax C 32-l
179: xorl $31, %ecx C l
180:
181: movl %ecx, VAR_NORM
182: shll %cl, %ebp C d normalized
183:
184: movd %eax, %mm7
185: movl $-1, %eax
186: subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
187:
188: divl %ebp C floor (b*(b-d)-1) / d
189:
190: C
191:
192: movl %eax, VAR_INVERSE
193: leal -12(%esi,%ebx,4), %eax C &src[size-3]
194:
195: movl 8(%eax), %esi C src high limb
196: movl 4(%eax), %edx C src second highest limb
197:
198: shldl( %cl, %esi, %edi) C n2 = carry,high << l
199:
200: shldl( %cl, %edx, %esi) C n10 = high,second << l
201:
202: movl %eax, %ecx C &src[size-3]
203:
204:
205: ifelse(MUL_THRESHOLD,2,`
206: cmpl $2, %ebx
207: je L(inverse_two_left)
208: ')
209:
210:
211: C The dependent chain here is the same as in mpn_divrem_1, but a few
212: C instructions are saved by not needing to store the quotient limbs. This
213: C gets it down to 24 c/l, which is still a bit away from a theoretical 19
214: C c/l.
215:
216: ALIGN(16)
217: L(inverse_top):
218: C eax scratch
219: C ebx scratch (nadj, q1)
220: C ecx src pointer, decrementing
221: C edx scratch
222: C esi n10
223: C edi n2
224: C ebp divisor
225: C
226: C mm0 scratch (src qword)
227: C mm7 rshift for normalization
228:
229:
230: movl %esi, %eax
231: movl %ebp, %ebx
232:
233: sarl $31, %eax C -n1
234:
235: andl %eax, %ebx C -n1 & d
236: negl %eax C n1
237:
238: addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
239: addl %edi, %eax C n2+n1
240:
241: mull VAR_INVERSE C m*(n2+n1)
242:
243: movq (%ecx), %mm0 C next src limb and the one below it
244: subl $4, %ecx
245:
246: C
247:
248: C
249:
250: C
251:
252: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
253: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
254: movl %ebp, %eax C d
255:
256: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
257: jz L(q1_ff)
258:
259: mull %ebx C (q1+1)*d
260:
261: psrlq %mm7, %mm0
262: movl VAR_SRC_STOP, %ebx
263:
264: C
265:
266: C
267:
268: C
269:
270: subl %eax, %esi
271:
272: sbbl %edx, %edi C n - (q1+1)*d
273: movl %esi, %edi C remainder -> n2
274: leal (%ebp,%esi), %edx
275:
276: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
277: movd %mm0, %esi
278: cmpl %ebx, %ecx
279:
280: jne L(inverse_top)
281:
282:
283: L(inverse_loop_done):
284:
285:
286: C -----------------------------------------------------------------------------
287:
288: L(inverse_two_left):
289: C eax scratch
290: C ebx scratch (nadj, q1)
291: C ecx &src[-1]
292: C edx scratch
293: C esi n10
294: C edi n2
295: C ebp divisor
296: C
297: C mm0 scratch (src dword)
298: C mm7 rshift
299:
300: movl %esi, %eax
301: movl %ebp, %ebx
302:
303: sarl $31, %eax C -n1
304:
305: andl %eax, %ebx C -n1 & d
306: negl %eax C n1
307:
308: addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
309: addl %edi, %eax C n2+n1
310:
311: mull VAR_INVERSE C m*(n2+n1)
312:
313: movd 4(%ecx), %mm0 C src low limb
314:
315: C
316:
317: C
318:
319: C
320:
321: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
322: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
323:
324: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
325:
326: sbbl $0, %ebx
327: movl %ebp, %eax C d
328:
329: mull %ebx C (q1+1)*d
330:
331: psllq $32, %mm0
332:
333: psrlq %mm7, %mm0
334:
335: C
336:
337: C
338:
339: subl %eax, %esi
340:
341: sbbl %edx, %edi C n - (q1+1)*d
342: movl %esi, %edi C remainder -> n2
343: leal (%ebp,%esi), %edx
344:
345: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
346: movd %mm0, %esi
347:
348:
349: C One limb left
350:
351: C eax scratch
352: C ebx scratch (nadj, q1)
353: C ecx
354: C edx scratch
355: C esi n10
356: C edi n2
357: C ebp divisor
358: C
359: C mm0 src limb, shifted
360: C mm7 rshift
361:
362: movl %esi, %eax
363: movl %ebp, %ebx
364:
365: sarl $31, %eax C -n1
366:
367: andl %eax, %ebx C -n1 & d
368: negl %eax C n1
369:
370: addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
371: addl %edi, %eax C n2+n1
372:
373: mull VAR_INVERSE C m*(n2+n1)
374:
375: movl VAR_NORM, %ecx C for final denorm
376:
377: C
378:
379: C
380:
381: C
382:
383: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
384: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
385:
386: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
387:
388: sbbl $0, %ebx
389: movl %ebp, %eax C d
390:
391: mull %ebx C (q1+1)*d
392:
393: movl SAVE_EBX, %ebx
394:
395: C
396:
397: C
398:
399: C
400:
401: subl %eax, %esi
402:
403: sbbl %edx, %edi C n - (q1+1)*d
404: leal (%ebp,%esi), %edx
405: movl SAVE_EBP, %ebp
406:
407: movl %esi, %eax C remainder
408: movl SAVE_ESI, %esi
409:
410: cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
411: movl SAVE_EDI, %edi
412:
413: shrl %cl, %eax C denorm remainder
414: addl $STACK_SPACE, %esp
415: emms
416:
417: ret
418:
419:
420: C -----------------------------------------------------------------------------
421: C
422: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
423: C of q*d is simply -d and the remainder n-q*d = n10+d
424:
425: L(q1_ff):
426: C eax (divisor)
427: C ebx (q1+1 == 0)
428: C ecx src pointer
429: C edx
430: C esi n10
431: C edi (n2)
432: C ebp divisor
433:
434: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
435: movl VAR_SRC_STOP, %edx
436: psrlq %mm7, %mm0
437:
438: movd %mm0, %esi C next n10
439: cmpl %ecx, %edx
440: jne L(inverse_top)
441:
442: jmp L(inverse_loop_done)
443:
444: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>