Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mod_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel P5 mpn_mod_1 -- mpn by limb remainder.
2:
3: dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P5: 28.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
29: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
30: C mp_limb_t carry);
31: C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
32: C mp_limb_t inverse);
33: C
34: C This code is not unlike mpn/x86/p6/mod_1.asm, it does the same sort of
35: C multiply by inverse without on-the-fly shifts. See that code for some
36: C general comments.
37: C
38: C Alternatives:
39: C
40: C P5 shldl is 4 cycles, so shifting on the fly would be at least 5 cycles
41: C slower, probably more depending what it did to register usage. Using MMX
42: C on P55 would be better, but still at least 4 or 5 instructions and so 2 or
43: C 3 cycles.
44:
45:
46: dnl These thresholds are the sizes where the multiply by inverse method is
47: dnl used, rather than plain "divl"s. Minimum value 2.
48: dnl
49: dnl MUL_NORM_THRESHOLD is for an already normalized divisor (high bit set),
50: dnl MUL_UNNORM_THRESHOLD for an unnormalized divisor.
51: dnl
52: dnl With the divl loop at 44 c/l and the inverse at 28 c/l with about 70
53: dnl cycles to setup, the threshold should be about ceil(70/16)==5, which is
54: dnl what happens in practice.
55: dnl
56: dnl An unnormalized divisor gets an extra 40 cycles at the end for the
57: dnl final (r*2^n)%(d*2^n) and shift. This increases the threshold by about
58: dnl 40/16=3.
59: dnl
60: dnl PIC adds between 4 and 7 cycles (not sure why it varies), but this
61: dnl doesn't change the thresholds.
62: dnl
63: dnl The entry sequence code that chooses between MUL_NORM_THRESHOLD and
64: dnl MUL_UNNORM_THRESHOLD is a bit horrible, but it adds only 2 cycles
65: dnl (branch free) and ensures the choice between div or mul is optimal.
66:
67: deflit(MUL_NORM_THRESHOLD, ifdef(`PIC',5,5))
68: deflit(MUL_UNNORM_THRESHOLD, ifdef(`PIC',8,8))
69:
70: deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))
71:
72:
73: defframe(PARAM_INVERSE, 16) dnl mpn_preinv_mod_1
74: defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
75: defframe(PARAM_DIVISOR, 12)
76: defframe(PARAM_SIZE, 8)
77: defframe(PARAM_SRC, 4)
78:
79: dnl re-using parameter space
80: define(VAR_NORM, `PARAM_DIVISOR')
81: define(VAR_INVERSE, `PARAM_SIZE')
82:
83: TEXT
84:
85: ALIGN(8)
86: PROLOGUE(mpn_preinv_mod_1)
87: deflit(`FRAME',0)
88:
89: pushl %ebp FRAME_pushl()
90: pushl %esi FRAME_pushl()
91:
92: movl PARAM_SRC, %esi
93: movl PARAM_SIZE, %edx
94:
95: pushl %edi FRAME_pushl()
96: pushl %ebx FRAME_pushl()
97:
98: movl PARAM_DIVISOR, %ebp
99: movl PARAM_INVERSE, %eax
100:
101: movl -4(%esi,%edx,4), %edi C src high limb
102: leal -8(%esi,%edx,4), %esi C &src[size-2]
103:
104: movl $0, VAR_NORM
105: decl %edx
106:
107: jnz L(start_preinv)
108:
109: subl %ebp, %edi C src-divisor
110: popl %ebx
111:
112: sbbl %ecx, %ecx C -1 if underflow
113: movl %edi, %eax C src-divisor
114:
115: andl %ebp, %ecx C d if underflow
116: popl %edi
117:
118: addl %ecx, %eax C remainder, with possible addback
119: popl %esi
120:
121: popl %ebp
122:
123: ret
124:
125: EPILOGUE()
126:
127:
128: ALIGN(8)
129: PROLOGUE(mpn_mod_1c)
130: deflit(`FRAME',0)
131:
132: movl PARAM_DIVISOR, %eax
133: movl PARAM_SIZE, %ecx
134:
135: sarl $31, %eax C d highbit
136: movl PARAM_CARRY, %edx
137:
138: orl %ecx, %ecx
139: jz L(done_edx) C result==carry if size==0
140:
141: andl $MUL_NORM_DELTA, %eax
142: pushl %ebp FRAME_pushl()
143:
144: addl $MUL_UNNORM_THRESHOLD, %eax C norm or unnorm thresh
145: pushl %esi FRAME_pushl()
146:
147: movl PARAM_SRC, %esi
148: movl PARAM_DIVISOR, %ebp
149:
150: cmpl %eax, %ecx
151: jb L(divide_top)
152:
153: movl %edx, %eax C carry as pretend src high limb
154: leal 1(%ecx), %edx C size+1
155:
156: cmpl $0x1000000, %ebp
157: jmp L(mul_by_inverse_1c)
158:
159: EPILOGUE()
160:
161:
162: ALIGN(8)
163: PROLOGUE(mpn_mod_1)
164: deflit(`FRAME',0)
165:
166: movl PARAM_SIZE, %ecx
167: pushl %ebp FRAME_pushl()
168:
169: orl %ecx, %ecx
170: jz L(done_zero)
171:
172: movl PARAM_SRC, %eax
173: movl PARAM_DIVISOR, %ebp
174:
175: sarl $31, %ebp C -1 if divisor normalized
176: movl -4(%eax,%ecx,4), %eax C src high limb
177:
178: movl PARAM_DIVISOR, %edx
179: pushl %esi FRAME_pushl()
180:
181: andl $MUL_NORM_DELTA, %ebp
182: cmpl %edx, %eax C carry flag if high<divisor
183:
184: sbbl %edx, %edx C -1 if high<divisor
185: addl $MUL_UNNORM_THRESHOLD, %ebp C norm or unnorm thresh
186:
187: addl %edx, %ecx C size-1 if high<divisor
188: jz L(done_eax)
189:
190: cmpl %ebp, %ecx
191: movl PARAM_DIVISOR, %ebp
192:
193: movl PARAM_SRC, %esi
194: jae L(mul_by_inverse)
195:
196: andl %eax, %edx C high as initial carry if high<divisor
197:
198:
199: L(divide_top):
200: C eax scratch (quotient)
201: C ebx
202: C ecx counter, limbs, decrementing
203: C edx scratch (remainder)
204: C esi src
205: C edi
206: C ebp divisor
207:
208: movl -4(%esi,%ecx,4), %eax
209:
210: divl %ebp
211:
212: decl %ecx
213: jnz L(divide_top)
214:
215:
216: popl %esi
217: popl %ebp
218:
219: L(done_edx):
220: movl %edx, %eax
221:
222: ret
223:
224:
225: L(done_zero):
226: xorl %eax, %eax
227: popl %ebp
228:
229: ret
230:
231:
232: C -----------------------------------------------------------------------------
233: C
234: C The divisor is normalized using the same code as the pentium
235: C count_leading_zeros in longlong.h. Going through the GOT for PIC costs a
236: C couple of cycles, but is more or less unavoidable.
237:
238:
239: ALIGN(8)
240: L(mul_by_inverse):
241: C eax src high limb
242: C ebx
243: C ecx size or size-1
244: C edx
245: C esi src
246: C edi
247: C ebp divisor
248:
249: movl PARAM_SIZE, %edx
250: cmpl $0x1000000, %ebp
251:
252: L(mul_by_inverse_1c):
253: sbbl %ecx, %ecx
254: cmpl $0x10000, %ebp
255:
256: sbbl $0, %ecx
257: cmpl $0x100, %ebp
258:
259: sbbl $0, %ecx
260: pushl %edi FRAME_pushl()
261:
262: pushl %ebx FRAME_pushl()
263: movl %ebp, %ebx C d
264:
265: ifdef(`PIC',`
266: call L(here)
267: L(here):
268: popl %edi
269: leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
270:
271: shrl %cl, %ebx
272: addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %edi
273:
274: C AGI
275: movl __clz_tab@GOT(%edi), %edi
276: addl $-34, %ecx
277:
278: C AGI
279: movb (%ebx,%edi), %bl
280:
281: ',`
282: leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
283:
284: shrl %cl, %ebx
285: addl $-34, %ecx
286:
287: C AGI
288: movb __clz_tab(%ebx), %bl
289: ')
290: movl %eax, %edi C carry -> n1
291:
292: addl %ebx, %ecx C -34 + c + __clz_tab[d>>c] = -clz-1
293: leal -8(%esi,%edx,4), %esi C &src[size-2]
294:
295: xorl $-1, %ecx C clz
296: movl $-1, %edx
297:
298: ASSERT(e,`pushl %eax C clz calculation same as bsrl
299: bsrl %ebp, %eax
300: xorl $31, %eax
301: cmpl %eax, %ecx
302: popl %eax')
303:
304: shll %cl, %ebp C d normalized
305: movl %ecx, VAR_NORM
306:
307: subl %ebp, %edx C (b-d)-1, so edx:eax = b*(b-d)-1
308: movl $-1, %eax
309:
310: divl %ebp C floor (b*(b-d)-1) / d
311:
312: L(start_preinv):
313: movl %eax, VAR_INVERSE
314: movl %ebp, %eax C d
315:
316: movl %ecx, %edx C fake high, will cancel
317:
318:
319: C For mpn_mod_1 and mpn_preinv_mod_1, the initial carry in %edi is the src
320: C high limb, and this may be greater than the divisor and may need one copy
321: C of the divisor subtracted (only one, because the divisor is normalized).
322: C This is accomplished by having the initial ecx:edi act as a fake previous
323: C n2:n10. The initial edx:eax is d, acting as a fake (q1+1)*d which is
324: C subtracted from ecx:edi, with the usual addback if it produces an
325: C underflow.
326:
327:
328: L(inverse_top):
329: C eax scratch (n10, n1, q1, etc)
330: C ebx scratch (nadj, src limit)
331: C ecx old n2
332: C edx scratch
333: C esi src pointer, &src[size-2] to &src[0]
334: C edi old n10
335: C ebp d
336:
337: subl %eax, %edi C low n - (q1+1)*d
338: movl (%esi), %eax C new n10
339:
340: sbbl %edx, %ecx C high n - (q1+1)*d, 0 or -1
341: movl %ebp, %ebx C d
342:
343: sarl $31, %eax C -n1
344: andl %ebp, %ecx C d if underflow
345:
346: addl %edi, %ecx C remainder -> n2, and possible addback
347: ASSERT(b,`cmpl %ebp, %ecx')
348: andl %eax, %ebx C -n1 & d
349:
350: movl (%esi), %edi C n10
351: andl $1, %eax C n1
352:
353: addl %ecx, %eax C n2+n1
354: addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
355:
356: mull VAR_INVERSE C m*(n2+n1)
357:
358: addl %eax, %ebx C low(m*(n2+n1) + nadj), giving carry flag
359: leal 1(%ecx), %eax C 1+n2
360:
361: adcl %edx, %eax C 1 + high[n2<<32 + m*(n2+n1) + nadj] = q1+1
362: movl PARAM_SRC, %ebx
363:
364: sbbl $0, %eax C use q1 if q1+1 overflows
365: subl $4, %esi C step src ptr
366:
367: mull %ebp C (q1+1)*d
368:
369: cmpl %ebx, %esi
370: jae L(inverse_top)
371:
372:
373:
374: C %edi (after subtract and addback) is the remainder modulo d*2^n
375: C and must be reduced to 0<=r<d by calculating r*2^n mod d*2^n and
376: C right shifting by n.
377: C
378: C If d was already normalized on entry so that n==0 then nothing is
379: C needed here. This is always the case for preinv_mod_1. For mod_1
380: C or mod_1c the chance of n==0 is low, but about 40 cycles can be
381: C saved.
382:
383: subl %eax, %edi C low n - (q1+1)*d
384: movl %ecx, %ebx C n2
385:
386: sbbl %edx, %ebx C high n - (q1+1)*d, 0 or -1
387: xorl %esi, %esi C next n2
388:
389: andl %ebp, %ebx C d if underflow
390: movl VAR_NORM, %ecx
391:
392: addl %ebx, %edi C remainder, with possible addback
393: orl %ecx, %ecx
394:
395: jz L(done_mul_edi)
396:
397:
398: C Here using %esi=n2 and %edi=n10, unlike the above
399:
400: shldl( %cl, %edi, %esi) C n2
401:
402: shll %cl, %edi C n10
403:
404: movl %edi, %eax C n10
405: movl %edi, %ebx C n10
406:
407: sarl $31, %ebx C -n1
408:
409: shrl $31, %eax C n1
410: andl %ebp, %ebx C -n1 & d
411:
412: addl %esi, %eax C n2+n1
413: addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
414:
415: mull VAR_INVERSE C m*(n2+n1)
416:
417: addl %eax, %ebx C m*(n2+n1) + nadj, low giving carry flag
418: leal 1(%esi), %eax C 1+n2
419:
420: adcl %edx, %eax C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
421:
422: sbbl $0, %eax C use q1 if q1+1 overflows
423:
424: mull %ebp C (q1+1)*d
425:
426: subl %eax, %edi C low n - (q1+1)*d
427: popl %ebx
428:
429: sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1
430: movl %edi, %eax
431:
432: andl %ebp, %esi C d if underflow
433: popl %edi
434:
435: addl %esi, %eax C addback if underflow
436: popl %esi
437:
438: shrl %cl, %eax C denorm remainder
439: popl %ebp
440:
441: ret
442:
443:
444: L(done_mul_edi):
445: movl %edi, %eax
446: popl %ebx
447:
448: popl %edi
449: L(done_eax):
450: popl %esi
451:
452: popl %ebp
453:
454: ret
455:
456: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>