Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/sqr_basecase.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
! 26: C product at around 20x20 limbs.
! 27:
! 28:
1.1 maekawa 29: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
30: C
31: C Calculate src,size squared, storing the result in dst,2*size.
32: C
33: C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
34: C lot of function call overheads are avoided, especially when the size is
35: C small.
36:
37: defframe(PARAM_SIZE,12)
38: defframe(PARAM_SRC, 8)
39: defframe(PARAM_DST, 4)
40:
1.1.1.2 ! ohara 41: TEXT
1.1 maekawa 42: ALIGN(8)
43: PROLOGUE(mpn_sqr_basecase)
44: deflit(`FRAME',0)
45:
46: movl PARAM_SIZE, %edx C edx = size
47: movl PARAM_SRC, %eax C eax = src pointer
48:
49: cmpl $2, %edx C these flags drive both branches below
50: movl PARAM_DST, %ecx C ecx = dst pointer (movl leaves the flags alone)
51:
52: je L(two_limbs) C size == 2
53:
54: movl (%eax), %eax C eax = src[0], the src pointer is no longer in eax
55: ja L(three_or_more) C size > 2 (unsigned), still the flags of the cmpl
56:
57: C -----------------------------------------------------------------------------
58: C one limb only (fall through when size is below 2): dst[0,1] = src[0]^2
59: C eax src[0]
60: C ebx
61: C ecx dst
62: C edx size (overwritten by the mull)
63:
64: mull %eax C edx:eax = src[0]^2
65:
66: movl %eax, (%ecx) C dst[0] = low limb
67: movl %edx, 4(%ecx) C dst[1] = high limb
68:
69: ret
70:
71: C -----------------------------------------------------------------------------
72: ALIGN(8)
73: L(two_limbs):
74: C eax src
75: C ebx
76: C ecx dst
77: C edx size (equal to 2 on this path), result is dst[0..3] = src[0..1]^2
78:
79: pushl %ebp C ebp, edi, esi and ebx are saved here and popped before the ret
80: pushl %edi
81:
82: pushl %esi
83: pushl %ebx
84:
85: movl %eax, %ebx C ebx = src, eax is needed by the mull
86: movl (%eax), %eax C eax = src[0]
87:
88: mull %eax C src[0]^2
89:
90: movl %eax, (%ecx) C dst[0]
91: movl %edx, %esi C dst[1]
92:
93: movl 4(%ebx), %eax C eax = src[1]
94:
95: mull %eax C src[1]^2
96:
97: movl %eax, %edi C dst[2]
98: movl %edx, %ebp C dst[3]
99:
100: movl (%ebx), %eax C eax = src[0]
101:
102: mull 4(%ebx) C src[0]*src[1]
103:
104: addl %eax, %esi C dst[1] += low limb of the cross product (first of two adds)
105: popl %ebx C src pointer is finished with, popl does not touch the carry
106:
107: adcl %edx, %edi C dst[2] += high limb of the cross product + carry
108:
109: adcl $0, %ebp C propagate carry into dst[3]
110: addl %esi, %eax C eax = dst[1] with the low limb added a second time
111:
112: adcl %edi, %edx C edx = dst[2] with the high limb added a second time
113: movl %eax, 4(%ecx) C store dst[1]
114:
115: adcl $0, %ebp C propagate carry into dst[3]
116: popl %esi
117:
118: movl %edx, 8(%ecx) C store dst[2]
119: movl %ebp, 12(%ecx) C store dst[3]
120:
121: popl %edi
122: popl %ebp
123:
124: ret
125:
126:
127: C -----------------------------------------------------------------------------
128: ALIGN(8)
129: L(three_or_more):
130: C eax src low limb
131: C ebx
132: C ecx dst
133: C edx size
134:
135: cmpl $4, %edx C flags stay valid across the pushl and movl that follow
136: pushl %ebx C saved for both the three limb path and the four_or_more path
137: deflit(`FRAME',4)
138:
139: movl PARAM_SRC, %ebx C ebx = src
140: jae L(four_or_more) C size >= 4 (unsigned)
141:
142:
143: C -----------------------------------------------------------------------------
144: C three limbs: dst[0..5] = src[0..2]^2
145: C eax src low limb
146: C ebx src
147: C ecx dst
148: C edx size
149:
150: pushl %ebp
151: pushl %edi
152:
153: mull %eax C src[0] ^ 2
154:
155: movl %eax, (%ecx) C dst[0]
156: movl %edx, 4(%ecx) C dst[1]
157:
158: movl 4(%ebx), %eax C eax = src[1]
159: xorl %ebp, %ebp C NOTE(review): ebp is reloaded from edx below before it is read
160:
161: mull %eax C src[1] ^ 2
162:
163: movl %eax, 8(%ecx) C dst[2]
164: movl %edx, 12(%ecx) C dst[3]
165:
166: movl 8(%ebx), %eax C eax = src[2]
167: pushl %esi C risk of cache bank clash
168:
169: mull %eax C src[2] ^ 2
170:
171: movl %eax, 16(%ecx) C dst[4]
172: movl %edx, 20(%ecx) C dst[5]
173:
174: movl (%ebx), %eax C eax = src[0]
175:
176: mull 4(%ebx) C src[0] * src[1]
177:
178: movl %eax, %esi C esi = low limb, weight of dst[1]
179: movl %edx, %edi C edi = high limb, weight of dst[2]
180:
181: movl (%ebx), %eax C eax = src[0]
182:
183: mull 8(%ebx) C src[0] * src[2]
184:
185: addl %eax, %edi C low limb added at the weight of dst[2]
186: movl %edx, %ebp C ebp = high limb, weight of dst[3] (movl keeps the carry)
187:
188: adcl $0, %ebp C take the carry of the addl above
189: movl 4(%ebx), %eax C eax = src[1]
190:
191: mull 8(%ebx) C src[1] * src[2]
192:
193: xorl %ebx, %ebx C src pointer is finished with, ebx becomes the top limb
194: addl %eax, %ebp C low limb added at the weight of dst[3]
195:
196: C eax
197: C ebx zero, will be dst[5]
198: C ecx dst
199: C edx dst[4]
200: C esi dst[1]
201: C edi dst[2]
202: C ebp dst[3]
203:
204: adcl $0, %edx C edx = high limb + carry, weight of dst[4]
205: addl %esi, %esi C double the cross products, one adcl chain up through ebx
206:
207: adcl %edi, %edi
208:
209: adcl %ebp, %ebp
210:
211: adcl %edx, %edx
212: movl 4(%ecx), %eax C eax = stored dst[1] (movl keeps the carry)
213:
214: adcl $0, %ebx C ebx = bit doubled out of edx
215: addl %esi, %eax C start of a second chain adding the doubled limbs to the squares
216:
217: movl %eax, 4(%ecx) C store dst[1]
218: movl 8(%ecx), %eax C eax = stored dst[2]
219:
220: adcl %edi, %eax
221: movl 12(%ecx), %esi C esi = stored dst[3]
222:
223: adcl %ebp, %esi
224: movl 16(%ecx), %edi C edi = stored dst[4]
225:
226: movl %eax, 8(%ecx) C store dst[2]
227: movl %esi, 12(%ecx) C store dst[3]
228:
229: adcl %edx, %edi
230: popl %esi C popl and the movl pair below keep the carry for the last adcl
231:
232: movl 20(%ecx), %eax C eax = stored dst[5]
233: movl %edi, 16(%ecx) C store dst[4]
234:
235: popl %edi
236: popl %ebp
237:
238: adcl %ebx, %eax C no carry out of this
239: popl %ebx
240:
241: movl %eax, 20(%ecx) C store dst[5]
242:
243: ret
244:
245:
246: C -----------------------------------------------------------------------------
247: ALIGN(8)
248: L(four_or_more):
249: C eax src low limb
250: C ebx src
251: C ecx dst
252: C edx size
253: C esi
254: C edi
255: C ebp
256: C
257: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
258:
259: deflit(`FRAME',4)
260:
261: pushl %edi
262: FRAME_pushl() C macro defined outside this file, presumably tracks the push in FRAME
263: pushl %esi
264: FRAME_pushl()
265:
266: pushl %ebp
267: FRAME_pushl()
268: leal (%ecx,%edx,4), %edi C &dst[size], dst end of this mul1
269:
270: leal (%ebx,%edx,4), %esi C &src[size], src end
271: movl %ebx, %ebp C src
272:
273: negl %edx C -size
274: xorl %ebx, %ebx C clear carry limb and carry flag
275:
276: leal 1(%edx), %ecx C -(size-1), leal leaves the carry flag clear
277:
278: L(mul1):
279: C eax scratch
280: C ebx carry
281: C ecx counter, negative
282: C edx scratch
283: C esi &src[size]
284: C edi &dst[size]
285: C ebp src
286:
287: adcl $0, %ebx C fold in the carry flag left by the previous addl
288: movl (%esi,%ecx,4), %eax C eax = src[size+ecx]
289:
290: mull (%ebp) C edx:eax = src[size+ecx] * src[0]
291:
292: addl %eax, %ebx C low limb + carry limb, carry flag is kept for the next pass
293:
294: movl %ebx, (%edi,%ecx,4) C store dst[size+ecx]
295: incl %ecx C incl does not modify the carry flag
296:
297: movl %edx, %ebx C high limb becomes the next carry limb
298: jnz L(mul1) C zero flag is from the incl
299:
300:
301: C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
302: C n=1..size-2.
303: C
304: C The last two products, which are the end corner of the product
305: C triangle, are handled separately to save looping overhead. These
306: C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
307: C If size is 4 then it's only these that need to be done.
308: C
309: C In the outer loop %esi is a constant, and %edi just advances by 1
310: C limb each time. The size of the operation decreases by 1 limb
311: C each time.
312:
313: C eax
314: C ebx carry (needing carry flag added)
315: C ecx
316: C edx
317: C esi &src[size]
318: C edi &dst[size]
319: C ebp
320:
321: adcl $0, %ebx C final carry of the mul1 loop
322: movl PARAM_SIZE, %edx
323:
324: movl %ebx, (%edi) C dst[size] = top limb of src[0]*src[1..size-1]
325: subl $4, %edx
326:
327: negl %edx C edx = -(size-4), zero when size is 4
328: jz L(corner) C size == 4, only the corner products remain
329:
330:
331: L(outer):
332: C ebx previous carry limb to store
333: C edx outer loop counter (negative)
334: C esi &src[size]
335: C edi dst, pointing at stored carry limb of previous loop
336:
337: pushl %edx C new outer loop counter
338: leal -2(%edx), %ecx C ecx = edx-2, inner loop counter
339:
340: movl %ebx, (%edi) C store the carry limb left by the previous loop
341: addl $4, %edi C dst end moves up one limb
342:
343: addl $4, %ebp C step to the next multiplier limb of src
344: xorl %ebx, %ebx C initial carry limb, clear carry flag
345:
346: L(inner):
347: C eax scratch
348: C ebx carry (needing carry flag added)
349: C ecx counter, negative
350: C edx scratch
351: C esi &src[size]
352: C edi dst end of this addmul
353: C ebp &src[j]
354:
355: adcl $0, %ebx C fold in the carry flag left by the previous pass
356: movl (%esi,%ecx,4), %eax C eax = src[size+ecx]
357:
358: mull (%ebp) C edx:eax = src[size+ecx] * src[j]
359:
360: addl %ebx, %eax C low limb + carry limb
361: movl (%edi,%ecx,4), %ebx C fetch the dst limb (movl keeps the carry)
362:
363: adcl $0, %edx C carry into the high limb
364: addl %eax, %ebx C dst limb + product, carry flag is kept for the next pass
365:
366: movl %ebx, (%edi,%ecx,4) C store the updated dst limb
367: incl %ecx C incl does not modify the carry flag
368:
369: movl %edx, %ebx C high limb becomes the next carry limb
370: jnz L(inner) C zero flag is from the incl
371:
372:
373: adcl $0, %ebx C final carry of this addmul
374: popl %edx C outer loop counter
375:
376: incl %edx
377: jnz L(outer)
378:
379:
380: movl %ebx, (%edi) C store the carry limb of the last addmul
381:
382: L(corner):
383: C esi &src[size]
384: C edi &dst[2*size-4]
385:
386: movl -8(%esi), %eax C eax = src[size-2]
387: movl -4(%edi), %ebx C dst[2*size-5], risk of data cache bank clash here
388:
389: mull -12(%esi) C src[size-2]*src[size-3]
390:
391: addl %eax, %ebx C dst[2*size-5] += low limb
392: movl %edx, %ecx C ecx = high limb (movl keeps the carry)
393:
394: adcl $0, %ecx
395: movl -4(%esi), %eax C eax = src[size-1]
396:
397: mull -12(%esi) C src[size-1]*src[size-3]
398:
399: addl %ecx, %eax C low limb + carry limb
400: movl (%edi), %ecx C ecx = dst[2*size-4]
401:
402: adcl $0, %edx
403: movl %ebx, -4(%edi) C store dst[2*size-5]
404:
405: addl %eax, %ecx C dst[2*size-4] += low limb
406: movl %edx, %ebx C ebx = high limb (movl keeps the carry)
407:
408: adcl $0, %ebx
409: movl -4(%esi), %eax C eax = src[size-1]
410:
411: mull -8(%esi) C src[size-1]*src[size-2]
412:
1.1.1.2 ! ohara 413: movl %ecx, (%edi) C store dst[2*size-4]
1.1 maekawa 414: addl %eax, %ebx C low limb + carry limb
415:
416: adcl $0, %edx
417: movl PARAM_SIZE, %eax
418:
419: negl %eax
420: movl %ebx, 4(%edi) C store dst[2*size-3]
421:
422: addl $1, %eax C -(size-1) and clear carry
423: movl %edx, 8(%edi) C store dst[2*size-2]
424:
425:
426: C -----------------------------------------------------------------------------
427: C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
428:
429: L(lshift):
430: C eax counter, negative
431: C ebx next limb
432: C ecx
433: C edx
434: C esi
435: C edi &dst[2*size-4]
436: C ebp
437:
438: movl 12(%edi,%eax,8), %ebx C odd limb, dst[1] on the first pass
439:
440: rcll %ebx C shift left one bit through the carry flag
441: movl 16(%edi,%eax,8), %ecx C even limb, dst[2] on the first pass
442:
443: rcll %ecx
444: movl %ebx, 12(%edi,%eax,8)
445:
446: movl %ecx, 16(%edi,%eax,8)
447: incl %eax C incl does not modify the carry flag
448:
449: jnz L(lshift)
450:
451:
452: adcl %eax, %eax C high bit out
453: movl PARAM_SRC, %esi
454:
455: movl PARAM_SIZE, %ecx C risk of cache bank clash
456: movl %eax, 12(%edi) C dst most significant limb
457:
458:
459: C -----------------------------------------------------------------------------
460: C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
461: C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
462: C low limb of src[0]^2.
463:
464: movl (%esi), %eax C src[0]
465: leal (%esi,%ecx,4), %esi C src end
466:
467: negl %ecx C -size
468:
469: mull %eax C edx:eax = src[0]^2
470:
471: movl %eax, 16(%edi,%ecx,8) C dst[0]
472: movl %edx, %ebx C high limb, to be added at dst[1]
473:
474: addl $1, %ecx C -(size-1) and clear carry
475:
476: L(diag):
477: C eax scratch (low product)
478: C ebx carry limb
479: C ecx counter, negative
480: C edx scratch (high product)
481: C esi &src[size]
482: C edi &dst[2*size-4]
483: C ebp scratch (fetched dst limbs)
484:
485: movl (%esi,%ecx,4), %eax C eax = src[size+ecx]
486: adcl $0, %ebx C fold in the carry flag left by the previous pass
487:
488: mull %eax C edx:eax = src[size+ecx]^2
489:
490: movl 16-4(%edi,%ecx,8), %ebp C odd dst limb, dst[1] on the first pass
491:
492: addl %ebp, %ebx C odd limb + high limb of the previous square
493: movl 16(%edi,%ecx,8), %ebp C even dst limb, dst[2] on the first pass
494:
495: adcl %eax, %ebp C even limb + low limb of this square + carry
496: movl %ebx, 16-4(%edi,%ecx,8)
497:
498: movl %ebp, 16(%edi,%ecx,8)
499: incl %ecx C incl does not modify the carry flag
500:
501: movl %edx, %ebx C high limb is carried to the next odd limb
502: jnz L(diag)
503:
504:
505: adcl $0, %edx C high limb of src[size-1]^2 plus the last carry
506: movl 16-4(%edi), %eax C dst most significant limb
507:
508: addl %eax, %edx
509: popl %ebp
510:
511: movl %edx, 16-4(%edi) C store dst[2*size-1]
512: popl %esi C risk of cache bank clash
513:
514: popl %edi
515: popl %ebx
516:
517: ret
518:
519: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>