Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/sqr_basecase.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
2: dnl
3: dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
4: dnl product at around 20x20 limbs.
5:
6:
7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8: dnl
9: dnl This file is part of the GNU MP Library.
10: dnl
11: dnl The GNU MP Library is free software; you can redistribute it and/or
12: dnl modify it under the terms of the GNU Lesser General Public License as
13: dnl published by the Free Software Foundation; either version 2.1 of the
14: dnl License, or (at your option) any later version.
15: dnl
16: dnl The GNU MP Library is distributed in the hope that it will be useful,
17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19: dnl Lesser General Public License for more details.
20: dnl
21: dnl You should have received a copy of the GNU Lesser General Public
22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24: dnl Suite 330, Boston, MA 02111-1307, USA.
25:
26:
27: include(`../config.m4')
28:
29:
30: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
31: C
32: C Calculate src,size squared, storing the result in dst,2*size.
33: C
34: C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
35: C lot of function call overheads are avoided, especially when the size is
36: C small.
37:
38: defframe(PARAM_SIZE,12)
39: defframe(PARAM_SRC, 8)
40: defframe(PARAM_DST, 4)
41:
42: .text
43: ALIGN(8)
44: PROLOGUE(mpn_sqr_basecase)
45: deflit(`FRAME',0)
46: C Entry: load the three arguments and dispatch on size.
47: movl PARAM_SIZE, %edx
48: movl PARAM_SRC, %eax
49:
50: cmpl $2, %edx
51: movl PARAM_DST, %ecx C movl does not touch the flags set by cmpl
52:
53: je L(two_limbs) C size == 2
54:
55: movl (%eax), %eax C eax = src[0], replacing the src pointer; flags still from cmpl
56: ja L(three_or_more) C size > 2 (unsigned compare)
57:
58: C -----------------------------------------------------------------------------
59: C One limb only (fall through for size < 2): dst[0..1] = src[0]^2.
60: C eax src[0] (low limb, loaded above)
61: C ebx
62: C ecx dst
63: C edx size, overwritten by the high half of the product
64: C NOTE(review): size == 0 would also land here and still read src[0] and write two limbs; presumably callers guarantee size >= 1.
65: mull %eax C edx:eax = src[0]^2
66:
67: movl %eax, (%ecx) C dst[0] = low limb
68: movl %edx, 4(%ecx) C dst[1] = high limb
69:
70: ret C no registers were pushed on this path
71:
72: C -----------------------------------------------------------------------------
73: ALIGN(8)
74: L(two_limbs):
75: C eax src
76: C ebx
77: C ecx dst
78: C edx size
79: C Two limbs: dst[0..3] = src[0]^2 + src[1]^2 * 2^64 + 2 * src[0]*src[1] * 2^32. The cross product is added in twice rather than being doubled.
80: pushl %ebp
81: pushl %edi
82:
83: pushl %esi
84: pushl %ebx
85:
86: movl %eax, %ebx C ebx = src, since mull overwrites eax and edx
87: movl (%eax), %eax
88:
89: mull %eax C src[0]^2
90:
91: movl %eax, (%ecx) C dst[0]
92: movl %edx, %esi C dst[1]
93:
94: movl 4(%ebx), %eax
95:
96: mull %eax C src[1]^2
97:
98: movl %eax, %edi C dst[2]
99: movl %edx, %ebp C dst[3]
100:
101: movl (%ebx), %eax
102:
103: mull 4(%ebx) C src[0]*src[1]
104:
105: addl %eax, %esi C first addition of the cross product, into esi:edi:ebp
106: popl %ebx C src pointer no longer needed; popl leaves the carry flag alone
107:
108: adcl %edx, %edi
109:
110: adcl $0, %ebp
111: addl %esi, %eax C second addition, now accumulating in eax (dst[1]) and edx (dst[2])
112:
113: adcl %edi, %edx
114: movl %eax, 4(%ecx)
115:
116: adcl $0, %ebp
117: popl %esi
118:
119: movl %edx, 8(%ecx)
120: movl %ebp, 12(%ecx)
121:
122: popl %edi
123: popl %ebp
124:
125: ret
126:
127:
128: C -----------------------------------------------------------------------------
129: ALIGN(8)
130: L(three_or_more):
131: C eax src low limb
132: C ebx
133: C ecx dst
134: C edx size
135: C Size 3 is handled inline below, size >= 4 by the general code at four_or_more.
136: cmpl $4, %edx
137: pushl %ebx C ebx is saved on both paths; pushl and movl leave the cmpl flags intact
138: deflit(`FRAME',4)
139:
140: movl PARAM_SRC, %ebx
141: jae L(four_or_more) C size >= 4 (unsigned compare)
142:
143:
144: C -----------------------------------------------------------------------------
145: C Three limbs: the squares src[i]^2 are stored straight to dst[0..5], the three cross products are summed in registers, doubled, and then added to dst[1..5].
146: C eax src low limb
147: C ebx src
148: C ecx dst
149: C edx size
150:
151: pushl %ebp
152: pushl %edi
153:
154: mull %eax C src[0] ^ 2
155:
156: movl %eax, (%ecx)
157: movl %edx, 4(%ecx)
158:
159: movl 4(%ebx), %eax
160: xorl %ebp, %ebp C NOTE(review): ebp is overwritten by the movl from edx further down before it is read, so this zeroing looks redundant
161:
162: mull %eax C src[1] ^ 2
163:
164: movl %eax, 8(%ecx)
165: movl %edx, 12(%ecx)
166:
167: movl 8(%ebx), %eax
168: pushl %esi C risk of cache bank clash
169:
170: mull %eax C src[2] ^ 2
171:
172: movl %eax, 16(%ecx)
173: movl %edx, 20(%ecx)
174:
175: movl (%ebx), %eax
176:
177: mull 4(%ebx) C src[0] * src[1]
178:
179: movl %eax, %esi C low limb, belongs at dst[1]
180: movl %edx, %edi C high limb, start of the dst[2] sum
181:
182: movl (%ebx), %eax
183:
184: mull 8(%ebx) C src[0] * src[2]
185:
186: addl %eax, %edi
187: movl %edx, %ebp
188:
189: adcl $0, %ebp
190: movl 4(%ebx), %eax
191:
192: mull 8(%ebx) C src[1] * src[2]
193:
194: xorl %ebx, %ebx C src pointer no longer needed, ebx becomes the top limb
195: addl %eax, %ebp
196:
197: C eax
198: C ebx zero, will be dst[5]
199: C ecx dst
200: C edx dst[4]
201: C esi dst[1]
202: C edi dst[2]
203: C ebp dst[3]
204: C Below: finish the carry into edx, double esi:edi:ebp:edx with the bit shifted out going to ebx, then add the result to the squares already in dst[1..5]. The movl and popl instructions mixed into the adcl chains do not alter the carry flag.
205: adcl $0, %edx
206: addl %esi, %esi
207:
208: adcl %edi, %edi
209:
210: adcl %ebp, %ebp
211:
212: adcl %edx, %edx
213: movl 4(%ecx), %eax
214:
215: adcl $0, %ebx C ebx = bit shifted out of the doubling
216: addl %esi, %eax
217:
218: movl %eax, 4(%ecx)
219: movl 8(%ecx), %eax
220:
221: adcl %edi, %eax
222: movl 12(%ecx), %esi
223:
224: adcl %ebp, %esi
225: movl 16(%ecx), %edi
226:
227: movl %eax, 8(%ecx)
228: movl %esi, 12(%ecx)
229:
230: adcl %edx, %edi
231: popl %esi
232:
233: movl 20(%ecx), %eax
234: movl %edi, 16(%ecx)
235:
236: popl %edi
237: popl %ebp
238:
239: adcl %ebx, %eax C no carry out of this
240: popl %ebx
241:
242: movl %eax, 20(%ecx)
243:
244: ret
245:
246:
247: C -----------------------------------------------------------------------------
248: ALIGN(8)
249: L(four_or_more):
250: C eax src low limb
251: C ebx src
252: C ecx dst
253: C edx size
254: C esi
255: C edi
256: C ebp
257: C
258: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
259: C ebx is already on the stack (pushed at three_or_more); edi, esi and ebp are saved here.
260: deflit(`FRAME',4)
261:
262: pushl %edi
263: FRAME_pushl()
264: pushl %esi
265: FRAME_pushl()
266:
267: pushl %ebp
268: FRAME_pushl()
269: leal (%ecx,%edx,4), %edi C dst end of this mul1
270:
271: leal (%ebx,%edx,4), %esi C src end
272: movl %ebx, %ebp C src
273:
274: negl %edx C -size
275: xorl %ebx, %ebx C clear carry limb and carry flag
276:
277: leal 1(%edx), %ecx C -(size-1); leal does not alter the flags, so carry stays clear
278:
279: L(mul1):
280: C eax scratch
281: C ebx carry
282: C ecx counter, negative
283: C edx scratch
284: C esi &src[size]
285: C edi &dst[size]
286: C ebp src
287: C The carry flag from the addl of one iteration is folded into ebx by the adcl at the top of the next; incl and movl in between do not change it.
288: adcl $0, %ebx
289: movl (%esi,%ecx,4), %eax
290:
291: mull (%ebp) C src[0] * src[size+ecx]
292:
293: addl %eax, %ebx
294:
295: movl %ebx, (%edi,%ecx,4)
296: incl %ecx
297:
298: movl %edx, %ebx C high limb becomes the next carry limb
299: jnz L(mul1) C until ecx reaches zero, size-1 iterations; carry flag still pending on exit
300:
301:
302: C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
303: C n=1..size-2.
304: C
305: C The last two products, which are the end corner of the product
306: C triangle, are handled separately to save looping overhead. These
307: C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
308: C If size is 4 then it's only these that need to be done.
309: C
310: C In the outer loop %esi is a constant, and %edi just advances by 1
311: C limb each time. The size of the operation decreases by 1 limb
312: C each time.
313:
314: C eax
315: C ebx carry (needing carry flag added)
316: C ecx
317: C edx
318: C esi &src[size]
319: C edi &dst[size]
320: C ebp src, stepped on by one limb at the start of each outer iteration
321:
322: adcl $0, %ebx C fold in the carry flag left pending by the mul1 loop
323: movl PARAM_SIZE, %edx
324:
325: movl %ebx, (%edi) C dst[size], top limb of the src[0] products
326: subl $4, %edx
327:
328: negl %edx C -(size-4), the negated outer loop count
329: jz L(corner) C size == 4, only the corner products remain
330:
331:
332: L(outer):
333: C ebx previous carry limb to store
334: C edx outer loop counter (negative)
335: C esi &src[size]
336: C edi dst, pointing at stored carry limb of previous loop
337: C edx is needed by mull, so the counter is kept on the stack during the inner loop. This push has no FRAME_pushl, and no PARAM_ name is referenced until the matching popl.
338: pushl %edx C new outer loop counter
339: leal -2(%edx), %ecx C negated inner loop count, -(size-2) on the first pass
340:
341: movl %ebx, (%edi) C on the first pass this repeats the store made just before the loop
342: addl $4, %edi
343:
344: addl $4, %ebp C next multiplier limb
345: xorl %ebx, %ebx C initial carry limb, clear carry flag
346:
347: L(inner):
348: C eax scratch
349: C ebx carry (needing carry flag added)
350: C ecx counter, negative
351: C edx scratch
352: C esi &src[size]
353: C edi dst end of this addmul
354: C ebp &src[j]
355: C Each pass adds one product into the limb already at dst; the carry flag from the last addl stays pending until the adcl at the loop top (or the one after the loop).
356: adcl $0, %ebx
357: movl (%esi,%ecx,4), %eax
358:
359: mull (%ebp)
360:
361: addl %ebx, %eax C add the carry limb to the low product
362: movl (%edi,%ecx,4), %ebx C fetch the dst limb
363:
364: adcl $0, %edx
365: addl %eax, %ebx
366:
367: movl %ebx, (%edi,%ecx,4)
368: incl %ecx C incl does not modify the carry flag
369:
370: movl %edx, %ebx
371: jnz L(inner)
372:
373:
374: adcl $0, %ebx
375: popl %edx C outer loop counter
376:
377: incl %edx
378: jnz L(outer)
379:
380: C Store the carry limb of the last pass, then fall through into the corner code.
381: movl %ebx, (%edi)
382:
383: L(corner):
384: C esi &src[size]
385: C edi &dst[2*size-4]
386: C The first two products are added into dst[2*size-5] and dst[2*size-4]; dst[2*size-3] and dst[2*size-2] are written here for the first time.
387: movl -8(%esi), %eax
388: movl -4(%edi), %ebx C risk of data cache bank clash here
389:
390: mull -12(%esi) C src[size-2]*src[size-3]
391:
392: addl %eax, %ebx
393: movl %edx, %ecx
394:
395: adcl $0, %ecx
396: movl -4(%esi), %eax
397:
398: mull -12(%esi) C src[size-1]*src[size-3]
399:
400: addl %ecx, %eax
401: movl (%edi), %ecx
402:
403: adcl $0, %edx
404: movl %ebx, -4(%edi) C dst[2*size-5]
405:
406: addl %eax, %ecx
407: movl %edx, %ebx
408:
409: adcl $0, %ebx
410: movl -4(%esi), %eax
411:
412: mull -8(%esi) C src[size-1]*src[size-2]
413:
414: movl %ecx, 0(%edi) C dst[2*size-4]
415: addl %eax, %ebx
416:
417: adcl $0, %edx
418: movl PARAM_SIZE, %eax
419:
420: negl %eax
421: movl %ebx, 4(%edi) C dst[2*size-3]
422:
423: addl $1, %eax C -(size-1) and clear carry
424: movl %edx, 8(%edi) C dst[2*size-2]; movl keeps the carry clear for the shift loop that follows
425:
426:
427: C -----------------------------------------------------------------------------
428: C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
429: C Two limbs are shifted per iteration. The carry flag holds the bit moving from limb to limb; it is clear on entry and neither movl nor incl disturbs it.
430: L(lshift):
431: C eax counter, negative
432: C ebx next limb
433: C ecx second limb of the pair
434: C edx
435: C esi
436: C edi &dst[2*size-4]
437: C ebp
438:
439: movl 12(%edi,%eax,8), %ebx C dst[1] on the first iteration, eax being -(size-1)
440:
441: rcll %ebx
442: movl 16(%edi,%eax,8), %ecx
443:
444: rcll %ecx
445: movl %ebx, 12(%edi,%eax,8)
446:
447: movl %ecx, 16(%edi,%eax,8)
448: incl %eax
449:
450: jnz L(lshift)
451:
452:
453: adcl %eax, %eax C high bit out (eax is zero on loop exit)
454: movl PARAM_SRC, %esi
455:
456: movl PARAM_SIZE, %ecx C risk of cache bank clash
457: movl %eax, 12(%edi) C dst most significant limb
458:
459:
460: C -----------------------------------------------------------------------------
461: C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
462: C src[size-1]^2. dst[0] has not been set at all yet, and just gets the
463: C low limb of src[0]^2.
464: C On entry esi = src and ecx = size, loaded at the end of the shift code.
465: movl (%esi), %eax C src[0]
466: leal (%esi,%ecx,4), %esi C src end
467:
468: negl %ecx
469:
470: mull %eax
471:
472: movl %eax, 16(%edi,%ecx,8) C dst[0]
473: movl %edx, %ebx
474:
475: addl $1, %ecx C -(size-1) and clear carry
476:
477: L(diag):
478: C eax scratch (low product)
479: C ebx carry limb
480: C ecx counter, negative
481: C edx scratch (high product)
482: C esi &src[size]
483: C edi &dst[2*size-4]
484: C ebp scratch (fetched dst limbs)
485: C Iteration i (counting from 1) adds the previous high limb plus pending carry to dst[2*i-1], and the low limb of src[i]^2 to dst[2*i].
486: movl (%esi,%ecx,4), %eax
487: adcl $0, %ebx
488:
489: mull %eax
490:
491: movl 16-4(%edi,%ecx,8), %ebp
492:
493: addl %ebp, %ebx
494: movl 16(%edi,%ecx,8), %ebp
495:
496: adcl %eax, %ebp C carry out stays pending for the adcl at the loop top, or after the loop
497: movl %ebx, 16-4(%edi,%ecx,8)
498:
499: movl %ebp, 16(%edi,%ecx,8)
500: incl %ecx C incl does not modify the carry flag
501:
502: movl %edx, %ebx
503: jnz L(diag)
504:
505:
506: adcl $0, %edx
507: movl 16-4(%edi), %eax C dst most significant limb
508:
509: addl %eax, %edx
510: popl %ebp
511:
512: movl %edx, 16-4(%edi)
513: popl %esi C risk of cache bank clash
514:
515: popl %edi
516: popl %ebx C pushed at three_or_more
517:
518: ret
519:
520: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>