Annotation of OpenXM_contrib/gmp/mpn/x86/p6/sqr_basecase.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
! 26: C product (measured on the speed difference between 20 and 40 limbs,
! 27: C which is the Karatsuba recursing range).
! 28:
! 29:
1.1 maekawa 30: dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
31: dnl a description. The only difference here is that UNROLL_COUNT can go up
1.1.1.2 ! ohara 32: dnl to 64 (not 63) making SQR_KARATSUBA_THRESHOLD_MAX 67.
1.1 maekawa 33:
1.1.1.2 ! ohara 34: deflit(SQR_KARATSUBA_THRESHOLD_MAX, 67)
1.1 maekawa 35:
1.1.1.2 ! ohara 36: ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
! 37: `define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
1.1 maekawa 38:
1.1.1.2 ! ohara 39: m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
! 40: deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
1.1 maekawa 41:
42:
43: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
44: C
45: C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
46: C lot of function call overheads are avoided, especially when the given size
47: C is small.
48: C
49: C The code size might look a bit excessive, but not all of it is executed so
50: C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
51: C clearly apply only to those sizes; mid sizes like 10x10 only need part of
52: C the unrolled addmul; and big sizes like 40x40 that do use the full
53: C unrolling will least be making good use of it, because 40x40 will take
54: C something like 7000 cycles.
55:
56: defframe(PARAM_SIZE,12)
57: defframe(PARAM_SRC, 8)
58: defframe(PARAM_DST, 4)
59:
1.1.1.2 ! ohara 60: TEXT
1.1 maekawa 61: ALIGN(32)
62: PROLOGUE(mpn_sqr_basecase)
63: deflit(`FRAME',0)
64:
65: movl PARAM_SIZE, %edx
66:
67: movl PARAM_SRC, %eax
68:
69: cmpl $2, %edx
70: movl PARAM_DST, %ecx
71: je L(two_limbs)
72:
73: movl (%eax), %eax
74: ja L(three_or_more)
75:
76:
77: C -----------------------------------------------------------------------------
78: C one limb only
79: C eax src limb
80: C ebx
81: C ecx dst
82: C edx
83:
84: mull %eax
85:
86: movl %eax, (%ecx)
87: movl %edx, 4(%ecx)
88:
89: ret
90:
91:
92: C -----------------------------------------------------------------------------
93: L(two_limbs):
94: C eax src
95: C ebx
96: C ecx dst
97: C edx
98:
99: defframe(SAVE_ESI, -4)
100: defframe(SAVE_EBX, -8)
101: defframe(SAVE_EDI, -12)
102: defframe(SAVE_EBP, -16)
103: deflit(`STACK_SPACE',16)
104:
105: subl $STACK_SPACE, %esp
106: deflit(`FRAME',STACK_SPACE)
107:
108: movl %esi, SAVE_ESI
109: movl %eax, %esi
110: movl (%eax), %eax
111:
112: mull %eax C src[0]^2
113:
114: movl %eax, (%ecx) C dst[0]
115: movl 4(%esi), %eax
116:
117: movl %ebx, SAVE_EBX
118: movl %edx, %ebx C dst[1]
119:
120: mull %eax C src[1]^2
121:
122: movl %edi, SAVE_EDI
123: movl %eax, %edi C dst[2]
124: movl (%esi), %eax
125:
126: movl %ebp, SAVE_EBP
127: movl %edx, %ebp C dst[3]
128:
129: mull 4(%esi) C src[0]*src[1]
130:
131: addl %eax, %ebx
132: movl SAVE_ESI, %esi
133:
134: adcl %edx, %edi
135:
136: adcl $0, %ebp
137: addl %ebx, %eax
138: movl SAVE_EBX, %ebx
139:
140: adcl %edi, %edx
141: movl SAVE_EDI, %edi
142:
143: adcl $0, %ebp
144:
145: movl %eax, 4(%ecx)
146:
147: movl %ebp, 12(%ecx)
148: movl SAVE_EBP, %ebp
149:
150: movl %edx, 8(%ecx)
151: addl $FRAME, %esp
152:
153: ret
154:
155:
156: C -----------------------------------------------------------------------------
157: L(three_or_more):
158: C eax src low limb
159: C ebx
160: C ecx dst
161: C edx size
162: deflit(`FRAME',0)
163:
164: pushl %esi defframe_pushl(`SAVE_ESI')
165: cmpl $4, %edx
166:
167: movl PARAM_SRC, %esi
168: jae L(four_or_more)
169:
170:
171: C -----------------------------------------------------------------------------
172: C three limbs
173:
174: C eax src low limb
175: C ebx
176: C ecx dst
177: C edx
178: C esi src
179: C edi
180: C ebp
181:
182: pushl %ebp defframe_pushl(`SAVE_EBP')
183: pushl %edi defframe_pushl(`SAVE_EDI')
184:
185: mull %eax C src[0] ^ 2
186:
187: movl %eax, (%ecx)
188: movl %edx, 4(%ecx)
189:
190: movl 4(%esi), %eax
191: xorl %ebp, %ebp
192:
193: mull %eax C src[1] ^ 2
194:
195: movl %eax, 8(%ecx)
196: movl %edx, 12(%ecx)
197: movl 8(%esi), %eax
198:
199: pushl %ebx defframe_pushl(`SAVE_EBX')
200:
201: mull %eax C src[2] ^ 2
202:
203: movl %eax, 16(%ecx)
204: movl %edx, 20(%ecx)
205:
206: movl (%esi), %eax
207:
208: mull 4(%esi) C src[0] * src[1]
209:
210: movl %eax, %ebx
211: movl %edx, %edi
212:
213: movl (%esi), %eax
214:
215: mull 8(%esi) C src[0] * src[2]
216:
217: addl %eax, %edi
218: movl %edx, %ebp
219:
220: adcl $0, %ebp
221: movl 4(%esi), %eax
222:
223: mull 8(%esi) C src[1] * src[2]
224:
225: xorl %esi, %esi
226: addl %eax, %ebp
227:
228: C eax
229: C ebx dst[1]
230: C ecx dst
231: C edx dst[4]
232: C esi zero, will be dst[5]
233: C edi dst[2]
234: C ebp dst[3]
235:
236: adcl $0, %edx
237: addl %ebx, %ebx
238:
239: adcl %edi, %edi
240:
241: adcl %ebp, %ebp
242:
243: adcl %edx, %edx
244: movl 4(%ecx), %eax
245:
246: adcl $0, %esi
247: addl %ebx, %eax
248:
249: movl %eax, 4(%ecx)
250: movl 8(%ecx), %eax
251:
252: adcl %edi, %eax
253: movl 12(%ecx), %ebx
254:
255: adcl %ebp, %ebx
256: movl 16(%ecx), %edi
257:
258: movl %eax, 8(%ecx)
259: movl SAVE_EBP, %ebp
260:
261: movl %ebx, 12(%ecx)
262: movl SAVE_EBX, %ebx
263:
264: adcl %edx, %edi
265: movl 20(%ecx), %eax
266:
267: movl %edi, 16(%ecx)
268: movl SAVE_EDI, %edi
269:
270: adcl %esi, %eax C no carry out of this
271: movl SAVE_ESI, %esi
272:
273: movl %eax, 20(%ecx)
274: addl $FRAME, %esp
275:
276: ret
277:
278:
279:
280: C -----------------------------------------------------------------------------
281: defframe(VAR_COUNTER,-20)
282: defframe(VAR_JMP, -24)
283: deflit(`STACK_SPACE',24)
284:
285: L(four_or_more):
286: C eax src low limb
287: C ebx
288: C ecx
289: C edx size
290: C esi src
291: C edi
292: C ebp
293: deflit(`FRAME',4) dnl %esi already pushed
294:
295: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
296:
297: subl $STACK_SPACE-FRAME, %esp
298: deflit(`FRAME',STACK_SPACE)
299: movl $1, %ecx
300:
301: movl %edi, SAVE_EDI
302: movl PARAM_DST, %edi
303:
304: movl %ebx, SAVE_EBX
305: subl %edx, %ecx C -(size-1)
306:
307: movl %ebp, SAVE_EBP
308: movl $0, %ebx C initial carry
309:
310: leal (%esi,%edx,4), %esi C &src[size]
311: movl %eax, %ebp C multiplier
312:
313: leal -4(%edi,%edx,4), %edi C &dst[size-1]
314:
315:
316: C This loop runs at just over 6 c/l.
317:
318: L(mul_1):
319: C eax scratch
320: C ebx carry
321: C ecx counter, limbs, negative, -(size-1) to -1
322: C edx scratch
323: C esi &src[size]
324: C edi &dst[size-1]
325: C ebp multiplier
326:
327: movl %ebp, %eax
328:
329: mull (%esi,%ecx,4)
330:
331: addl %ebx, %eax
332: movl $0, %ebx
333:
334: adcl %edx, %ebx
335: movl %eax, 4(%edi,%ecx,4)
336:
337: incl %ecx
338: jnz L(mul_1)
339:
340:
341: movl %ebx, 4(%edi)
342:
343:
344: C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
345: C
346: C The last two addmuls, which are the bottom right corner of the product
347: C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
348: C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
349: C cases that need to be done.
350: C
351: C The unrolled code is the same as mpn_addmul_1(), see that routine for some
352: C comments.
353: C
354: C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
355: C
356: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
357: C chunk each outer loop.
358:
359: dnl This is also hard-coded in the address calculation below.
360: deflit(CODE_BYTES_PER_LIMB, 15)
361:
362: dnl With &src[size] and &dst[size-1] pointers, the displacements in the
363: dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
364: dnl that an offset must be added to them.
365: deflit(OFFSET,
366: ifelse(eval(UNROLL_COUNT>32),1,
367: eval((UNROLL_COUNT-32)*4),
368: 0))
369:
370: C eax
371: C ebx carry
372: C ecx
373: C edx
374: C esi &src[size]
375: C edi &dst[size-1]
376: C ebp
377:
378: movl PARAM_SIZE, %ecx
379:
380: subl $4, %ecx
381: jz L(corner)
382:
383: movl %ecx, %edx
384: negl %ecx
385:
386: shll $4, %ecx
387: ifelse(OFFSET,0,,`subl $OFFSET, %esi')
388:
389: ifdef(`PIC',`
390: call L(pic_calc)
391: L(here):
392: ',`
393: leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
394: ')
395: negl %edx
396:
397: ifelse(OFFSET,0,,`subl $OFFSET, %edi')
398:
399: C The calculated jump mustn't be before the start of the available
400: C code. This is the limit that UNROLL_COUNT puts on the src operand
401: C size, but checked here using the jump address directly.
402:
403: ASSERT(ae,
404: `movl_text_address( L(unroll_inner_start), %eax)
405: cmpl %eax, %ecx')
406:
407:
408: C -----------------------------------------------------------------------------
409: ALIGN(16)
410: L(unroll_outer_top):
411: C eax
412: C ebx high limb to store
413: C ecx VAR_JMP
414: C edx VAR_COUNTER, limbs, negative
415: C esi &src[size], constant
416: C edi dst ptr, second highest limb of last addmul
417: C ebp
418:
419: movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
420: movl %edx, VAR_COUNTER
421:
422: movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
423:
424: mull %ebp
425:
426: define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
427:
428: testb $1, %cl
429:
430: movl %edx, %ebx C high carry
431: leal 4(%edi), %edi
432:
433: movl %ecx, %edx C jump
434:
435: movl %eax, %ecx C low carry
436: leal CODE_BYTES_PER_LIMB(%edx), %edx
437:
438: cmovX( %ebx, %ecx) C high carry reverse
439: cmovX( %eax, %ebx) C low carry reverse
440: movl %edx, VAR_JMP
441: jmp *%edx
442:
443:
444: C Must be on an even address here so the low bit of the jump address
445: C will indicate which way around ecx/ebx should start.
446:
447: ALIGN(2)
448:
449: L(unroll_inner_start):
450: C eax scratch
451: C ebx carry high
452: C ecx carry low
453: C edx scratch
454: C esi src pointer
455: C edi dst pointer
456: C ebp multiplier
457: C
458: C 15 code bytes each limb
459: C ecx/ebx reversed on each chunk
460:
461: forloop(`i', UNROLL_COUNT, 1, `
462: deflit(`disp_src', eval(-i*4 + OFFSET))
463: deflit(`disp_dst', eval(disp_src))
464:
465: m4_assert(`disp_src>=-128 && disp_src<128')
466: m4_assert(`disp_dst>=-128 && disp_dst<128')
467:
468: ifelse(eval(i%2),0,`
469: Zdisp( movl, disp_src,(%esi), %eax)
470: mull %ebp
471: Zdisp( addl, %ebx, disp_dst,(%edi))
472: adcl %eax, %ecx
473: movl %edx, %ebx
474: adcl $0, %ebx
475: ',`
476: dnl this one comes out last
477: Zdisp( movl, disp_src,(%esi), %eax)
478: mull %ebp
479: Zdisp( addl, %ecx, disp_dst,(%edi))
480: adcl %eax, %ebx
481: movl %edx, %ecx
482: adcl $0, %ecx
483: ')
484: ')
485: L(unroll_inner_end):
486:
487: addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
488:
489: movl VAR_COUNTER, %edx
490: adcl $0, %ecx
491:
492: movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
493: movl VAR_JMP, %ecx
494:
495: incl %edx
496: jnz L(unroll_outer_top)
497:
498:
499: ifelse(OFFSET,0,,`
500: addl $OFFSET, %esi
501: addl $OFFSET, %edi
502: ')
503:
504:
505: C -----------------------------------------------------------------------------
506: ALIGN(16)
507: L(corner):
508: C eax
509: C ebx
510: C ecx
511: C edx
512: C esi &src[size]
513: C edi &dst[2*size-5]
514: C ebp
515:
516: movl -12(%esi), %eax
517:
518: mull -8(%esi)
519:
520: addl %eax, (%edi)
521: movl -12(%esi), %eax
522: movl $0, %ebx
523:
524: adcl %edx, %ebx
525:
526: mull -4(%esi)
527:
528: addl %eax, %ebx
529: movl -8(%esi), %eax
530:
531: adcl $0, %edx
532:
533: addl %ebx, 4(%edi)
534: movl $0, %ebx
535:
536: adcl %edx, %ebx
537:
538: mull -4(%esi)
539:
540: movl PARAM_SIZE, %ecx
541: addl %ebx, %eax
542:
543: adcl $0, %edx
544:
545: movl %eax, 8(%edi)
546:
547: movl %edx, 12(%edi)
548: movl PARAM_DST, %edi
549:
550:
551: C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
552:
553: subl $1, %ecx C size-1
554: xorl %eax, %eax C ready for final adcl, and clear carry
555:
556: movl %ecx, %edx
557: movl PARAM_SRC, %esi
558:
559:
560: L(lshift):
561: C eax
562: C ebx
563: C ecx counter, size-1 to 1
564: C edx size-1 (for later use)
565: C esi src (for later use)
566: C edi dst, incrementing
567: C ebp
568:
569: rcll 4(%edi)
570: rcll 8(%edi)
571:
572: leal 8(%edi), %edi
573: decl %ecx
574: jnz L(lshift)
575:
576:
577: adcl %eax, %eax
578:
579: movl %eax, 4(%edi) C dst most significant limb
580: movl (%esi), %eax C src[0]
581:
582: leal 4(%esi,%edx,4), %esi C &src[size]
583: subl %edx, %ecx C -(size-1)
584:
585:
586: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
587: C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
588: C low limb of src[0]^2.
589:
590:
591: mull %eax
592:
593: movl %eax, (%edi,%ecx,8) C dst[0]
594:
595:
596: L(diag):
597: C eax scratch
598: C ebx scratch
599: C ecx counter, negative
600: C edx carry
601: C esi &src[size]
602: C edi dst[2*size-2]
603: C ebp
604:
605: movl (%esi,%ecx,4), %eax
606: movl %edx, %ebx
607:
608: mull %eax
609:
610: addl %ebx, 4(%edi,%ecx,8)
611: adcl %eax, 8(%edi,%ecx,8)
612: adcl $0, %edx
613:
614: incl %ecx
615: jnz L(diag)
616:
617:
618: movl SAVE_ESI, %esi
619: movl SAVE_EBX, %ebx
620:
621: addl %edx, 4(%edi) C dst most significant limb
622:
623: movl SAVE_EDI, %edi
624: movl SAVE_EBP, %ebp
625: addl $FRAME, %esp
626: ret
627:
628:
629:
630: C -----------------------------------------------------------------------------
631: ifdef(`PIC',`
632: L(pic_calc):
633: addl (%esp), %ecx
634: addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
635: addl %edx, %ecx
636: ret
637: ')
638:
639:
640: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>