Annotation of OpenXM_contrib/gmp/mpn/x86/k6/sqr_basecase.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
dnl All of the following is processed by m4 at build time; it defines
dnl tuning constants only and emits no instructions itself.
 23: 
 24: 
1.1.1.2 ! ohara      25: C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
        !            26: C     product (measured on the speed difference between 17 and 33 limbs,
        !            27: C     which is roughly the Karatsuba recursing range).
        !            28: 
        !            29: 
        !            30: dnl SQR_KARATSUBA_THRESHOLD_MAX is the maximum SQR_KARATSUBA_THRESHOLD this
1.1       maekawa    31: dnl code supports.  This value is used only by the tune program to know
         32: dnl what it can go up to.  (An attempt to compile with a bigger value will
         33: dnl trigger some m4_assert()s in the code, making the build fail.)
         34: dnl
         35: dnl The value is determined by requiring the displacements in the unrolled
         36: dnl addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
1.1.1.2 ! ohara      37: dnl 63, giving a maximum SQR_KARATSUBA_THRESHOLD of 66.
1.1       maekawa    38: 
1.1.1.2 ! ohara      39: deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66)
1.1       maekawa    40: 
         41: 
         42: dnl Allow a value from the tune program to override config.m4.
         43: 
1.1.1.2 ! ohara      44: ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
        !            45: `define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
1.1       maekawa    46: 
         47: 
         48: dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
1.1.1.2 ! ohara      49: dnl number required is determined by SQR_KARATSUBA_THRESHOLD, since
        !            50: dnl mpn_sqr_basecase only needs to handle sizes < SQR_KARATSUBA_THRESHOLD.
1.1       maekawa    51: dnl
         52: dnl The first addmul is the biggest, and this takes the second least
         53: dnl significant limb and multiplies it by the third least significant and
1.1.1.2 ! ohara      54: dnl up.  Hence for a maximum operand size of SQR_KARATSUBA_THRESHOLD-1
        !            55: dnl limbs, UNROLL_COUNT needs to be SQR_KARATSUBA_THRESHOLD-3.
1.1       maekawa    56: 
1.1.1.2 ! ohara      57: m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
        !            58: deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
1.1 maekawa 59:
60:
61: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
62: C
63: C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
64: C lot of function call overheads are avoided, especially when the given size
65: C is small.
66: C
67: C The code size might look a bit excessive, but not all of it is executed
68: C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
69: C clearly apply only to those sizes; mid sizes like 10x10 only need part of
70: C the unrolled addmul; and big sizes like 35x35 that do need all of it will
71: C at least be getting value for money, because 35x35 spends something like
72: C 5780 cycles here.
73: C
74: C Different values of UNROLL_COUNT give slightly different speeds, between
75: C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
76: C This isn't a big difference, but it's presumably some alignment effect
77: C which if understood could give a simple speedup.
78:
C Incoming stack arguments relative to %esp at function entry (FRAME=0):
C dst at 4(%esp), src at 8(%esp), size at 12(%esp) — matching the
C C prototype order (dst, src, size) with cdecl right-to-left pushing.
         79: defframe(PARAM_SIZE,12)
         80: defframe(PARAM_SRC, 8)
         81: defframe(PARAM_DST, 4)
82:
1.1.1.2 ! ohara      83: 	TEXT
1.1       maekawa    84: 	ALIGN(32)
         85: PROLOGUE(mpn_sqr_basecase)
         86: deflit(`FRAME',0)
         87: 
C Dispatch on the operand size: 1, 2 and 3 limbs get straight-line
C special cases below; 4 or more limbs fall through to the general
C mul_1 + unrolled addmul + shift + diagonal code.
         88: 	movl	PARAM_SIZE, %ecx
         89: 	movl	PARAM_SRC, %eax
         90: 
         91: 	cmpl	$2, %ecx
         92: 	je	L(two_limbs)
         93: 
         94: 	movl	PARAM_DST, %edx
         95: 	ja	L(three_or_more)
         96: 
         97: 
         98: C -----------------------------------------------------------------------------
         99: C one limb only
        100: C eax	src
        101: C ebx
        102: C ecx	size
        103: C edx	dst
        104: 
        105: 	movl	(%eax), %eax
        106: 	movl	%edx, %ecx
        107: 
        108: 	mull	%eax
        109: 
        110: 	movl	%eax, (%ecx)
        111: 	movl	%edx, 4(%ecx)
        112: 	ret
        113: 
        114: 
        115: C -----------------------------------------------------------------------------
        116: 	ALIGN(16)
        117: L(two_limbs):
        118: C eax	src
        119: C ebx
        120: C ecx	size
        121: C edx	dst
        122: 
        123: 	pushl	%ebx
        124: 	movl	%eax, %ebx	C src
        125: deflit(`FRAME',4)
        126: 
        127: 	movl	(%ebx), %eax
        128: 	movl	PARAM_DST, %ecx
        129: 
        130: 	mull	%eax		C src[0]^2
        131: 
        132: 	movl	%eax, (%ecx)
        133: 	movl	4(%ebx), %eax
        134: 
        135: 	movl	%edx, 4(%ecx)
        136: 
        137: 	mull	%eax		C src[1]^2
        138: 
        139: 	movl	%eax, 8(%ecx)
        140: 	movl	(%ebx), %eax
        141: 
        142: 	movl	%edx, 12(%ecx)
        143: 	movl	4(%ebx), %edx
        144: 
        145: 	mull	%edx		C src[0]*src[1]
        146: 
        147: 	addl	%eax, 4(%ecx)
        148: 
        149: 	adcl	%edx, 8(%ecx)
        150: 	adcl	$0, 12(%ecx)
        151: 
C The cross product is deliberately added in a second time below,
C producing the doubled 2*src[0]*src[1] middle term of the square.
        152: 	popl	%ebx
        153: 	addl	%eax, 4(%ecx)
        154: 
        155: 	adcl	%edx, 8(%ecx)
        156: 	adcl	$0, 12(%ecx)
        157: 
        158: 	ret
        159: 
        160: 
        161: C -----------------------------------------------------------------------------
        162: L(three_or_more):
        163: deflit(`FRAME',0)
        164: 	cmpl	$4, %ecx
        165: 	jae	L(four_or_more)
        166: 
        167: 
        168: C -----------------------------------------------------------------------------
        169: C three limbs
        170: C eax	src
        171: C ecx	size
        172: C edx	dst
        173: 
        174: 	pushl	%ebx
        175: 	movl	%eax, %ebx	C src
        176: 
        177: 	movl	(%ebx), %eax
        178: 	movl	%edx, %ecx	C dst
        179: 
        180: 	mull	%eax		C src[0] ^ 2
        181: 
        182: 	movl	%eax, (%ecx)
        183: 	movl	4(%ebx), %eax
        184: 
        185: 	movl	%edx, 4(%ecx)
        186: 	pushl	%esi
        187: 
        188: 	mull	%eax		C src[1] ^ 2
        189: 
        190: 	movl	%eax, 8(%ecx)
        191: 	movl	8(%ebx), %eax
        192: 
        193: 	movl	%edx, 12(%ecx)
        194: 	pushl	%edi
        195: 
        196: 	mull	%eax		C src[2] ^ 2
        197: 
        198: 	movl	%eax, 16(%ecx)
        199: 	movl	(%ebx), %eax
        200: 
        201: 	movl	%edx, 20(%ecx)
        202: 	movl	4(%ebx), %edx
        203: 
        204: 	mull	%edx		C src[0] * src[1]
        205: 
        206: 	movl	%eax, %esi
        207: 	movl	(%ebx), %eax
        208: 
        209: 	movl	%edx, %edi
        210: 	movl	8(%ebx), %edx
        211: 
        212: 	pushl	%ebp
        213: 	xorl	%ebp, %ebp
        214: 
        215: 	mull	%edx		C src[0] * src[2]
        216: 
        217: 	addl	%eax, %edi
        218: 	movl	4(%ebx), %eax
        219: 
        220: 	adcl	%edx, %ebp
        221: 
        222: 	movl	8(%ebx), %edx
        223: 
        224: 	mull	%edx		C src[1] * src[2]
        225: 
        226: 	addl	%eax, %ebp
        227: 
        228: 	adcl	$0, %edx
        229: 
        230: 
        231: C eax	will be dst[5]
        232: C ebx
        233: C ecx	dst
        234: C edx	dst[4]
        235: C esi	dst[1]
        236: C edi	dst[2]
        237: C ebp	dst[3]
        238: 
C Double the accumulated cross products by adding each register to
C itself, the carries rippling upward; eax collects the final carry out.
        239: 	xorl	%eax, %eax
        240: 	addl	%esi, %esi
        241: 	adcl	%edi, %edi
        242: 	adcl	%ebp, %ebp
        243: 	adcl	%edx, %edx
        244: 	adcl	$0, %eax
        245: 
        246: 	addl	%esi, 4(%ecx)
        247: 	adcl	%edi, 8(%ecx)
        248: 	adcl	%ebp, 12(%ecx)
        249: 
        250: 	popl	%ebp
        251: 	popl	%edi
        252: 
        253: 	adcl	%edx, 16(%ecx)
        254: 
        255: 	popl	%esi
        256: 	popl	%ebx
        257: 
        258: 	adcl	%eax, 20(%ecx)
        259: 	ASSERT(nc)
        260: 
        261: 	ret
        262: 
        263: 
        264: C -----------------------------------------------------------------------------
        265: 
        266: defframe(SAVE_EBX,   -4)
        267: defframe(SAVE_ESI,   -8)
        268: defframe(SAVE_EDI,  -12)
        269: defframe(SAVE_EBP,  -16)
        270: defframe(VAR_COUNTER,-20)
        271: defframe(VAR_JMP,   -24)
        272: deflit(STACK_SPACE, 24)
        273: 
        274: 	ALIGN(16)
        275: L(four_or_more):
        276: 
        277: C eax	src
        278: C ebx
        279: C ecx	size
        280: C edx	dst
        281: C esi
        282: C edi
        283: C ebp
        284: 
        285: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
        286: C
        287: C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
        288: C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
        289: C a 5780 cycle operation, which is not surprising since the loop here is 8
        290: C c/l and mpn_mul_1 is 6.25 c/l.
        291: 
        292: 	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
        293: 
        294: 	movl	%edi, SAVE_EDI
        295: 	leal	4(%edx), %edi
        296: 
        297: 	movl	%ebx, SAVE_EBX
        298: 	leal	4(%eax), %ebx
        299: 
        300: 	movl	%esi, SAVE_ESI
        301: 	xorl	%esi, %esi
        302: 
        303: 	movl	%ebp, SAVE_EBP
        304: 
        305: C eax
        306: C ebx	src+4
        307: C ecx	size
        308: C edx
        309: C esi
        310: C edi	dst+4
        311: C ebp
        312: 
        313: 	movl	(%eax), %ebp	C multiplier
        314: 	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary
        315: 
        316: 
        317: 	ALIGN(16)
        318: L(mul_1):
        319: C eax	scratch
        320: C ebx	src ptr
        321: C ecx	counter
        322: C edx	scratch
        323: C esi	carry
        324: C edi	dst ptr
        325: C ebp	multiplier
        326: 
        327: 	movl	(%ebx), %eax
        328: 	addl	$4, %ebx
        329: 
        330: 	mull	%ebp
        331: 
        332: 	addl	%esi, %eax
        333: 	movl	$0, %esi
        334: 
        335: 	adcl	%edx, %esi
        336: 
        337: 	movl	%eax, (%edi)
        338: 	addl	$4, %edi
        339: 
C NOTE(review): "loop" is used deliberately here; it is cheap on K6,
C though slow on many other x86 cores.
        340: 	loop	L(mul_1)
        341: 
        342: 
        343: C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
        344: C
        345: C The last two addmuls, which are the bottom right corner of the product
        346: C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
        347: C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
        348: C cases that need to be done.
        349: C
        350: C The unrolled code is the same as mpn_addmul_1(), see that routine for some
        351: C comments.
        352: C
        353: C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
        354: C
        355: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
        356: C chunk each outer loop.
        357: C
        358: C K6 doesn't do any branch prediction on indirect jumps, which is good
        359: C actually because it's a different target each time.  The unrolled addmul
        360: C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
        361: C the indirect jump is quickly recovered.
        362: 
        363: 
        364: dnl  This value is also implicitly encoded in a shift and add.
        365: dnl
        366: deflit(CODE_BYTES_PER_LIMB, 15)
        367: 
        368: dnl  With the unmodified &src[size] and &dst[size] pointers, the
        369: dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
        370: dnl  values up to 31.  Above that an offset must be added to them.
        371: dnl
        372: deflit(OFFSET,
        373: ifelse(eval(UNROLL_COUNT>31),1,
        374: eval((UNROLL_COUNT-31)*4),
        375: 0))
        376: 
        377: C eax
        378: C ebx	&src[size]
        379: C ecx
        380: C edx
        381: C esi	carry
        382: C edi	&dst[size]
        383: C ebp
        384: 
        385: 	movl	PARAM_SIZE, %ecx
        386: 	movl	%esi, (%edi)
        387: 
        388: 	subl	$4, %ecx
        389: 	jz	L(corner)
        390: 
        391: 	movl	%ecx, %edx
        392: ifelse(OFFSET,0,,
        393: `	subl	$OFFSET, %ebx')
        394: 
        395: 	shll	$4, %ecx
        396: ifelse(OFFSET,0,,
        397: `	subl	$OFFSET, %edi')
        398: 
        399: 	negl	%ecx
        400: 
        401: ifdef(`PIC',`
        402: 	call	L(pic_calc)
        403: L(here):
        404: ',`
        405: 	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
        406: ')
        407: 	negl	%edx
        408: 
        409: 
        410: 	C The calculated jump mustn't be before the start of the available
        411: 	C code.  This is the limitation UNROLL_COUNT puts on the src operand
        412: 	C size, but checked here using the jump address directly.
        413: 	C
        414: 	ASSERT(ae,`
        415: 	movl_text_address( L(unroll_inner_start), %eax)
        416: 	cmpl	%eax, %ecx
        417: 	')
        418: 
        419: 
        420: C -----------------------------------------------------------------------------
        421: 	ALIGN(16)
        422: L(unroll_outer_top):
        423: C eax
        424: C ebx	&src[size], constant
        425: C ecx	VAR_JMP
        426: C edx	VAR_COUNTER, limbs, negative
        427: C esi	high limb to store
        428: C edi	dst ptr, high of last addmul
        429: C ebp
        430: 
        431: 	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
        432: 	movl	%edx, VAR_COUNTER
        433: 
        434: 	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand
        435: 
        436: 	mull	%ebp
        437: 
        438: 	testb	$1, %cl
        439: 
        440: 	movl	%edx, %esi	C high carry
        441: 	movl	%ecx, %edx	C jump
        442: 
        443: 	movl	%eax, %ecx	C low carry
        444: 	leal	CODE_BYTES_PER_LIMB(%edx), %edx
        445: 
        446: 	movl	%edx, VAR_JMP
        447: 	leal	4(%edi), %edi
        448: 
        449: 	C A branch-free version of this using some xors was found to be a
        450: 	C touch slower than just a conditional jump, despite the jump
        451: 	C switching between taken and not taken on every loop.
        452: 
        453: ifelse(eval(UNROLL_COUNT%2),0,
        454: 	jz,jnz)	L(unroll_noswap)
        455: 	movl	%esi, %eax	C high,low carry other way around
        456: 
        457: 	movl	%ecx, %esi
        458: 	movl	%eax, %ecx
        459: L(unroll_noswap):
        460: 
        461: 	jmp	*%edx
        462: 
        463: 
        464: 	C Must be on an even address here so the low bit of the jump address
        465: 	C will indicate which way around ecx/esi should start.
        466: 	C
        467: 	C An attempt was made at padding here to get the end of the unrolled
        468: 	C code to come out on a good alignment, to save padding before
        469: 	C L(corner).  This worked, but turned out to run slower than just an
        470: 	C ALIGN(2).  The reason for this is not clear, it might be related
        471: 	C to the different speeds on different UNROLL_COUNTs noted above.
        472: 
        473: 	ALIGN(2)
        474: 
        475: L(unroll_inner_start):
        476: 	C eax	scratch
        477: 	C ebx	src
        478: 	C ecx	carry low
        479: 	C edx	scratch
        480: 	C esi	carry high
        481: 	C edi	dst
        482: 	C ebp	multiplier
        483: 	C
        484: 	C 15 code bytes each limb
        485: 	C ecx/esi swapped on each chunk
        486: 
        487: forloop(`i', UNROLL_COUNT, 1, `
        488: 	deflit(`disp_src', eval(-i*4 + OFFSET))
        489: 	deflit(`disp_dst', eval(disp_src - 4))
        490: 
        491: 	m4_assert(`disp_src>=-128 && disp_src<128')
        492: 	m4_assert(`disp_dst>=-128 && disp_dst<128')
        493: 
        494: ifelse(eval(i%2),0,`
        495: Zdisp(	movl,	disp_src,(%ebx), %eax)
        496: 	mull	%ebp
        497: Zdisp(	addl,	%esi, disp_dst,(%edi))
        498: 	adcl	%eax, %ecx
        499: 	movl	%edx, %esi
        500: 	jadcl0( %esi)
        501: ',`
        502: dnl  this one comes out last
        503: Zdisp(	movl,	disp_src,(%ebx), %eax)
        504: 	mull	%ebp
        505: Zdisp(	addl,	%ecx, disp_dst,(%edi))
        506: 	adcl	%eax, %esi
        507: 	movl	%edx, %ecx
        508: 	jadcl0( %ecx)
        509: ')
        510: ')
        511: L(unroll_inner_end):
        512: 
        513: 	addl	%esi, -4+OFFSET(%edi)
        514: 
        515: 	movl	VAR_COUNTER, %edx
        516: 	jadcl0(	%ecx)
        517: 
        518: 	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
        519: 	movl	VAR_JMP, %ecx
        520: 
        521: 	incl	%edx
        522: 	jnz	L(unroll_outer_top)
        523: 
        524: 
        525: ifelse(OFFSET,0,,`
        526: 	addl	$OFFSET, %ebx
        527: 	addl	$OFFSET, %edi
        528: ')
        529: 
        530: 
        531: C -----------------------------------------------------------------------------
        532: 	ALIGN(16)
        533: L(corner):
        534: C ebx	&src[size]
        535: C edi	&dst[2*size-5]
        536: 
C ebp = src[size-3]; ecx and ebx below pick up src[size-2] and
C src[size-1] respectively, for the two remaining corner addmuls.
        537: 	movl	-12(%ebx), %ebp
        538: 
        539: 	movl	-8(%ebx), %eax
        540: 	movl	%eax, %ecx
        541: 
        542: 	mull	%ebp
        543: 
        544: 	addl	%eax, -4(%edi)
        545: 	adcl	$0, %edx
        546: 
        547: 	movl	-4(%ebx), %eax
        548: 	movl	%edx, %esi
        549: 	movl	%eax, %ebx
        550: 
        551: 	mull	%ebp
        552: 
        553: 	addl	%esi, %eax
        554: 	adcl	$0, %edx
        555: 
        556: 	addl	%eax, (%edi)
        557: 	adcl	$0, %edx
        558: 
        559: 	movl	%edx, %esi
        560: 	movl	%ebx, %eax
        561: 
        562: 	mull	%ecx
        563: 
        564: 	addl	%esi, %eax
        565: 	movl	%eax, 4(%edi)
        566: 
        567: 	adcl	$0, %edx
        568: 
        569: 	movl	%edx, 8(%edi)
        570: 
        571: 
        572: C -----------------------------------------------------------------------------
        573: C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
        574: C The loop measures about 6 cycles/iteration, though it looks like it should
        575: C decode in 5.
        576: 
        577: L(lshift_start):
        578: 	movl	PARAM_SIZE, %ecx
        579: 
        580: 	movl	PARAM_DST, %edi
        581: 	subl	$1, %ecx		C size-1 and clear carry
        582: 
        583: 	movl	PARAM_SRC, %ebx
        584: 	movl	%ecx, %edx
        585: 
        586: 	xorl	%eax, %eax		C ready for adcl
        587: 
        588: 
        589: 	ALIGN(16)
        590: L(lshift):
        591: C eax
        592: C ebx	src (for later use)
        593: C ecx	counter, decrementing
        594: C edx	size-1 (for later use)
        595: C esi
        596: C edi	dst, incrementing
        597: C ebp
        598: 
C Each iteration shifts two limbs left by one bit through the carry
C flag, so the size-1 iterations cover dst[1..2*size-2].
        599: 	rcll	4(%edi)
        600: 	rcll	8(%edi)
        601: 	leal	8(%edi), %edi
        602: 	loop	L(lshift)
        603: 
        604: 
        605: 	adcl	%eax, %eax
        606: 
        607: 	movl	%eax, 4(%edi)		C dst most significant limb
        608: 	movl	(%ebx), %eax		C src[0]
        609: 
        610: 	leal	4(%ebx,%edx,4), %ebx	C &src[size]
        611: 	subl	%edx, %ecx		C -(size-1)
        612: 
        613: 
        614: C -----------------------------------------------------------------------------
        615: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
        616: C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
        617: C low limb of src[0]^2.
        618: 
        619: 
        620: 	mull	%eax
        621: 
        622: 	movl	%eax, (%edi,%ecx,8)	C dst[0]
        623: 
        624: 
        625: 	ALIGN(16)
        626: L(diag):
        627: C eax	scratch
        628: C ebx	&src[size]
        629: C ecx	counter, negative
        630: C edx	carry
        631: C esi	scratch
        632: C edi	dst[2*size-2]
        633: C ebp
        634: 
        635: 	movl	(%ebx,%ecx,4), %eax
        636: 	movl	%edx, %esi
        637: 
        638: 	mull	%eax
        639: 
        640: 	addl	%esi, 4(%edi,%ecx,8)
        641: 	adcl	%eax, 8(%edi,%ecx,8)
        642: 	adcl	$0, %edx
        643: 
        644: 	incl	%ecx
        645: 	jnz	L(diag)
        646: 
        647: 
C Restore the callee-saved registers from their frame slots and return.
        648: 	movl	SAVE_EBX, %ebx
        649: 	movl	SAVE_ESI, %esi
        650: 
        651: 	addl	%edx, 4(%edi)		C dst most significant limb
        652: 
        653: 	movl	SAVE_EDI, %edi
        654: 	movl	SAVE_EBP, %ebp
        655: 	addl	$FRAME, %esp
        656: 	ret
        657: 
        658: 
        659: 
        660: C -----------------------------------------------------------------------------
        661: ifdef(`PIC',`
        662: L(pic_calc):
1.1.1.2 ! ohara     663: C See mpn/x86/README about old gas bugs
1.1       maekawa   664: 	addl	(%esp), %ecx
        665: 	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
        666: 	addl	%edx, %ecx
        667: 	ret
        668: ')
        669: 
        670: 
        671: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>