Annotation of OpenXM_contrib/gmp/mpn/x86/k7/sqr_basecase.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
! 26: C (measured on the speed difference between 25 and 50 limbs, which is
! 27: C roughly the Karatsuba recursing range).
! 28:
! 29:
1.1 maekawa 30: dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
31: dnl some comments.
32:
1.1.1.2 ! ohara    33: deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66)
1.1 maekawa    34:
1.1.1.2 ! ohara    35: ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
        !    36: `define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
1.1 maekawa    37:
1.1.1.2 ! ohara    38: m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
        !    39: deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
1.1 maekawa    40:
              41:
              42: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
              43: C
1.1.1.2 ! ohara    44: C With a SQR_KARATSUBA_THRESHOLD around 50 this code is about 1500 bytes,
1.1 maekawa    45: C which is quite a bit, but is considered good value since squares big
              46: C enough to use most of the code will be spending quite a few cycles in it.
              47:
              48:
              C Overall scheme for size>=4: form the off-diagonal products
              C src[i]*src[j] (i<j) at dst[1..2*size-2], double the whole of
              C that with a left shift, then add in the diagonal squares
              C src[i]^2.  Sizes 1, 2 and 3 are handled by straight-line code.
              49: defframe(PARAM_SIZE,12)
              50: defframe(PARAM_SRC, 8)
              51: defframe(PARAM_DST, 4)
              52:
1.1.1.2 ! ohara    53: 	TEXT
1.1 maekawa    54: 	ALIGN(32)
              55: PROLOGUE(mpn_sqr_basecase)
              56: deflit(`FRAME',0)
              57:
              58: 	movl	PARAM_SIZE, %ecx
              59: 	movl	PARAM_SRC, %eax
              60: 	cmpl	$2, %ecx
              61:
              62: 	movl	PARAM_DST, %edx
              63: 	je	L(two_limbs)
              64: 	ja	L(three_or_more)
              65:
              66:
              67: C------------------------------------------------------------------------------
              68: C one limb only
              69: C eax	src
              70: C ecx	size
              71: C edx	dst
              72:
              73: 	movl	(%eax), %eax
              74: 	movl	%edx, %ecx
              75:
              76: 	mull	%eax
              77:
              78: 	movl	%edx, 4(%ecx)
              79: 	movl	%eax, (%ecx)
              80: 	ret
              81:
              82:
              83: C------------------------------------------------------------------------------
              84: C
              85: C Using the read/modify/write "add"s seems to be faster than saving and
              86: C restoring registers.  Perhaps the loads for the first set hide under the
              87: C mul latency and the second gets store to load forwarding.
              88:
              89: 	ALIGN(16)
              90: L(two_limbs):
              91: 	C eax	src
              92: 	C ebx
              93: 	C ecx	size
              94: 	C edx	dst
              95: deflit(`FRAME',0)
              96:
              97: 	pushl	%ebx		FRAME_pushl()
              98: 	movl	%eax, %ebx	C src
              99: 	movl	(%eax), %eax
             100:
             101: 	movl	%edx, %ecx	C dst
             102:
             103: 	mull	%eax		C src[0]^2
             104:
             105: 	movl	%eax, (%ecx)	C dst[0]
             106: 	movl	4(%ebx), %eax
             107:
             108: 	movl	%edx, 4(%ecx)	C dst[1]
             109:
             110: 	mull	%eax		C src[1]^2
             111:
             112: 	movl	%eax, 8(%ecx)	C dst[2]
             113: 	movl	(%ebx), %eax
             114:
             115: 	movl	%edx, 12(%ecx)	C dst[3]
             116:
             117: 	mull	4(%ebx)		C src[0]*src[1]
             118:
             119: 	popl	%ebx
             120:
              C The cross product src[0]*src[1] occurs twice in the square, so
              C eax:edx is added into dst[1..3] twice (rather than doubling it
              C first).
             121: 	addl	%eax, 4(%ecx)
             122: 	adcl	%edx, 8(%ecx)
             123: 	adcl	$0, 12(%ecx)
             124: 	ASSERT(nc)
             125:
             126: 	addl	%eax, 4(%ecx)
             127: 	adcl	%edx, 8(%ecx)
             128: 	adcl	$0, 12(%ecx)
             129: 	ASSERT(nc)
             130:
             131: 	ret
             132:
             133:
             134: C------------------------------------------------------------------------------
             135: defframe(SAVE_EBX,   -4)
             136: defframe(SAVE_ESI,   -8)
             137: defframe(SAVE_EDI,  -12)
             138: defframe(SAVE_EBP,  -16)
             139: deflit(STACK_SPACE, 16)
             140:
             141: L(three_or_more):
             142: 	subl	$STACK_SPACE, %esp
             143: 	cmpl	$4, %ecx
             144: 	jae	L(four_or_more)
             145: deflit(`FRAME',STACK_SPACE)
             146:
             147:
             148: C------------------------------------------------------------------------------
             149: C Three limbs
             150: C
             151: C Writing out the loads and stores separately at the end of this code comes
             152: C out about 10 cycles faster than using adcls to memory.
             153:
             154: 	C eax	src
             155: 	C ecx	size
             156: 	C edx	dst
             157:
             158: 	movl	%ebx, SAVE_EBX
             159: 	movl	%eax, %ebx	C src
             160: 	movl	(%eax), %eax
             161:
             162: 	movl	%edx, %ecx	C dst
             163: 	movl	%esi, SAVE_ESI
             164: 	movl	%edi, SAVE_EDI
             165:
             166: 	mull	%eax		C src[0] ^ 2
             167:
             168: 	movl	%eax, (%ecx)
             169: 	movl	4(%ebx), %eax
             170: 	movl	%edx, 4(%ecx)
             171:
             172: 	mull	%eax		C src[1] ^ 2
             173:
             174: 	movl	%eax, 8(%ecx)
             175: 	movl	8(%ebx), %eax
             176: 	movl	%edx, 12(%ecx)
             177:
             178: 	mull	%eax		C src[2] ^ 2
             179:
             180: 	movl	%eax, 16(%ecx)
             181: 	movl	(%ebx), %eax
             182: 	movl	%edx, 20(%ecx)
             183:
             184: 	mull	4(%ebx)		C src[0] * src[1]
             185:
             186: 	movl	%eax, %esi
             187: 	movl	(%ebx), %eax
             188: 	movl	%edx, %edi
             189:
             190: 	mull	8(%ebx)		C src[0] * src[2]
             191:
             192: 	addl	%eax, %edi
             193: 	movl	%ebp, SAVE_EBP
             194: 	movl	$0, %ebp
             195:
             196: 	movl	4(%ebx), %eax
             197: 	adcl	%edx, %ebp
             198:
             199: 	mull	8(%ebx)		C src[1] * src[2]
             200:
             201: 	xorl	%ebx, %ebx
             202: 	addl	%eax, %ebp
             203:
             204: 	adcl	$0, %edx
             205:
             206: 	C eax
             207: 	C ebx	zero, will be dst[5]
             208: 	C ecx	dst
             209: 	C edx	dst[4]
             210: 	C esi	dst[1]
             211: 	C edi	dst[2]
             212: 	C ebp	dst[3]
             213:
              C Double the cross products esi:edi:ebp:edx with an adcl chain,
              C catching the bit shifted out of the top in ebx (dst[5]), then
              C add in the diagonal squares already stored at dst[1..5].
              C NOTE(review): the adcl $0,%edx just below appears to repeat the
              C one above with CF already clear, i.e. a no-op -- presumably
              C carried over from the k6 version; confirm against that code.
             214: 	adcl	$0, %edx
             215: 	addl	%esi, %esi
             216:
             217: 	adcl	%edi, %edi
             218: 	movl	4(%ecx), %eax
             219:
             220: 	adcl	%ebp, %ebp
             221:
             222: 	adcl	%edx, %edx
             223:
             224: 	adcl	$0, %ebx
             225: 	addl	%eax, %esi
             226: 	movl	8(%ecx), %eax
             227:
             228: 	adcl	%eax, %edi
             229: 	movl	12(%ecx), %eax
             230: 	movl	%esi, 4(%ecx)
             231:
             232: 	adcl	%eax, %ebp
             233: 	movl	16(%ecx), %eax
             234: 	movl	%edi, 8(%ecx)
             235:
             236: 	movl	SAVE_ESI, %esi
             237: 	movl	SAVE_EDI, %edi
             238:
             239: 	adcl	%eax, %edx
             240: 	movl	20(%ecx), %eax
             241: 	movl	%ebp, 12(%ecx)
             242:
             243: 	adcl	%ebx, %eax
             244: 	ASSERT(nc)
             245: 	movl	SAVE_EBX, %ebx
             246: 	movl	SAVE_EBP, %ebp
             247:
             248: 	movl	%edx, 16(%ecx)
             249: 	movl	%eax, 20(%ecx)
             250: 	addl	$FRAME, %esp
             251:
             252: 	ret
             253:
             254:
             255: C------------------------------------------------------------------------------
             256: L(four_or_more):
             257:
             258: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
             259: C Further products are added in rather than stored.
             260:
             261: 	C eax	src
             262: 	C ebx
             263: 	C ecx	size
             264: 	C edx	dst
             265: 	C esi
             266: 	C edi
             267: 	C ebp
             268:
             269: defframe(`VAR_COUNTER',-20)
             270: defframe(`VAR_JMP',    -24)
             271: deflit(EXTRA_STACK_SPACE, 8)
             272:
             273: 	movl	%ebx, SAVE_EBX
             274: 	movl	%edi, SAVE_EDI
             275: 	leal	(%edx,%ecx,4), %edi	C &dst[size]
             276:
             277: 	movl	%esi, SAVE_ESI
             278: 	movl	%ebp, SAVE_EBP
             279: 	leal	(%eax,%ecx,4), %esi	C &src[size]
             280:
             281: 	movl	(%eax), %ebp	C multiplier
             282: 	movl	$0, %ebx
             283: 	decl	%ecx
             284:
             285: 	negl	%ecx
             286: 	subl	$EXTRA_STACK_SPACE, %esp
             287: FRAME_subl_esp(EXTRA_STACK_SPACE)
             288:
             289: L(mul_1):
             290: 	C eax	scratch
             291: 	C ebx	carry
             292: 	C ecx	counter
             293: 	C edx	scratch
             294: 	C esi	&src[size]
             295: 	C edi	&dst[size]
             296: 	C ebp	multiplier
             297:
             298: 	movl	(%esi,%ecx,4), %eax
             299:
             300: 	mull	%ebp
             301:
             302: 	addl	%ebx, %eax
             303: 	movl	%eax, (%edi,%ecx,4)
             304: 	movl	$0, %ebx
             305:
             306: 	adcl	%edx, %ebx
             307: 	incl	%ecx
             308: 	jnz	L(mul_1)
             309:
             310:
             311: C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
             312: C
             313: C The last two products, which are the bottom right corner of the product
             314: C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
             315: C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
             316: C cases that need to be done.
             317: C
             318: C The unrolled code is the same as in mpn_addmul_1, see that routine for
             319: C some comments.
             320: C
             321: C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
             322: C
             323: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
             324: C chunk each outer loop.
             325: C
             326: C K7 does branch prediction on indirect jumps, which is bad since it's a
             327: C different target each time.  There seems no way to avoid this.
             328:
             329: dnl  This value also hard coded in some shifts and adds
             330: deflit(CODE_BYTES_PER_LIMB, 17)
             331:
             332: dnl  With the unmodified &src[size] and &dst[size] pointers, the
             333: dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
             334: dnl  values up to 31, but above that an offset must be added to them.
             335:
             336: deflit(OFFSET,
             337: ifelse(eval(UNROLL_COUNT>31),1,
             338: eval((UNROLL_COUNT-31)*4),
             339: 0))
             340:
             341: dnl  Because the last chunk of code is generated differently, a label placed
             342: dnl  at the end doesn't work.  Instead calculate the implied end using the
             343: dnl  start and how many chunks of code there are.
             344:
             345: deflit(UNROLL_INNER_END,
             346: `L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
             347:
             348: 	C eax
             349: 	C ebx	carry
             350: 	C ecx
             351: 	C edx
             352: 	C esi	&src[size]
             353: 	C edi	&dst[size]
             354: 	C ebp
             355:
             356: 	movl	PARAM_SIZE, %ecx
             357: 	movl	%ebx, (%edi)
             358:
             359: 	subl	$4, %ecx
             360: 	jz	L(corner)
             361:
             362: 	negl	%ecx
             363: ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
             364: ifelse(OFFSET,0,,`subl	$OFFSET, %esi')
             365:
             366: 	movl	%ecx, %edx
             367: 	shll	$4, %ecx
             368:
             369: ifdef(`PIC',`
             370: 	call	L(pic_calc)
             371: L(here):
             372: ',`
             373: 	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
             374: ')
             375:
             376:
             377: 	C The calculated jump mustn't come out to before the start of the
             378: 	C code available.  This is the limit UNROLL_COUNT puts on the src
             379: 	C operand size, but checked here directly using the jump address.
             380: 	ASSERT(ae,
             381: 	`movl_text_address(L(unroll_inner_start), %eax)
             382: 	cmpl	%eax, %ecx')
             383:
             384:
             385: C------------------------------------------------------------------------------
             386: 	ALIGN(16)
             387: L(unroll_outer_top):
             388: 	C eax
             389: 	C ebx	high limb to store
             390: 	C ecx	VAR_JMP
             391: 	C edx	VAR_COUNTER, limbs, negative
             392: 	C esi	&src[size], constant
             393: 	C edi	dst ptr, high of last addmul
             394: 	C ebp
             395:
             396: 	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
             397: 	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand
             398:
             399: 	movl	%edx, VAR_COUNTER
             400:
             401: 	mull	%ebp
             402:
              C cmovX: the parity of UNROLL_COUNT decides which of ecx/ebx
              C starts as the low carry; the low bit of the computed jump
              C address (tested via %cl below) selects accordingly.
             403: define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
             404:
             405: 	testb	$1, %cl
             406: 	movl	%edx, %ebx	C high carry
             407: 	movl	%ecx, %edx	C jump
             408:
             409: 	movl	%eax, %ecx	C low carry
             410: 	cmovX(	%ebx, %ecx)	C high carry reverse
             411: 	cmovX(	%eax, %ebx)	C low carry reverse
             412:
             413: 	leal	CODE_BYTES_PER_LIMB(%edx), %eax
             414: 	xorl	%edx, %edx
             415: 	leal	4(%edi), %edi
             416:
             417: 	movl	%eax, VAR_JMP
             418:
             419: 	jmp	*%eax
             420:
             421:
             422: ifdef(`PIC',`
             423: L(pic_calc):
             424: 	addl	(%esp), %ecx
             425: 	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
             426: 	addl	%edx, %ecx
             427: 	ret
             428: ')
             429:
             430:
             431: 	C Must be an even address to preserve the significance of the low
             432: 	C bit of the jump address indicating which way around ecx/ebx should
             433: 	C start.
             434: 	ALIGN(2)
             435:
             436: L(unroll_inner_start):
             437: 	C eax	next limb
             438: 	C ebx	carry high
             439: 	C ecx	carry low
             440: 	C edx	scratch
             441: 	C esi	src
             442: 	C edi	dst
             443: 	C ebp	multiplier
             444:
             445: forloop(`i', UNROLL_COUNT, 1, `
             446: 	deflit(`disp_src', eval(-i*4 + OFFSET))
             447: 	deflit(`disp_dst', eval(disp_src - 4))
             448:
             449: 	m4_assert(`disp_src>=-128 && disp_src<128')
             450: 	m4_assert(`disp_dst>=-128 && disp_dst<128')
             451:
             452: ifelse(eval(i%2),0,`
             453: Zdisp(	movl,	disp_src,(%esi), %eax)
             454: 	adcl	%edx, %ebx
             455:
             456: 	mull	%ebp
             457:
             458: Zdisp(	addl,	%ecx, disp_dst,(%edi))
             459: 	movl	$0, %ecx
             460:
             461: 	adcl	%eax, %ebx
             462:
             463: ',`
             464: 	dnl  this bit comes out last
             465: Zdisp(	movl,	disp_src,(%esi), %eax)
             466: 	adcl	%edx, %ecx
             467:
             468: 	mull	%ebp
             469:
1.1.1.2 ! ohara   470: Zdisp(	addl,	%ebx, disp_dst,(%edi))
        !   471:
1.1 maekawa   472: ifelse(forloop_last,0,
             473: `	movl	$0, %ebx')
             474:
             475: 	adcl	%eax, %ecx
             476: ')
             477: ')
             478:
             479: 	C eax	next limb
             480: 	C ebx	carry high
             481: 	C ecx	carry low
             482: 	C edx	scratch
             483: 	C esi	src
             484: 	C edi	dst
             485: 	C ebp	multiplier
             486:
             487: 	adcl	$0, %edx
             488: 	addl	%ecx, -4+OFFSET(%edi)
             489: 	movl	VAR_JMP, %ecx
             490:
             491: 	adcl	$0, %edx
             492:
              C store the final high limb of this addmul pass at the top of
              C the region just updated
             493: 	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
             494: 	movl	VAR_COUNTER, %edx
             495:
             496: 	incl	%edx
             497: 	jnz	L(unroll_outer_top)
             498:
             499:
             500: ifelse(OFFSET,0,,`
             501: 	addl	$OFFSET, %esi
             502: 	addl	$OFFSET, %edi
             503: ')
             504:
             505:
             506: C------------------------------------------------------------------------------
             507: L(corner):
             508: 	C esi	&src[size]
             509: 	C edi	&dst[2*size-5]
             510:
             511: 	movl	-12(%esi), %ebp
             512: 	movl	-8(%esi), %eax
             513: 	movl	%eax, %ecx
             514:
              C ebp=src[size-3], eax=src[size-2]: src[size-3]*src[size-2]
             515: 	mull	%ebp
             516:
             517: 	addl	%eax, -4(%edi)
             518: 	movl	-4(%esi), %eax
             519:
             520: 	adcl	$0, %edx
             521: 	movl	%edx, %ebx
             522: 	movl	%eax, %esi
             523:
              C eax=src[size-1]: src[size-3]*src[size-1]
             524: 	mull	%ebp
             525:
             526: 	addl	%ebx, %eax
             527:
             528: 	adcl	$0, %edx
             529: 	addl	%eax, (%edi)
             530: 	movl	%esi, %eax
             531:
             532: 	adcl	$0, %edx
             533: 	movl	%edx, %ebx
             534:
              C ecx=src[size-2], eax=src[size-1]: src[size-2]*src[size-1]
             535: 	mull	%ecx
             536:
             537: 	addl	%ebx, %eax
             538: 	movl	%eax, 4(%edi)
             539:
             540: 	adcl	$0, %edx
             541: 	movl	%edx, 8(%edi)
             542:
             543:
             544:
             545: C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
             546:
             547: L(lshift_start):
             548: 	movl	PARAM_SIZE, %eax
             549: 	movl	PARAM_DST, %edi
             550: 	xorl	%ecx, %ecx		C clear carry
             551:
             552: 	leal	(%edi,%eax,8), %edi
             553: 	notl	%eax			C -size-1, preserve carry
             554:
             555: 	leal	2(%eax), %eax		C -(size-1)
             556:
             557: L(lshift):
             558: 	C eax	counter, negative
             559: 	C ebx
             560: 	C ecx
             561: 	C edx
             562: 	C esi
             563: 	C edi	dst, pointing just after last limb
             564: 	C ebp
             565:
              C rotate-through-carry two limbs per iteration, carrying the
              C shifted-out bit between limbs in CF
             566: 	rcll	-4(%edi,%eax,8)
             567: 	rcll	(%edi,%eax,8)
             568: 	incl	%eax
             569: 	jnz	L(lshift)
             570:
             571: 	setc	%al
             572:
             573: 	movl	PARAM_SRC, %esi
             574: 	movl	%eax, -4(%edi)		C dst most significant limb
             575:
             576: 	movl	PARAM_SIZE, %ecx
             577:
             578:
             579: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
             580: C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
             581: C low limb of src[0]^2.
             582:
             583: 	movl	(%esi), %eax		C src[0]
             584:
             585: 	mull	%eax
             586:
             587: 	leal	(%esi,%ecx,4), %esi	C src point just after last limb
             588: 	negl	%ecx
             589:
             590: 	movl	%eax, (%edi,%ecx,8)	C dst[0]
             591: 	incl	%ecx
             592:
             593: L(diag):
             594: 	C eax	scratch
             595: 	C ebx	scratch
             596: 	C ecx	counter, negative
             597: 	C edx	carry
             598: 	C esi	src just after last limb
             599: 	C edi	dst just after last limb
             600: 	C ebp
             601:
             602: 	movl	(%esi,%ecx,4), %eax
             603: 	movl	%edx, %ebx
             604:
             605: 	mull	%eax
             606:
              C ebx holds the high limb carried from the previous square
             607: 	addl	%ebx, -4(%edi,%ecx,8)
             608: 	adcl	%eax, (%edi,%ecx,8)
             609: 	adcl	$0, %edx
             610:
             611: 	incl	%ecx
             612: 	jnz	L(diag)
             613:
             614:
             615: 	movl	SAVE_ESI, %esi
             616: 	movl	SAVE_EBX, %ebx
             617:
             618: 	addl	%edx, -4(%edi)		C dst most significant limb
             619: 	movl	SAVE_EDI, %edi
             620:
             621: 	movl	SAVE_EBP, %ebp
             622: 	addl	$FRAME, %esp
             623:
             624: 	ret
             625:
             626: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>