Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mul_basecase.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
! 26: C limbs/loop unrolling).
! 27:
! 28:
! 29:
1.1 maekawa 30: dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
31: dnl 8 4.67
32: dnl 16 4.59
33: dnl 32 4.42
34: dnl Maximum possible with the current code is 32.
35: dnl
36: dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
37: dnl done with a straight run through a block of code, no inner loop. Using
38: dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
39:
40: deflit(UNROLL_COUNT, 32)
41:
42:
43: C void mpn_mul_basecase (mp_ptr wp,
44: C mp_srcptr xp, mp_size_t xsize,
45: C mp_srcptr yp, mp_size_t ysize);
46: C
47: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
48: C wp,xsize+ysize.
49: C
50: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
51: C it's faster because it does most of the mpn_addmul_1() startup
52: C calculations only once. The saving is 15-25% on typical sizes coming from
53: C the Karatsuba multiply code.
54:
C NOTE(review): both the PIC and non-PIC branches use the same threshold (5);
C the ifdef is presumably kept for symmetry with other mpn/x86 files where
C the two differ -- confirm against those files before collapsing it.
55: ifdef(`PIC',`
56: deflit(UNROLL_THRESHOLD, 5)
57: ',`
58: deflit(UNROLL_THRESHOLD, 5)
59: ')
60:
C Byte offsets of the stack parameters relative to the entry %esp (tracked
C via the FRAME mechanism); 4 is the first argument, just above the return
C address.
61: defframe(PARAM_YSIZE,20)
62: defframe(PARAM_YP, 16)
63: defframe(PARAM_XSIZE,12)
64: defframe(PARAM_XP, 8)
65: defframe(PARAM_WP, 4)
66:
1.1.1.2 ! ohara 67: TEXT
1.1 maekawa 68: ALIGN(32)
69: PROLOGUE(mpn_mul_basecase)
70: deflit(`FRAME',0)
71:
72: movl PARAM_XSIZE, %ecx
73: movl PARAM_YP, %eax
74:
75: movl PARAM_XP, %edx
76: movl (%eax), %eax C yp low limb
77:
C Dispatch on xsize: 1 -> single 1x1 product, 2 -> 2x1 or 2x2 special
C cases, >2 -> inline mul_1 pass followed by addmul passes.
78: cmpl $2, %ecx
79: ja L(xsize_more_than_two)
80: je L(two_by_something)
81:
82:
83: C one limb by one limb
84:
85: mull (%edx)
86:
87: movl PARAM_WP, %ecx
88: movl %eax, (%ecx)
89: movl %edx, 4(%ecx)
90: ret
91:
92:
93: C -----------------------------------------------------------------------------
94: L(two_by_something):
95: deflit(`FRAME',0)
96: decl PARAM_YSIZE
97: pushl %ebx defframe_pushl(`SAVE_EBX')
98: movl %eax, %ecx C yp low limb
99:
100: movl PARAM_WP, %ebx
101: pushl %esi defframe_pushl(`SAVE_ESI')
102: movl %edx, %esi C xp
103:
104: movl (%edx), %eax C xp low limb
C ZF here is still from the `decl PARAM_YSIZE' above (pushl and movl don't
C change flags): ysize-1 != 0 means yp has two limbs -> 2x2 case.
105: jnz L(two_by_two)
106:
107:
108: C two limbs by one limb
109:
110: mull %ecx
111:
112: movl %eax, (%ebx)
113: movl 4(%esi), %eax
114: movl %edx, %esi C carry
115:
116: mull %ecx
117:
118: addl %eax, %esi
119:
120: movl %esi, 4(%ebx)
121: movl SAVE_ESI, %esi
122:
123: adcl $0, %edx
124:
125: movl %edx, 8(%ebx)
126: movl SAVE_EBX, %ebx
127: addl $FRAME, %esp
128:
129: ret
130:
131:
132:
133: C -----------------------------------------------------------------------------
134: C Could load yp earlier into another register.
135:
136: ALIGN(16)
137: L(two_by_two):
138: C eax xp low limb
139: C ebx wp
140: C ecx yp low limb
141: C edx
142: C esi xp
143: C edi
144: C ebp
145:
146: dnl FRAME carries on from previous
147:
148: mull %ecx C xp[0] * yp[0]
149:
150: push %edi defframe_pushl(`SAVE_EDI')
151: movl %edx, %edi C carry, for wp[1]
152:
153: movl %eax, (%ebx)
154: movl 4(%esi), %eax
155:
156: mull %ecx C xp[1] * yp[0]
157:
158: addl %eax, %edi
159: movl PARAM_YP, %ecx
160:
161: adcl $0, %edx
162: movl 4(%ecx), %ecx C yp[1]
163: movl %edi, 4(%ebx)
164:
165: movl 4(%esi), %eax C xp[1]
166: movl %edx, %edi C carry, for wp[2]
167:
168: mull %ecx C xp[1] * yp[1]
169:
170: addl %eax, %edi
171:
172: adcl $0, %edx
173: movl (%esi), %eax C xp[0]
174:
175: movl %edx, %esi C carry, for wp[3]
176:
177: mull %ecx C xp[0] * yp[1]
178:
179: addl %eax, 4(%ebx)
180: adcl %edx, %edi
181: movl %edi, 8(%ebx)
182:
183: adcl $0, %esi
184: movl SAVE_EDI, %edi
185: movl %esi, 12(%ebx)
186:
187: movl SAVE_ESI, %esi
188: movl SAVE_EBX, %ebx
189: addl $FRAME, %esp
190:
191: ret
192:
193:
194: C -----------------------------------------------------------------------------
195: ALIGN(16)
196: L(xsize_more_than_two):
197:
198: C The first limb of yp is processed with a simple mpn_mul_1 style loop
199: C inline. Unrolling this doesn't seem worthwhile since it's only run once
200: C (whereas the addmul below is run ysize-1 many times). A call to the
201: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
202: C popping, and doesn't seem likely to be worthwhile on the typical 13-26
203: C limb operations the Karatsuba code calls here with.
204:
205: C eax yp[0]
206: C ebx
207: C ecx xsize
208: C edx xp
209: C esi
210: C edi
211: C ebp
212:
213: dnl FRAME doesn't carry on from previous, no pushes yet here
214: defframe(`SAVE_EBX',-4)
215: defframe(`SAVE_ESI',-8)
216: defframe(`SAVE_EDI',-12)
217: defframe(`SAVE_EBP',-16)
218: deflit(`FRAME',0)
219:
220: subl $16, %esp
221: deflit(`FRAME',16)
222:
223: movl %edi, SAVE_EDI
224: movl PARAM_WP, %edi
225:
226: movl %ebx, SAVE_EBX
227: movl %ebp, SAVE_EBP
228: movl %eax, %ebp
229:
230: movl %esi, SAVE_ESI
231: xorl %ebx, %ebx
232: leal (%edx,%ecx,4), %esi C xp end
233:
234: leal (%edi,%ecx,4), %edi C wp end of mul1
235: negl %ecx
236:
237:
238: L(mul1):
239: C eax scratch
240: C ebx carry
241: C ecx counter, negative
242: C edx scratch
243: C esi xp end
244: C edi wp end of mul1
245: C ebp multiplier
246:
247: movl (%esi,%ecx,4), %eax
248:
249: mull %ebp
250:
251: addl %ebx, %eax
252: movl %eax, (%edi,%ecx,4)
253: movl $0, %ebx
254:
255: adcl %edx, %ebx
256: incl %ecx
257: jnz L(mul1)
258:
259:
260: movl PARAM_YSIZE, %edx
261: movl PARAM_XSIZE, %ecx
262:
263: movl %ebx, (%edi) C final carry
264: decl %edx
265:
266: jnz L(ysize_more_than_one)
267:
268:
269: movl SAVE_EDI, %edi
270: movl SAVE_EBX, %ebx
271:
272: movl SAVE_EBP, %ebp
273: movl SAVE_ESI, %esi
274: addl $FRAME, %esp
275:
276: ret
277:
278:
279: L(ysize_more_than_one):
280: cmpl $UNROLL_THRESHOLD, %ecx
281: movl PARAM_YP, %eax
282:
283: jae L(unroll)
284:
285:
286: C -----------------------------------------------------------------------------
287: C simple addmul looping
288: C
289: C eax yp
290: C ebx
291: C ecx xsize
292: C edx ysize-1
293: C esi xp end
294: C edi wp end of mul1
295: C ebp
296:
297: leal 4(%eax,%edx,4), %ebp C yp end
298: negl %ecx
299: negl %edx
300:
301: movl (%esi,%ecx,4), %eax C xp low limb
302: movl %edx, PARAM_YSIZE C -(ysize-1)
303: incl %ecx
304:
305: xorl %ebx, %ebx C initial carry
306: movl %ecx, PARAM_XSIZE C -(xsize-1)
307: movl %ebp, PARAM_YP
308:
309: movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
310: jmp L(simple_outer_entry)
311:
312:
313: C this is offset 0x121 so close enough to aligned
314: L(simple_outer_top):
315: C ebp ysize counter, negative
316:
317: movl PARAM_YP, %edx
318: movl PARAM_XSIZE, %ecx C -(xsize-1)
319: xorl %ebx, %ebx C carry
320:
321: movl %ebp, PARAM_YSIZE
322: addl $4, %edi C next position in wp
323:
324: movl (%edx,%ebp,4), %ebp C yp limb - multiplier
325: movl -4(%esi,%ecx,4), %eax C xp low limb
326:
327:
328: L(simple_outer_entry):
329:
330: L(simple_inner):
331: C eax xp limb
332: C ebx carry limb
333: C ecx loop counter (negative)
334: C edx scratch
335: C esi xp end
336: C edi wp end
337: C ebp multiplier
338:
339: mull %ebp
340:
341: addl %eax, %ebx
342: adcl $0, %edx
343:
344: addl %ebx, (%edi,%ecx,4)
345: movl (%esi,%ecx,4), %eax
346: adcl $0, %edx
347:
348: incl %ecx
349: movl %edx, %ebx
350: jnz L(simple_inner)
351:
352:
C Final (highest) limb of this addmul pass, done outside the loop so the
C carry can be stored as the new top limb at 4(%edi).
353: mull %ebp
354:
355: movl PARAM_YSIZE, %ebp
356: addl %eax, %ebx
357:
358: adcl $0, %edx
359: addl %ebx, (%edi)
360:
361: adcl $0, %edx
362: incl %ebp
363:
364: movl %edx, 4(%edi)
365: jnz L(simple_outer_top)
366:
367:
368: movl SAVE_EBX, %ebx
369: movl SAVE_ESI, %esi
370:
371: movl SAVE_EDI, %edi
372: movl SAVE_EBP, %ebp
373: addl $FRAME, %esp
374:
375: ret
376:
377:
378:
379: C -----------------------------------------------------------------------------
380: C
381: C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
382: C comments.
383: C
384: C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
385: C increment xp and wp. This is used to adjust back xp and wp, and rshifted
386: C to give an initial VAR_COUNTER at the top of the outer loop.
387: C
388: C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
389: C up to -1, inclusive.
390: C
391: C VAR_JMP is the computed jump into the unrolled loop.
392: C
393: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
394: C start of the unrolled loop.
395: C
396: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
397: C inclusive.
398: C
399: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
400: C added to give the location of the next limb of yp, which is the multiplier
401: C in the unrolled loop.
402: C
403: C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
404: C outer loop to take care of xp, wp and the inner loop counter.
405:
406: defframe(VAR_COUNTER, -20)
407: defframe(VAR_ADJUST, -24)
408: defframe(VAR_JMP, -28)
409: defframe(VAR_XP_LOW, -32)
410: deflit(VAR_EXTRA_SPACE, 16)
411:
412:
413: L(unroll):
414: C eax yp
415: C ebx
416: C ecx xsize
417: C edx ysize-1
418: C esi xp end
419: C edi wp end of mul1
420: C ebp
421:
422: movl PARAM_XP, %esi
423: movl 4(%eax), %ebp C multiplier (yp second limb)
424: leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
425:
426: movl PARAM_WP, %edi
427: movl %eax, PARAM_YP
428: negl %edx
429:
430: movl %edx, PARAM_YSIZE
431: leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
432: decl %ecx C xsize-1
433:
434: movl (%esi), %eax C xp low limb
435: andl $-UNROLL_MASK-1, %ebx
436: negl %ecx
437:
438: subl $VAR_EXTRA_SPACE, %esp
439: deflit(`FRAME',16+VAR_EXTRA_SPACE)
440: negl %ebx
441: andl $UNROLL_MASK, %ecx
442:
443: movl %ebx, VAR_ADJUST
444: movl %ecx, %edx
445: shll $4, %ecx
446:
447: sarl $UNROLL_LOG2, %ebx
448:
449: C 17 code bytes per limb
C The entry address is L(unroll_entry) + 17*ecx, built here as 16*ecx
C (the shll above) plus 1*edx (the same count), since there's no single
C scale-by-17 addressing mode.
450: ifdef(`PIC',`
451: call L(pic_calc)
452: L(unroll_here):
453: ',`
454: leal L(unroll_entry) (%ecx,%edx,1), %ecx
455: ')
456: negl %edx
457:
458: movl %eax, VAR_XP_LOW
459: movl %ecx, VAR_JMP
460: leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
461: leal 4(%esi,%edx,4), %esi C and start at second limb
462: jmp L(unroll_outer_entry)
463:
464:
465: ifdef(`PIC',`
466: L(pic_calc):
1.1.1.2 ! ohara 467: C See mpn/x86/README about old gas bugs
1.1 maekawa 468: leal (%ecx,%edx,1), %ecx
469: addl $L(unroll_entry)-L(unroll_here), %ecx
470: addl (%esp), %ecx
471: ret
472: ')
473:
474:
475: C --------------------------------------------------------------------------
476: ALIGN(32)
477: L(unroll_outer_top):
478: C ebp ysize counter, negative
479:
480: movl VAR_ADJUST, %ebx
481: movl PARAM_YP, %edx
482:
483: movl VAR_XP_LOW, %eax
484: movl %ebp, PARAM_YSIZE C store incremented ysize counter
485:
486: leal 4(%edi,%ebx,4), %edi
487: leal (%esi,%ebx,4), %esi
488: sarl $UNROLL_LOG2, %ebx
489:
490: movl (%edx,%ebp,4), %ebp C yp next multiplier
491: movl VAR_JMP, %ecx
492:
493: L(unroll_outer_entry):
494: mull %ebp
495:
496: testb $1, %cl C and clear carry bit
497: movl %ebx, VAR_COUNTER
498: movl $0, %ebx
499:
C Parity of the entry offset decides which of the two per-chunk carry
C registers (ecx=low, ebx=high) receives the first product.
500: movl $0, %ecx
501: cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
502: cmovnz( %eax, %ebx)
503:
504: C Extra fetch of VAR_JMP is bad, but registers are tight
505: jmp *VAR_JMP
506:
507:
508: C -----------------------------------------------------------------------------
509: ALIGN(32)
510: L(unroll_top):
511: C eax xp limb
512: C ebx carry high
513: C ecx carry low
514: C edx scratch
515: C esi xp+8
516: C edi wp
517: C ebp yp multiplier limb
518: C
519: C VAR_COUNTER loop counter, negative
520: C
521: C 17 bytes each limb
522:
523: L(unroll_entry):
524:
C Zdisp forces an explicit zero displacement byte when disp0 is 0, keeping
C every limb's code exactly the same size so the computed jump above lands
C on an instruction boundary.
525: deflit(CHUNK_COUNT,2)
526: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
527: deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
528: deflit(`disp1', eval(disp0 + 4))
529:
530: Zdisp( movl, disp0,(%esi), %eax)
531: adcl %edx, %ebx
532:
533: mull %ebp
534:
535: Zdisp( addl, %ecx, disp0,(%edi))
536: movl $0, %ecx
537:
538: adcl %eax, %ebx
539:
540:
541: movl disp1(%esi), %eax
542: adcl %edx, %ecx
543:
544: mull %ebp
545:
546: addl %ebx, disp1(%edi)
547: movl $0, %ebx
548:
549: adcl %eax, %ecx
550: ')
551:
552:
553: incl VAR_COUNTER
554: leal UNROLL_BYTES(%esi), %esi
555: leal UNROLL_BYTES(%edi), %edi
556:
557: jnz L(unroll_top)
558:
559:
560: C eax
561: C ebx zero
562: C ecx low
563: C edx high
564: C esi
565: C edi wp, pointing at second last limb
566: C ebp
567: C
568: C carry flag to be added to high
569:
570: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
571: deflit(`disp1', eval(disp0-0 + 4))
572:
573: movl PARAM_YSIZE, %ebp
574: adcl $0, %edx
575: addl %ecx, disp0(%edi)
576:
577: adcl $0, %edx
578: incl %ebp
579:
580: movl %edx, disp1(%edi)
581: jnz L(unroll_outer_top)
582:
583:
584: movl SAVE_ESI, %esi
585: movl SAVE_EBP, %ebp
586:
587: movl SAVE_EDI, %edi
588: movl SAVE_EBX, %ebx
589: addl $FRAME, %esp
590:
591: ret
592:
593: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>