Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_basecase.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
2: dnl
3: dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
4: dnl unrolling).
5:
6:
7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8: dnl
9: dnl This file is part of the GNU MP Library.
10: dnl
11: dnl The GNU MP Library is free software; you can redistribute it and/or
12: dnl modify it under the terms of the GNU Lesser General Public License as
13: dnl published by the Free Software Foundation; either version 2.1 of the
14: dnl License, or (at your option) any later version.
15: dnl
16: dnl The GNU MP Library is distributed in the hope that it will be useful,
17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19: dnl Lesser General Public License for more details.
20: dnl
21: dnl You should have received a copy of the GNU Lesser General Public
22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24: dnl Suite 330, Boston, MA 02111-1307, USA.
25:
26:
27: include(`../config.m4')
28:
29:
30: dnl K6: UNROLL_COUNT cycles/product (approx)
31: dnl 8 9.75
32: dnl 16 9.3
33: dnl 32 9.3
34: dnl Maximum possible with the current code is 32.
35: dnl
36: dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
37: dnl might explain it's good performance.
38:
39: deflit(UNROLL_COUNT, 16)
40:
41:
42: C void mpn_mul_basecase (mp_ptr wp,
43: C mp_srcptr xp, mp_size_t xsize,
44: C mp_srcptr yp, mp_size_t ysize);
45: C
46: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
47: C wp,xsize+ysize.
48: C
49: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
50: C it's faster because it does most of the mpn_addmul_1() entry code only
51: C once. The saving is about 10-20% on typical sizes coming from the
52: C Karatsuba multiply code.
53: C
54: C Future:
55: C
56: C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
57: C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be
58: C 256 bytes.
59:
60: ifdef(`PIC',`
61: deflit(UNROLL_THRESHOLD, 8)
62: ',`
63: deflit(UNROLL_THRESHOLD, 8)
64: ')
65:
66: defframe(PARAM_YSIZE,20)
67: defframe(PARAM_YP, 16)
68: defframe(PARAM_XSIZE,12)
69: defframe(PARAM_XP, 8)
70: defframe(PARAM_WP, 4)
71:
72: .text
73: ALIGN(32)
74: PROLOGUE(mpn_mul_basecase)
75: deflit(`FRAME',0)
76:
77: movl PARAM_XSIZE, %ecx
78: movl PARAM_YP, %eax
79:
80: movl PARAM_XP, %edx
81: movl (%eax), %eax C yp low limb
82:
83: cmpl $2, %ecx
84: ja L(xsize_more_than_two_limbs)
85: je L(two_by_something)
86:
87:
88: C one limb by one limb
89:
90: movl (%edx), %edx C xp low limb
91: movl PARAM_WP, %ecx
92:
93: mull %edx
94:
95: movl %eax, (%ecx)
96: movl %edx, 4(%ecx)
97: ret
98:
99:
100: C -----------------------------------------------------------------------------
101: L(two_by_something):
102: decl PARAM_YSIZE
103: pushl %ebx
104: deflit(`FRAME',4)
105:
106: movl PARAM_WP, %ebx
107: pushl %esi
108: deflit(`FRAME',8)
109:
110: movl %eax, %ecx C yp low limb
111: movl (%edx), %eax C xp low limb
112:
113: movl %edx, %esi C xp
114: jnz L(two_by_two)
115:
116:
117: C two limbs by one limb
118:
119: mull %ecx
120:
121: movl %eax, (%ebx)
122: movl 4(%esi), %eax
123:
124: movl %edx, %esi C carry
125:
126: mull %ecx
127:
128: addl %eax, %esi
129: movl %esi, 4(%ebx)
130:
131: adcl $0, %edx
132:
133: movl %edx, 8(%ebx)
134: popl %esi
135:
136: popl %ebx
137: ret
138:
139:
140:
141: C -----------------------------------------------------------------------------
142: ALIGN(16)
143: L(two_by_two):
144: C eax xp low limb
145: C ebx wp
146: C ecx yp low limb
147: C edx
148: C esi xp
149: C edi
150: C ebp
151: deflit(`FRAME',8)
152:
153: mull %ecx C xp[0] * yp[0]
154:
155: push %edi
156: deflit(`FRAME',12)
157: movl %eax, (%ebx)
158:
159: movl 4(%esi), %eax
160: movl %edx, %edi C carry, for wp[1]
161:
162: mull %ecx C xp[1] * yp[0]
163:
164: addl %eax, %edi
165: movl PARAM_YP, %ecx
166:
167: adcl $0, %edx
168:
169: movl %edi, 4(%ebx)
170: movl 4(%ecx), %ecx C yp[1]
171:
172: movl 4(%esi), %eax C xp[1]
173: movl %edx, %edi C carry, for wp[2]
174:
175: mull %ecx C xp[1] * yp[1]
176:
177: addl %eax, %edi
178:
179: adcl $0, %edx
180:
181: movl (%esi), %eax C xp[0]
182: movl %edx, %esi C carry, for wp[3]
183:
184: mull %ecx C xp[0] * yp[1]
185:
186: addl %eax, 4(%ebx)
187: adcl %edx, %edi
188: adcl $0, %esi
189:
190: movl %edi, 8(%ebx)
191: popl %edi
192:
193: movl %esi, 12(%ebx)
194: popl %esi
195:
196: popl %ebx
197: ret
198:
199:
200: C -----------------------------------------------------------------------------
201: ALIGN(16)
202: L(xsize_more_than_two_limbs):
203:
204: C The first limb of yp is processed with a simple mpn_mul_1 style loop
205: C inline. Unrolling this doesn't seem worthwhile since it's only run once
206: C (whereas the addmul below is run ysize-1 many times). A call to the
207: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
208: C popping, and doesn't seem likely to be worthwhile on the typical 10-20
209: C limb operations the Karatsuba code calls here with.
210:
211: C eax yp[0]
212: C ebx
213: C ecx xsize
214: C edx xp
215: C esi
216: C edi
217: C ebp
218: deflit(`FRAME',0)
219:
220: pushl %edi defframe_pushl(SAVE_EDI)
221: pushl %ebp defframe_pushl(SAVE_EBP)
222:
223: movl PARAM_WP, %edi
224: pushl %esi defframe_pushl(SAVE_ESI)
225:
226: movl %eax, %ebp
227: pushl %ebx defframe_pushl(SAVE_EBX)
228:
229: leal (%edx,%ecx,4), %ebx C xp end
230: xorl %esi, %esi
231:
232: leal (%edi,%ecx,4), %edi C wp end of mul1
233: negl %ecx
234:
235:
236: L(mul1):
237: C eax scratch
238: C ebx xp end
239: C ecx counter, negative
240: C edx scratch
241: C esi carry
242: C edi wp end of mul1
243: C ebp multiplier
244:
245: movl (%ebx,%ecx,4), %eax
246:
247: mull %ebp
248:
249: addl %esi, %eax
250: movl $0, %esi
251:
252: adcl %edx, %esi
253:
254: movl %eax, (%edi,%ecx,4)
255: incl %ecx
256:
257: jnz L(mul1)
258:
259:
260: movl PARAM_YSIZE, %edx
261: movl %esi, (%edi) C final carry
262:
263: movl PARAM_XSIZE, %ecx
264: decl %edx
265:
266: jnz L(ysize_more_than_one_limb)
267:
268: popl %ebx
269: popl %esi
270: popl %ebp
271: popl %edi
272: ret
273:
274:
275: L(ysize_more_than_one_limb):
276: cmpl $UNROLL_THRESHOLD, %ecx
277: movl PARAM_YP, %eax
278:
279: jae L(unroll)
280:
281:
282: C -----------------------------------------------------------------------------
283: C Simple addmul loop.
284: C
285: C Using ebx and edi pointing at the ends of their respective locations saves
286: C a couple of instructions in the outer loop. The inner loop is still 11
287: C cycles, the same as the simple loop in aorsmul_1.asm.
288:
289: C eax yp
290: C ebx xp end
291: C ecx xsize
292: C edx ysize-1
293: C esi
294: C edi wp end of mul1
295: C ebp
296:
297: movl 4(%eax), %ebp C multiplier
298: negl %ecx
299:
300: movl %ecx, PARAM_XSIZE C -xsize
301: xorl %esi, %esi C initial carry
302:
303: leal 4(%eax,%edx,4), %eax C yp end
304: negl %edx
305:
306: movl %eax, PARAM_YP
307: movl %edx, PARAM_YSIZE
308:
309: jmp L(simple_outer_entry)
310:
311:
312: C aligning here saves a couple of cycles
313: ALIGN(16)
314: L(simple_outer_top):
315: C edx ysize counter, negative
316:
317: movl PARAM_YP, %eax C yp end
318: xorl %esi, %esi C carry
319:
320: movl PARAM_XSIZE, %ecx C -xsize
321: movl %edx, PARAM_YSIZE
322:
323: movl (%eax,%edx,4), %ebp C yp limb multiplier
324: L(simple_outer_entry):
325: addl $4, %edi
326:
327:
328: L(simple_inner):
329: C eax scratch
330: C ebx xp end
331: C ecx counter, negative
332: C edx scratch
333: C esi carry
334: C edi wp end of this addmul
335: C ebp multiplier
336:
337: movl (%ebx,%ecx,4), %eax
338:
339: mull %ebp
340:
341: addl %esi, %eax
342: movl $0, %esi
343:
344: adcl $0, %edx
345: addl %eax, (%edi,%ecx,4)
346: adcl %edx, %esi
347:
348: incl %ecx
349: jnz L(simple_inner)
350:
351:
352: movl PARAM_YSIZE, %edx
353: movl %esi, (%edi)
354:
355: incl %edx
356: jnz L(simple_outer_top)
357:
358:
359: popl %ebx
360: popl %esi
361: popl %ebp
362: popl %edi
363: ret
364:
365:
366: C -----------------------------------------------------------------------------
367: C Unrolled loop.
368: C
369: C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
370: C some comments.
371: C
372: C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
373: C 0, inclusive.
374: C
375: C VAR_JMP is the computed jump into the unrolled loop.
376: C
377: C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
378: C is entered.
379: C
380: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
381: C start of the unrolled loop. This can't just be fetched through the xp
382: C pointer because of the offset applied to it.
383: C
384: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
385: C inclusive.
386: C
387: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
388: C added to give the location of the next limb of yp, which is the multiplier
389: C in the unrolled loop.
390: C
391: C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
392: C to give the starting point in the destination for each unrolled loop (this
393: C point is one limb upwards for each limb of yp processed).
394: C
395: C Having PARAM_YSIZE count negative to zero means it's not necessary to
396: C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
397: C the stack remain constant and on each loop an leal adjusts them with the
398: C PARAM_YSIZE counter value.
399:
400:
401: defframe(VAR_COUNTER, -20)
402: defframe(VAR_COUNTER_INIT, -24)
403: defframe(VAR_JMP, -28)
404: defframe(VAR_XP_LOW, -32)
405: deflit(VAR_STACK_SPACE, 16)
406:
407: dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
408: dnl slower in this code, hence the defframe empty-if-zero feature is
409: dnl disabled.
410: dnl
411: dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
412: dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
413: dnl anything isn't clear.
414: dnl
415: define(`defframe_empty_if_zero_disabled',1)
416:
417: L(unroll):
418: C eax yp (not used)
419: C ebx xp end (not used)
420: C ecx xsize
421: C edx ysize-1
422: C esi
423: C edi wp end of mul1 (not used)
424: C ebp
425: deflit(`FRAME', 16)
426:
427: leal -2(%ecx), %ebp C one limb processed at start,
428: decl %ecx C and ebp is one less
429:
430: shrl $UNROLL_LOG2, %ebp
431: negl %ecx
432:
433: subl $VAR_STACK_SPACE, %esp
434: deflit(`FRAME', 16+VAR_STACK_SPACE)
435: andl $UNROLL_MASK, %ecx
436:
437: movl %ecx, %esi
438: shll $4, %ecx
439:
440: movl %ebp, VAR_COUNTER_INIT
441: negl %esi
442:
443: C 15 code bytes per limb
444: ifdef(`PIC',`
445: call L(pic_calc)
446: L(unroll_here):
447: ',`
448: leal L(unroll_entry) (%ecx,%esi,1), %ecx
449: ')
450:
451: movl PARAM_XP, %ebx
452: movl %ebp, VAR_COUNTER
453:
454: movl PARAM_WP, %edi
455: movl %ecx, VAR_JMP
456:
457: movl (%ebx), %eax
458: leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
459:
460: leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
461:
462: movl %eax, VAR_XP_LOW
463:
464: movl %ebx, PARAM_XP
465: movl PARAM_YP, %ebx
466:
467: leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
468: movl 4(%ebx), %ebp C multiplier (yp second limb)
469:
470: leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
471:
472: movl %ecx, PARAM_WP
473:
474: leal 1(%esi), %ecx C adjust parity for decl %ecx above
475:
476: movl %ebx, PARAM_YP
477: negl %edx
478:
479: movl %edx, PARAM_YSIZE
480: jmp L(unroll_outer_entry)
481:
482:
483: ifdef(`PIC',`
484: L(pic_calc):
485: C See README.family about old gas bugs
486: leal (%ecx,%esi,1), %ecx
487: addl $L(unroll_entry)-L(unroll_here), %ecx
488: addl (%esp), %ecx
489: ret
490: ')
491:
492:
493: C -----------------------------------------------------------------------------
494: C Aligning here saves a couple of cycles per loop. Using 32 doesn't
495: C cost any extra space, since the inner unrolled loop below is
496: C aligned to 32.
497: ALIGN(32)
498: L(unroll_outer_top):
499: C edx ysize
500:
501: movl PARAM_YP, %eax
502: movl %edx, PARAM_YSIZE C incremented ysize counter
503:
504: movl PARAM_WP, %edi
505:
506: movl VAR_COUNTER_INIT, %ebx
507: movl (%eax,%edx,4), %ebp C next multiplier
508:
509: movl PARAM_XSIZE, %ecx
510: leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
511:
512: movl VAR_XP_LOW, %eax
513: movl %ebx, VAR_COUNTER
514:
515: L(unroll_outer_entry):
516: mull %ebp
517:
518: C using testb is a tiny bit faster than testl
519: testb $1, %cl
520:
521: movl %eax, %ecx C low carry
522: movl VAR_JMP, %eax
523:
524: movl %edx, %esi C high carry
525: movl PARAM_XP, %ebx
526:
527: jnz L(unroll_noswap)
528: movl %ecx, %esi C high,low carry other way around
529:
530: movl %edx, %ecx
531: L(unroll_noswap):
532:
533: jmp *%eax
534:
535:
536:
537: C -----------------------------------------------------------------------------
538: ALIGN(32)
539: L(unroll_top):
540: C eax scratch
541: C ebx xp
542: C ecx carry low
543: C edx scratch
544: C esi carry high
545: C edi wp
546: C ebp multiplier
547: C VAR_COUNTER loop counter
548: C
549: C 15 code bytes each limb
550:
551: leal UNROLL_BYTES(%edi), %edi
552:
553: L(unroll_entry):
554: deflit(CHUNK_COUNT,2)
555: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
556: deflit(`disp0', eval(i*CHUNK_COUNT*4))
557: deflit(`disp1', eval(disp0 + 4))
558: deflit(`disp2', eval(disp1 + 4))
559:
560: movl disp1(%ebx), %eax
561: mull %ebp
562: Zdisp( addl, %ecx, disp0,(%edi))
563: adcl %eax, %esi
564: movl %edx, %ecx
565: jadcl0( %ecx)
566:
567: movl disp2(%ebx), %eax
568: mull %ebp
569: addl %esi, disp1(%edi)
570: adcl %eax, %ecx
571: movl %edx, %esi
572: jadcl0( %esi)
573: ')
574:
575: decl VAR_COUNTER
576: leal UNROLL_BYTES(%ebx), %ebx
577:
578: jns L(unroll_top)
579:
580:
581: movl PARAM_YSIZE, %edx
582: addl %ecx, UNROLL_BYTES(%edi)
583:
584: adcl $0, %esi
585:
586: incl %edx
587: movl %esi, UNROLL_BYTES+4(%edi)
588:
589: jnz L(unroll_outer_top)
590:
591:
592: movl SAVE_ESI, %esi
593: movl SAVE_EBP, %ebp
594: movl SAVE_EDI, %edi
595: movl SAVE_EBX, %ebx
596:
597: addl $FRAME, %esp
598: ret
599:
600: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>