Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aorsmul_1.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2: dnl
3: dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
4: dnl PIC adds about 6 cycles at the start.
5:
6:
7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8: dnl
9: dnl This file is part of the GNU MP Library.
10: dnl
11: dnl The GNU MP Library is free software; you can redistribute it and/or
12: dnl modify it under the terms of the GNU Lesser General Public License as
13: dnl published by the Free Software Foundation; either version 2.1 of the
14: dnl License, or (at your option) any later version.
15: dnl
16: dnl The GNU MP Library is distributed in the hope that it will be useful,
17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19: dnl Lesser General Public License for more details.
20: dnl
21: dnl You should have received a copy of the GNU Lesser General Public
22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24: dnl Suite 330, Boston, MA 02111-1307, USA.
25:
26:
27: include(`../config.m4')
28:
29:
30: dnl K6: large multpliers small multpliers
31: dnl UNROLL_COUNT cycles/limb cycles/limb
32: dnl 4 9.5 7.78
33: dnl 8 9.0 7.78
34: dnl 16 8.4 7.65
35: dnl 32 8.4 8.2
36: dnl
37: dnl Maximum possible unrolling with the current code is 32.
38: dnl
39: dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
40: dnl byte block, which might explain the good speed at that unrolling.
41:
42: deflit(UNROLL_COUNT, 16)
43:
44:
45: ifdef(`OPERATION_addmul_1', `
46: define(M4_inst, addl)
47: define(M4_function_1, mpn_addmul_1)
48: define(M4_function_1c, mpn_addmul_1c)
49: define(M4_description, add it to)
50: define(M4_desc_retval, carry)
51: ',`ifdef(`OPERATION_submul_1', `
52: define(M4_inst, subl)
53: define(M4_function_1, mpn_submul_1)
54: define(M4_function_1c, mpn_submul_1c)
55: define(M4_description, subtract it from)
56: define(M4_desc_retval, borrow)
57: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
58: ')')')
59:
60: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
61:
62:
63: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
64: C mp_limb_t mult);
65: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
66: C mp_limb_t mult, mp_limb_t carry);
67: C
68: C Calculate src,size multiplied by mult and M4_description dst,size.
69: C Return the M4_desc_retval limb from the top of the result.
70: C
71: C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
72: C multipliers (most significant few bits clear) result in few carry bits and
73: C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
74: C significant few bits set) make the carry bits 50/50 and lead to something
75: C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
76: C
77: C It's important that the gains for jadcl0 on small multipliers don't come
78: C at the cost of slowing down other data. Tests on uniformly distributed
79: C random data, designed to confound branch prediction, show about a 7%
80: C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
81: C overheads included).
82: C
83: C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
84: C 11.0 cycles/limb), and hence isn't used.
85: C
86: C In the simple loop, note that running ecx from negative to zero and using
87: C it as an index in the two movs wouldn't help. It would save one
88: C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
89: C that would be collapsed by this.
90: C
91: C
92: C jadcl0
93: C ------
94: C
95: C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
96: C firstly the instruction decoding and secondly the fact that there's a
97: C carry bit for the jadcl0 only on average about 1/4 of the time.
98: C
99: C The code in the unrolled loop decodes something like the following.
100: C
101: C decode cycles
102: C mull %ebp 2
103: C M4_inst %esi, disp(%edi) 1
104: C adcl %eax, %ecx 2
105: C movl %edx, %esi \ 1
106: C jnc 1f /
107: C incl %esi \ 1
108: C 1: movl disp(%ebx), %eax /
109: C ---
110: C 7
111: C
112: C In a back-to-back style test this measures 7 with the jnc not taken, or 8
113: C with it taken (both when correctly predicted). This is opposite to the
114: C measurements showing small multipliers running faster than large ones.
115: C Watch this space for more info ...
116: C
117: C It's not clear how much branch misprediction might be costing. The K6
118: C doco says it will be 1 to 4 cycles, but presumably it's near the low end
119: C of that range to get the measured results.
120: C
121: C
122: C In the code the two carries are more or less the preceding mul product and
123: C the calculation is roughly
124: C
125: C x*y + u*b+v
126: C
127: C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
128: C v are the two limbs it's added to (being the low of the next mul, and a
129: C limb from the destination).
130: C
131: C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
132: C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
133: C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
134: C and b-1, then the total probability can be summed over x and y,
135: C
136: C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
137: C --- * sum sum --- = --- * ------- * ------- = 1/4
138: C b^2 x=0 y=1 b^2 b^4 2 2
139: C
140: C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
141: C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
142:
143:
144: ifdef(`PIC',`
145: deflit(UNROLL_THRESHOLD, 9)
146: ',`
147: deflit(UNROLL_THRESHOLD, 6)
148: ')
149:
150: defframe(PARAM_CARRY, 20)
151: defframe(PARAM_MULTIPLIER,16)
152: defframe(PARAM_SIZE, 12)
153: defframe(PARAM_SRC, 8)
154: defframe(PARAM_DST, 4)
155:
156: .text
157: ALIGN(32)
158:
159: PROLOGUE(M4_function_1c)
160: pushl %esi
161: deflit(`FRAME',4)
162: movl PARAM_CARRY, %esi
163: jmp LF(M4_function_1,start_nc)
164: EPILOGUE()
165:
166: PROLOGUE(M4_function_1)
167: push %esi
168: deflit(`FRAME',4)
169: xorl %esi, %esi C initial carry
170:
171: L(start_nc):
172: movl PARAM_SIZE, %ecx
173: pushl %ebx
174: deflit(`FRAME',8)
175:
176: movl PARAM_SRC, %ebx
177: pushl %edi
178: deflit(`FRAME',12)
179:
180: cmpl $UNROLL_THRESHOLD, %ecx
181: movl PARAM_DST, %edi
182:
183: pushl %ebp
184: deflit(`FRAME',16)
185: jae L(unroll)
186:
187:
188: C simple loop
189:
190: movl PARAM_MULTIPLIER, %ebp
191:
192: L(simple):
193: C eax scratch
194: C ebx src
195: C ecx counter
196: C edx scratch
197: C esi carry
198: C edi dst
199: C ebp multiplier
200:
201: movl (%ebx), %eax
202: addl $4, %ebx
203:
204: mull %ebp
205:
206: addl $4, %edi
207: addl %esi, %eax
208:
209: adcl $0, %edx
210:
211: M4_inst %eax, -4(%edi)
212:
213: adcl $0, %edx
214:
215: movl %edx, %esi
216: loop L(simple)
217:
218:
219: popl %ebp
220: popl %edi
221:
222: popl %ebx
223: movl %esi, %eax
224:
225: popl %esi
226: ret
227:
228:
229:
230: C -----------------------------------------------------------------------------
231: C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
232: C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
233: C For the computed jump an odd size means they start one way around, an even
234: C size the other.
235: C
236: C VAR_JUMP holds the computed jump temporarily because there's not enough
237: C registers at the point of doing the mul for the initial two carry limbs.
238: C
239: C The add/adc for the initial carry in %esi is necessary only for the
240: C mpn_addmul/submul_1c entry points. Duplicating the startup code to
241: C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good
242: C idea.
243:
244: dnl overlapping with parameters already fetched
245: define(VAR_COUNTER, `PARAM_SIZE')
246: define(VAR_JUMP, `PARAM_DST')
247:
248: L(unroll):
249: C eax
250: C ebx src
251: C ecx size
252: C edx
253: C esi initial carry
254: C edi dst
255: C ebp
256:
257: movl %ecx, %edx
258: decl %ecx
259:
260: subl $2, %edx
261: negl %ecx
262:
263: shrl $UNROLL_LOG2, %edx
264: andl $UNROLL_MASK, %ecx
265:
266: movl %edx, VAR_COUNTER
267: movl %ecx, %edx
268:
269: shll $4, %edx
270: negl %ecx
271:
272: C 15 code bytes per limb
273: ifdef(`PIC',`
274: call L(pic_calc)
275: L(here):
276: ',`
277: leal L(entry) (%edx,%ecx,1), %edx
278: ')
279: movl (%ebx), %eax C src low limb
280:
281: movl PARAM_MULTIPLIER, %ebp
282: movl %edx, VAR_JUMP
283:
284: mull %ebp
285:
286: addl %esi, %eax C initial carry (from _1c)
287: jadcl0( %edx)
288:
289:
290: leal 4(%ebx,%ecx,4), %ebx
291: movl %edx, %esi C high carry
292:
293: movl VAR_JUMP, %edx
294: leal (%edi,%ecx,4), %edi
295:
296: testl $1, %ecx
297: movl %eax, %ecx C low carry
298:
299: jz L(noswap)
300: movl %esi, %ecx C high,low carry other way around
301:
302: movl %eax, %esi
303: L(noswap):
304:
305: jmp *%edx
306:
307:
308: ifdef(`PIC',`
309: L(pic_calc):
310: C See README.family about old gas bugs
311: leal (%edx,%ecx,1), %edx
312: addl $L(entry)-L(here), %edx
313: addl (%esp), %edx
314: ret
315: ')
316:
317:
318: C -----------------------------------------------------------
319: ALIGN(32)
320: L(top):
321: deflit(`FRAME',16)
322: C eax scratch
323: C ebx src
324: C ecx carry lo
325: C edx scratch
326: C esi carry hi
327: C edi dst
328: C ebp multiplier
329: C
330: C 15 code bytes per limb
331:
332: leal UNROLL_BYTES(%edi), %edi
333:
334: L(entry):
335: forloop(`i', 0, UNROLL_COUNT/2-1, `
336: deflit(`disp0', eval(2*i*4))
337: deflit(`disp1', eval(disp0 + 4))
338:
339: Zdisp( movl, disp0,(%ebx), %eax)
340: mull %ebp
341: Zdisp( M4_inst,%ecx, disp0,(%edi))
342: adcl %eax, %esi
343: movl %edx, %ecx
344: jadcl0( %ecx)
345:
346: movl disp1(%ebx), %eax
347: mull %ebp
348: M4_inst %esi, disp1(%edi)
349: adcl %eax, %ecx
350: movl %edx, %esi
351: jadcl0( %esi)
352: ')
353:
354: decl VAR_COUNTER
355: leal UNROLL_BYTES(%ebx), %ebx
356:
357: jns L(top)
358:
359:
360: popl %ebp
361: M4_inst %ecx, UNROLL_BYTES(%edi)
362:
363: popl %edi
364: movl %esi, %eax
365:
366: popl %ebx
367: jadcl0( %eax)
368:
369: popl %esi
370: ret
371:
372: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>