Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/rshift.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl Intel P5 mpn_rshift -- mpn right shift.
2:
1.1.1.2 ! ohara 3: dnl Copyright 2000, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C P5: 1.75 cycles/limb.
! 26:
! 27:
1.1 maekawa 28: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C unsigned shift);
30: C
31: C Shift src,size right by shift many bits and store the result in dst,size.
32: C Zeros are shifted in at the left. Return the bits shifted out at the
33: C right.
34: C
35: C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
36: C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
37: C
38: C Full speed depends on source and destination being aligned. Unaligned mmx
39: C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
40: C setups and finish-ups are done to ensure alignment for the loop.
41: C
42: C MMX shifts work out a bit faster even for the simple loop.
43:
44: defframe(PARAM_SHIFT,16) C 4th argument in the prototype above: unsigned shift
45: defframe(PARAM_SIZE, 12) C 3rd argument: mp_size_t size
46: defframe(PARAM_SRC, 8) C 2nd argument: mp_srcptr src
47: defframe(PARAM_DST, 4) C 1st argument: mp_ptr dst
48: deflit(`FRAME',0) C NOTE(review): presumably the count of bytes pushed so far, used by the PARAM_ accessors -- confirm against config.m4
49:
50: dnl Minimum 5, because the unrolled loop can't handle less.
51: deflit(UNROLL_THRESHOLD, 5) C sizes at or above this go to L(unroll), smaller ones to the one-limb path or L(simple)
52:
1.1.1.2 ! ohara 53: TEXT
1.1 maekawa 54: ALIGN(8)
55:
56: PROLOGUE(mpn_rshift) C entry: load the arguments, dispatch on size, handle size 1 inline
57:
58: pushl %ebx C ebx and edi are overwritten below, so save them
59: pushl %edi
60: deflit(`FRAME',8) C two registers, 8 bytes, are now on the stack
61:
62: movl PARAM_SIZE, %eax C eax = size
63: movl PARAM_DST, %edx C edx = dst
64:
65: movl PARAM_SRC, %ebx C ebx = src
66: movl PARAM_SHIFT, %ecx C ecx = shift
67:
68: cmp $UNROLL_THRESHOLD, %eax
69: jae L(unroll) C unsigned compare, size >= UNROLL_THRESHOLD takes the unrolled code
70:
71: decl %eax C eax = size-1, zero flag set when size is 1
72: movl (%ebx), %edi C src low limb
73:
74: jnz L(simple) C tests the decl result, the movl in between leaves the flags alone
75:
76: shrdl( %cl, %edi, %eax) C eax was decremented to zero, so it ends up holding only the bits shifted out of edi: the return value
77:
78: shrl %cl, %edi C the single result limb, zeros shifted in at the top
79:
80: movl %edi, (%edx) C dst low limb
81: popl %edi C risk of data cache bank clash
82:
83: popl %ebx
84:
85: ret C this path executed no mmx instructions, hence no emms
86:
87:
88: C -----------------------------------------------------------------------------
89: ALIGN(8) C Simple loop, reached when size is below UNROLL_THRESHOLD and size-1 is nonzero
90: L(simple):
91: C eax size-1
92: C ebx src
93: C ecx shift
94: C edx dst
95: C esi
96: C edi src low limb loaded by the entry code, not used on this path
97: C ebp
98: deflit(`FRAME',8)
99:
100: movd (%ebx), %mm5 C src[0]
101: leal (%ebx,%eax,4), %ebx C &src[size-1]
102:
103: movd %ecx, %mm6 C rshift
104: leal -4(%edx,%eax,4), %edx C &dst[size-2]
105:
106: psllq $32, %mm5 C src[0] moved to the high half, a later right shift drops the shifted-out bits into the low half
107: negl %eax C -(size-1), counts up towards zero
108:
109:
110: C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
111: C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
112: C cycles and would be 8 in a simple loop. Using mmx helps the return value
113: C and last limb calculations too.
114:
115: L(simple_top):
116: C eax counter, limbs, negative
117: C ebx &src[size-1]
118: C ecx shift (not used in the loop, already copied to mm6)
119: C edx &dst[size-2]
120: C
121: C mm0 scratch
122: C mm5 return value
123: C mm6 shift
124:
125: movq (%ebx,%eax,4), %mm0 C two adjacent src limbs as one qword
126: incl %eax C sets the flags tested by the jnz below
127:
128: psrlq %mm6, %mm0 C low half is now the finished dst limb
129:
130: movd %mm0, (%edx,%eax,4) C store the low 32 bits only
131: jnz L(simple_top) C loop until the counter reaches zero, size-1 limbs in total
132:
133:
134: movd (%ebx), %mm0 C src[size-1], the top limb, loaded with a zero high half
135: psrlq %mm6, %mm5 C return value
136:
137: psrlq %mm6, %mm0 C zeros are shifted in at the left
138: popl %edi
139:
140: movd %mm5, %eax C return the bits shifted out of src[0]
141: popl %ebx
142:
143: movd %mm0, 4(%edx) C dst[size-1]
144:
145: emms C clear the mmx state before returning
146:
147: ret
148:
149:
150: C -----------------------------------------------------------------------------
151: ALIGN(8) C Setup for the unrolled loop: align src and dst, preload two src qwords, bias the pointers
152: L(unroll):
153: C eax size
154: C ebx src
155: C ecx shift
156: C edx dst
157: C esi
158: C edi
159: C ebp
160: deflit(`FRAME',8)
161:
162: movd (%ebx), %mm5 C src[0]
163: movl $4, %edi C mask for address bit 2, used by both alignment tests
164:
165: movd %ecx, %mm6 C rshift
166: testl %edi, %ebx C is src at 4 mod 8?
167:
168: psllq $32, %mm5 C src[0] to the high half, for the return value
169: jz L(start_src_aligned) C still testing the testl result, mmx instructions leave the flags alone
170:
171:
172: C src isn't aligned, process low limb separately (marked xxx) and
173: C step src and dst by one limb, making src aligned.
174: C
175: C source ebx
176: C --+-------+-------+-------+
177: C | xxx |
178: C --+-------+-------+-------+
179: C 4mod8 0mod8 4mod8
180: C
181: C dest edx
182: C --+-------+-------+
183: C | | xxx |
184: C --+-------+-------+
185:
186: movq (%ebx), %mm0 C unaligned load
187:
188: psrlq %mm6, %mm0 C low half is the finished low dst limb
189: addl $4, %ebx C step src by one limb
190:
191: decl %eax C one limb fewer to do
192:
193: movd %mm0, (%edx) C store the low limb (xxx)
194: addl $4, %edx C step dst by one limb
195: L(start_src_aligned):
196:
197:
198: movq (%ebx), %mm1 C first src qword from the aligned pointer
199: testl %edi, %edx C is dst at 4 mod 8?
200:
201: psrlq %mm6, %mm5 C retval, computed with the original shift before mm6 can be changed below
202: jz L(start_dst_aligned) C still testing the testl result
203:
204: C dst isn't aligned, add 4 to make it so, and pretend the shift is
205: C 32 bits extra. Low limb of dst (marked xxx) handled here
206: C separately.
207: C
208: C source ebx
209: C --+-------+-------+
210: C | mm1 |
211: C --+-------+-------+
212: C 4mod8 0mod8
213: C
214: C dest edx
215: C --+-------+-------+-------+
216: C | xxx |
217: C --+-------+-------+-------+
218: C 4mod8 0mod8 4mod8
219:
220: movq %mm1, %mm0 C work on a copy, mm1 is needed again below
221: addl $32, %ecx C new shift
222:
223: psrlq %mm6, %mm0 C mm6 still holds the original shift here, it is replaced on the next line
224:
225: movd %ecx, %mm6 C mm6 = shift+32 from now on
226:
227: movd %mm0, (%edx) C store the low limb (xxx)
228: addl $4, %edx C step dst by one limb
229: L(start_dst_aligned):
230:
231:
232: movq 8(%ebx), %mm3 C second src qword
233: negl %ecx
234:
235: movq %mm3, %mm2 C mm2 src qword
236: addl $64, %ecx C ecx = 64 minus the right shift count held in mm6
237:
238: movd %ecx, %mm7 C lshift
239: psrlq %mm6, %mm1 C low part of the first dst qword
240:
241: leal -12(%ebx,%eax,4), %ebx C bias src towards its end so the negative counter in eax can index it
242: leal -20(%edx,%eax,4), %edx C likewise for dst
243:
244: psllq %mm7, %mm3 C high part of the first dst qword
245: subl $7, %eax C size-7
246:
247: por %mm1, %mm3 C mm3 ready to store
248: negl %eax C -(size-7)
249:
250: jns L(finish) C tests the negl result: counter not negative, so skip the unrolled loop
251:
252:
253: C This loop is the important bit, the rest is just support. Careful
254: C instruction scheduling achieves the claimed 1.75 c/l. The
255: C relevant parts of the pairing rules are:
256: C
257: C - mmx loads and stores execute only in the U pipe
258: C - only one mmx shift in a pair
259: C - wait one cycle before storing an mmx register result
260: C - the usual address generation interlock
261: C
262: C Two qword calculations are slightly interleaved. The instructions
263: C marked "C" belong to the second qword, and the "C prev" one is for
264: C the second qword from the previous iteration.
265:
266: ALIGN(8)
267: L(unroll_loop):
268: C eax counter, limbs, negative; stepped by 4 per pass, loop repeats while negative
269: C ebx src bias: pointer + 4*count - 12 bytes, as set by the leal before the loop
270: C ecx same value as mm7 (lshift); not used in the loop, tested later in the finish code
271: C edx dst bias: pointer + 4*count - 20 bytes, as set by the leal before the loop
272: C esi
273: C edi
274: C
275: C mm0 scratch: new src qword, turned into the first dst qword of the pass
276: C mm1 scratch: copy of that src qword, shifted right for the second dst qword
277: C mm2 src qword from -8(%ebx,%eax,4)
278: C mm3 dst qword ready to store to -8(%edx,%eax,4)
279: C
280: C mm5 return value
281: C mm6 rshift
282: C mm7 lshift
283:
284: movq (%ebx,%eax,4), %mm0
285: psrlq %mm6, %mm2
286:
287: movq %mm0, %mm1
288: psllq %mm7, %mm0
289:
290: movq %mm3, -8(%edx,%eax,4) C prev
291: por %mm2, %mm0
292:
293: movq 8(%ebx,%eax,4), %mm3 C
294: psrlq %mm6, %mm1 C
295:
296: movq %mm0, (%edx,%eax,4)
297: movq %mm3, %mm2 C
298:
299: psllq %mm7, %mm3 C
300: addl $4, %eax
301:
302: por %mm1, %mm3 C
303: js L(unroll_loop)
304:
305:
306: L(finish):
307: C eax 0 to 3 representing respectively 3 to 0 limbs remaining
308:
309: testb $2, %al C bit 1 set means eax is 2 or 3, so fewer than two limbs remain
310:
311: jnz L(finish_no_two)
312:
313: movq (%ebx,%eax,4), %mm0 C one more src qword, combined the same way as in the unrolled loop
314: psrlq %mm6, %mm2
315:
316: movq %mm0, %mm1
317: psllq %mm7, %mm0
318:
319: movq %mm3, -8(%edx,%eax,4) C prev
320: por %mm2, %mm0
321:
322: movq %mm1, %mm2 C mm2 is again the most recent src qword
323: movq %mm0, %mm3 C mm3 is again the dst qword waiting to be stored
324:
325: addl $2, %eax C two limbs consumed
326: L(finish_no_two):
327:
328:
329: C eax 2 or 3 representing respectively 1 or 0 limbs remaining
330: C
331: C mm2 src prev qword, from -8(%ebx,%eax,4)
332: C mm3 dst qword, for -8(%edx,%eax,4)
333:
334: testb $1, %al C bit 0 set means eax is 3, no limbs remaining
335: popl %edi
336:
337: movd %mm5, %eax C retval
338: jnz L(finish_zero) C still testing the testb result, popl and movd leave the flags alone
339:
340:
341: C One extra limb, destination was aligned.
342: C
343: C source ebx
344: C +-------+---------------+--
345: C | | mm2 |
346: C +-------+---------------+--
347: C
348: C dest edx
349: C +-------+---------------+---------------+--
350: C | | | mm3 |
351: C +-------+---------------+---------------+--
352: C
353: C mm6 = shift
354: C mm7 = ecx = 64-shift
355:
356:
357: C One extra limb, destination was unaligned.
358: C
359: C source ebx
360: C +-------+---------------+--
361: C | | mm2 |
362: C +-------+---------------+--
363: C
364: C dest edx
365: C +---------------+---------------+--
366: C | | mm3 |
367: C +---------------+---------------+--
368: C
369: C mm6 = shift+32
370: C mm7 = ecx = 64-(shift+32)
371:
372:
373: C In both cases there's one extra limb of src to fetch and combine
374: C with mm2 to make a qword at 8(%edx), and in the aligned case
375: C there's a further extra limb of dst to be formed.
376:
377:
378: movd 8(%ebx), %mm0 C the one extra src limb, loaded with a zero high half
379: psrlq %mm6, %mm2
380:
381: movq %mm0, %mm1 C copy kept for the possible extra dst limb
382: psllq %mm7, %mm0
383:
384: movq %mm3, (%edx) C store the pending dst qword
385: por %mm2, %mm0 C qword for 8(%edx)
386:
387: psrlq %mm6, %mm1 C extra top limb, only stored in the aligned case
388: andl $32, %ecx C ecx is 64-shift (aligned) or 32-shift (unaligned); bit 5 separates them assuming 1 <= shift <= 31 -- TODO confirm the permitted shift range
389:
390: popl %ebx C popl leaves the andl flags intact for the jz
391: jz L(finish_one_unaligned)
392:
393: C dst was aligned, must store one extra limb
394: movd %mm1, 16(%edx)
395: L(finish_one_unaligned):
396:
397: movq %mm0, 8(%edx)
398:
399: emms C clear the mmx state before returning
400:
401: ret
402:
403:
404: L(finish_zero):
405:
406: C No extra limbs, destination was aligned.
407: C
408: C source ebx
409: C +---------------+--
410: C | mm2 |
411: C +---------------+--
412: C
413: C dest edx+4
414: C +---------------+---------------+--
415: C | | mm3 |
416: C +---------------+---------------+--
417: C
418: C mm6 = shift
419: C mm7 = ecx = 64-shift
420:
421:
422: C No extra limbs, destination was unaligned.
423: C
424: C source ebx
425: C +---------------+--
426: C | mm2 |
427: C +---------------+--
428: C
429: C dest edx+4
430: C +-------+---------------+--
431: C | | mm3 |
432: C +-------+---------------+--
433: C
434: C mm6 = shift+32
435: C mm7 = 64-(shift+32)
436:
437:
438: C The movd for the unaligned case is clearly the same data as the
439: C movq for the aligned case, it's just a choice between whether one
440: C or two limbs should be written.
441:
442:
443: movq %mm3, 4(%edx) C store the pending dst qword
444: psrlq %mm6, %mm2 C last src qword shifted, zeros come in at the top
445:
446: movd %mm2, 12(%edx) C one limb, sufficient on its own in the unaligned case
447: andl $32, %ecx C bit 5 of ecx is set for 64-shift (aligned) and clear for 32-shift (unaligned), assuming 1 <= shift <= 31 -- TODO confirm the permitted shift range
448:
449: popl %ebx C popl leaves the andl flags intact for the jz
450: jz L(finish_zero_unaligned)
451:
452: movq %mm2, 12(%edx) C aligned case, write both limbs over the same address
453: L(finish_zero_unaligned):
454:
455: emms C clear the mmx state before returning
456:
457: ret
458:
459: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>