Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/lshift.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl Intel P5 mpn_lshift -- mpn left shift.
2:
1.1.1.2 ! ohara 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C P5: 1.75 cycles/limb.
! 26:
! 27:
1.1 maekawa 28: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C unsigned shift);
30: C
31: C Shift src,size left by shift many bits and store the result in dst,size.
32: C Zeros are shifted in at the right. Return the bits shifted out at the
33: C left.
34: C
35: C The comments in mpn_rshift apply here too.
36:
dnl  Stack offsets of the C parameters relative to esp at function entry;
dnl  FRAME tracks extra bytes pushed since entry (0 here, 8 after the two
dnl  pushes in the prologue).
37: defframe(PARAM_SHIFT,16)
38: defframe(PARAM_SIZE, 12)
39: defframe(PARAM_SRC, 8)
40: defframe(PARAM_DST, 4)
41: deflit(`FRAME',0)
42:
43: dnl minimum 5, because the unrolled loop can't handle less
44: deflit(UNROLL_THRESHOLD, 5)
45:
1.1.1.2 ! ohara 46: TEXT
1.1 maekawa 47: ALIGN(8)
48:
C Calling convention: cdecl, arguments on the stack at the defframe
C offsets above; the return value (the bits shifted out at the left of
C the high limb) comes back in eax.  ebx and edi are the only
C callee-saved registers used, saved by the two pushes below.
49: PROLOGUE(mpn_lshift)
50:
51: pushl %ebx
52: pushl %edi
53: deflit(`FRAME',8)
54:
55: movl PARAM_SIZE, %eax
56: movl PARAM_DST, %edx
57:
58: movl PARAM_SRC, %ebx
59: movl PARAM_SHIFT, %ecx
60:
61: cmp $UNROLL_THRESHOLD, %eax
62: jae L(unroll)
63:
C Fewer than UNROLL_THRESHOLD limbs: size==1 is handled inline just
C below, sizes 2-4 go through the plain loop at L(simple).
64: movl -4(%ebx,%eax,4), %edi C src high limb
65: decl %eax
66:
67: jnz L(simple)
68:
C size==1: eax is now zero, so this shldl deposits exactly the bits
C shifted out of the single src limb into eax, ie. the return value.
69: shldl( %cl, %edi, %eax) C eax was decremented to zero
70:
71: shll %cl, %edi
72:
73: movl %edi, (%edx) C dst low limb
74: popl %edi C risk of data cache bank clash
75:
76: popl %ebx
77:
78: ret
79:
80:
81: C -----------------------------------------------------------------------------
82: L(simple):
83: C eax size-1
84: C ebx src
85: C ecx shift
86: C edx dst
87: C esi
88: C edi
89: C ebp
90: deflit(`FRAME',8)
91:
92: movd (%ebx,%eax,4), %mm5 C src high limb
93:
94: movd %ecx, %mm6 C lshift
95: negl %ecx
96:
97: psllq %mm6, %mm5
98: addl $32, %ecx
99:
100: movd %ecx, %mm7
101: psrlq $32, %mm5 C retval
102:
103:
104: L(simple_top):
105: C eax counter, limbs, counting down to zero
106: C ebx src
107: C ecx
108: C edx dst
109: C esi
110: C edi
111: C
112: C mm0 scratch
113: C mm5 return value
114: C mm6 shift
115: C mm7 32-shift
116:
C The 64-bit load picks up limbs eax-1 and eax; shifting right by
C 32-shift leaves dst limb eax in the low dword of mm0.  MMX shifts
C and moves don't touch eflags, so the jnz below still tests the decl.
117: movq -4(%ebx,%eax,4), %mm0
118: decl %eax
119:
120: psrlq %mm7, %mm0
121:
122: C
123:
124: movd %mm0, 4(%edx,%eax,4)
125: jnz L(simple_top)
126:
127:
128: movd (%ebx), %mm0
129:
130: movd %mm5, %eax
131: psllq %mm6, %mm0
132:
133: popl %edi
134: popl %ebx
135:
136: movd %mm0, (%edx)
137:
138: emms
139:
140: ret
141:
142:
143: C -----------------------------------------------------------------------------
144: ALIGN(8)
145: L(unroll):
146: C eax size
147: C ebx src
148: C ecx shift
149: C edx dst
150: C esi
151: C edi
152: C ebp
153: deflit(`FRAME',8)
154:
155: movd -4(%ebx,%eax,4), %mm5 C src high limb
156: leal (%ebx,%eax,4), %edi
157:
158: movd %ecx, %mm6 C lshift
159: andl $4, %edi
160:
161: psllq %mm6, %mm5
162: jz L(start_src_aligned)
163:
164:
165: C src isn't aligned, process high limb separately (marked xxx) to
166: C make it so.
167: C
168: C source -8(ebx,%eax,4)
169: C |
170: C +-------+-------+-------+--
171: C | |
172: C +-------+-------+-------+--
173: C 0mod8 4mod8 0mod8
174: C
175: C dest
176: C -4(edx,%eax,4)
177: C |
178: C +-------+-------+--
179: C | xxx | |
180: C +-------+-------+--
181:
182: movq -8(%ebx,%eax,4), %mm0 C unaligned load
183:
184: psllq %mm6, %mm0
185: decl %eax
186:
187: psrlq $32, %mm0
188:
189: C
190:
191: movd %mm0, (%edx,%eax,4)
192: L(start_src_aligned):
193:
194: movq -8(%ebx,%eax,4), %mm1 C src high qword
195: leal (%edx,%eax,4), %edi
196:
197: andl $4, %edi
198: psrlq $32, %mm5 C return value
199:
200: movq -16(%ebx,%eax,4), %mm3 C src second highest qword
201: jz L(start_dst_aligned)
202:
203: C dst isn't aligned, subtract 4 to make it so, and pretend the shift
204: C is 32 bits extra. High limb of dst (marked xxx) handled here
205: C separately.
206: C
207: C source -8(ebx,%eax,4)
208: C |
209: C +-------+-------+--
210: C | mm1 |
211: C +-------+-------+--
212: C 0mod8 4mod8
213: C
214: C dest
215: C -4(edx,%eax,4)
216: C |
217: C +-------+-------+-------+--
218: C | xxx | |
219: C +-------+-------+-------+--
220: C 0mod8 4mod8 0mod8
221:
222: movq %mm1, %mm0
223: addl $32, %ecx C new shift
224:
225: psllq %mm6, %mm0
226:
227: movd %ecx, %mm6
228: psrlq $32, %mm0
229:
230: C wasted cycle here waiting for %mm0
231:
232: movd %mm0, -4(%edx,%eax,4)
233: subl $4, %edx
234: L(start_dst_aligned):
235:
236:
237: psllq %mm6, %mm1
238: negl %ecx C -shift
239:
240: addl $64, %ecx C 64-shift
241: movq %mm3, %mm2
242:
243: movd %ecx, %mm7
244: subl $8, %eax C size-8
245:
246: psrlq %mm7, %mm3
247:
C MMX instructions don't affect eflags, so the jc below still tests the
C subl $8 above: carry means fewer than 8 limbs remain after the
C alignment adjustments, and the unrolled loop is skipped entirely.
248: por %mm1, %mm3 C mm3 ready to store
249: jc L(finish)
250:
251:
252: C The comments in mpn_rshift apply here too.
253:
254: ALIGN(8)
255: L(unroll_loop):
256: C eax counter, limbs
257: C ebx src
258: C ecx
259: C edx dst
260: C esi
261: C edi
262: C
263: C mm0
264: C mm1
1.1.1.2 ! ohara 265: C mm2 src qword from 16(%ebx,%eax,4)
! 266: C mm3 dst qword ready to store to 24(%edx,%eax,4)
1.1 maekawa 267: C
268: C mm5 return value
269: C mm6 lshift
270: C mm7 rshift
271:
C Each iteration shifts and stores two qwords (4 limbs), stepping eax
C down by 4 until the subl carries (eax goes below zero).
272: movq 8(%ebx,%eax,4), %mm0
273: psllq %mm6, %mm2
274:
275: movq %mm0, %mm1
276: psrlq %mm7, %mm0
277:
278: movq %mm3, 24(%edx,%eax,4) C prev
279: por %mm2, %mm0
280:
281: movq (%ebx,%eax,4), %mm3 C
282: psllq %mm6, %mm1 C
283:
284: movq %mm0, 16(%edx,%eax,4)
285: movq %mm3, %mm2 C
286:
287: psrlq %mm7, %mm3 C
288: subl $4, %eax
289:
290: por %mm1, %mm3 C
291: jnc L(unroll_loop)
292:
293:
294:
295: L(finish):
296: C eax -4 to -1 representing respectively 0 to 3 limbs remaining
297:
298: testb $2, %al
299:
300: jz L(finish_no_two)
301:
302: movq 8(%ebx,%eax,4), %mm0
303: psllq %mm6, %mm2
304:
305: movq %mm0, %mm1
306: psrlq %mm7, %mm0
307:
308: movq %mm3, 24(%edx,%eax,4) C prev
309: por %mm2, %mm0
310:
311: movq %mm1, %mm2
312: movq %mm0, %mm3
313:
314: subl $2, %eax
315: L(finish_no_two):
316:
317:
318: C eax -4 or -3 representing respectively 0 or 1 limbs remaining
319: C
1.1.1.2 ! ohara 320: C mm2 src prev qword, from 16(%ebx,%eax,4)
! 321: C mm3 dst qword, for 24(%edx,%eax,4)
1.1 maekawa 322:
323: testb $1, %al
324: movd %mm5, %eax C retval
325:
326: popl %edi
327: jz L(finish_zero)
328:
329:
330: C One extra src limb, destination was aligned.
331: C
332: C source ebx
333: C --+---------------+-------+
334: C | mm2 | |
335: C --+---------------+-------+
336: C
337: C dest edx+12 edx+4 edx
338: C --+---------------+---------------+-------+
339: C | mm3 | | |
340: C --+---------------+---------------+-------+
341: C
342: C mm6 = shift
343: C mm7 = ecx = 64-shift
344:
345:
346: C One extra src limb, destination was unaligned.
347: C
348: C source ebx
349: C --+---------------+-------+
350: C | mm2 | |
351: C --+---------------+-------+
352: C
353: C dest edx+12 edx+4
354: C --+---------------+---------------+
355: C | mm3 | |
356: C --+---------------+---------------+
357: C
358: C mm6 = shift+32
359: C mm7 = ecx = 64-(shift+32)
360:
361:
362: C In both cases there's one extra limb of src to fetch and combine
363: C with mm2 to make a qword at 4(%edx), and in the aligned case
364: C there's an extra limb of dst to be formed from that extra src limb
365: C left shifted.
366:
367:
368: movd (%ebx), %mm0
369: psllq %mm6, %mm2
370:
371: movq %mm3, 12(%edx)
372: psllq $32, %mm0
373:
374: movq %mm0, %mm1
375: psrlq %mm7, %mm0
376:
377: por %mm2, %mm0
378: psllq %mm6, %mm1
379:
380: movq %mm0, 4(%edx)
381: psrlq $32, %mm1
382:
C ecx still holds 64-shift (or 64-(shift+32)); bit 5 distinguishes the
C aligned case, which needs the extra low dst limb stored.
383: andl $32, %ecx
384: popl %ebx
385:
386: jz L(finish_one_unaligned)
387:
388: movd %mm1, (%edx)
389: L(finish_one_unaligned):
390:
391: emms
392:
393: ret
394:
395:
396: L(finish_zero):
397:
398: C No extra src limbs, destination was aligned.
399: C
400: C source ebx
401: C --+---------------+
402: C | mm2 |
403: C --+---------------+
404: C
405: C dest edx+8 edx
406: C --+---------------+---------------+
407: C | mm3 | |
408: C --+---------------+---------------+
409: C
410: C mm6 = shift
411: C mm7 = ecx = 64-shift
412:
413:
414: C No extra src limbs, destination was unaligned.
415: C
416: C source ebx
417: C --+---------------+
418: C | mm2 |
419: C --+---------------+
420: C
421: C dest edx+8 edx+4
422: C --+---------------+-------+
423: C | mm3 | |
424: C --+---------------+-------+
425: C
426: C mm6 = shift+32
427: C mm7 = ecx = 64-(shift+32)
428:
429:
430: C The movd for the unaligned case writes the same data to 4(%edx)
431: C that the movq does for the aligned case.
432:
433:
434: movq %mm3, 8(%edx)
435: andl $32, %ecx
436:
437: psllq %mm6, %mm2
438: jz L(finish_zero_unaligned)
439:
440: movq %mm2, (%edx)
441: L(finish_zero_unaligned):
442:
443: psrlq $32, %mm2
444: popl %ebx
445:
446: movd %mm5, %eax C retval
447:
448: movd %mm2, 4(%edx)
449:
450: emms
451:
452: ret
453:
454: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>