dnl  Intel P5 mpn_lshift -- mpn left shift.
dnl
dnl  P5: 1.75 cycles/limb.


dnl  Copyright (C) 2000 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.
25:
include(`../config.m4')


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  Return the bits shifted out at the
C left.
C
C The comments in mpn_rshift apply here too.

C Stack-frame offsets of the four parameters (cdecl, 32-bit x86); FRAME
C tracks how much has been pushed since function entry.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

dnl  minimum 5, because the unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 5)

	.text
	ALIGN(8)
49:
PROLOGUE(mpn_lshift)

	C Callee-saved %ebx and %edi are used; both are restored on every
	C exit path.  Small sizes (< UNROLL_THRESHOLD) take the scalar
	C paths below; larger sizes use the unrolled MMX loop.

	pushl	%ebx
	pushl	%edi
deflit(`FRAME',8)

	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edx

	movl	PARAM_SRC, %ebx
	movl	PARAM_SHIFT, %ecx

	cmp	$UNROLL_THRESHOLD, %eax
	jae	L(unroll)

	movl	-4(%ebx,%eax,4), %edi	C src high limb
	decl	%eax

	jnz	L(simple)

	C size==1: result limb and return value computed with plain shifts.
	shldl(	%cl, %edi, %eax)	C eax was decremented to zero

	shll	%cl, %edi

	movl	%edi, (%edx)		C dst low limb
	popl	%edi			C risk of data cache bank clash

	popl	%ebx

	ret


C -----------------------------------------------------------------------------
L(simple):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	(%ebx,%eax,4), %mm5	C src high limb

	movd	%ecx, %mm6		C lshift
	negl	%ecx

	psllq	%mm6, %mm5
	addl	$32, %ecx

	movd	%ecx, %mm7
	psrlq	$32, %mm5		C retval


L(simple_top):
	C eax	counter, limbs, negative
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0	scratch
	C mm5	return value
	C mm6	shift
	C mm7	32-shift

	C Unaligned 8-byte load covers limbs eax-1 and eax; the right shift
	C by 32-shift leaves the combined (shifted) limb in the low dword.
	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	C MMX shifts don't touch eflags, so the jnz below still tests decl.

	movd	%mm0, 4(%edx,%eax,4)
	jnz	L(simple_top)


	movd	(%ebx), %mm0

	movd	%mm5, %eax
	psllq	%mm6, %mm0

	popl	%edi
	popl	%ebx

	movd	%mm0, (%edx)

	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(8)
L(unroll):
	C eax	size
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	-4(%ebx,%eax,4), %mm5	C src high limb
	leal	(%ebx,%eax,4), %edi

	movd	%ecx, %mm6		C lshift
	andl	$4, %edi

	psllq	%mm6, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process high limb separately (marked xxx) to
	C make it so.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

	movq	-8(%ebx,%eax,4), %mm0	C unaligned load

	psllq	%mm6, %mm0
	decl	%eax

	psrlq	$32, %mm0

	C

	movd	%mm0, (%edx,%eax,4)
L(start_src_aligned):

	movq	-8(%ebx,%eax,4), %mm1	C src high qword
	leal	(%edx,%eax,4), %edi

	andl	$4, %edi
	psrlq	$32, %mm5		C return value

	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
	jz	L(start_dst_aligned)

	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
	C is 32 bits extra.  High limb of dst (marked xxx) handled here
	C separately.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C        0mod8   4mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+-------+--
	C  |  xxx  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	addl	$32, %ecx		C new shift

	psllq	%mm6, %mm0

	movd	%ecx, %mm6
	psrlq	$32, %mm0

	C wasted cycle here waiting for %mm0

	movd	%mm0, -4(%edx,%eax,4)
	subl	$4, %edx
L(start_dst_aligned):


	psllq	%mm6, %mm1
	negl	%ecx			C -shift

	addl	$64, %ecx		C 64-shift
	movq	%mm3, %mm2

	movd	%ecx, %mm7
	subl	$8, %eax		C size-8

	psrlq	%mm7, %mm3

	por	%mm1, %mm3		C mm3 ready to store
	jc	L(finish)


	C The comments in mpn_rshift apply here too.

	ALIGN(8)
L(unroll_loop):
	C eax	counter, limbs
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0
	C mm1
	C mm2	src qword from 48(%ebx,%eax,4)
	C mm3	dst qword ready to store to 56(%edx,%eax,4)
	C
	C mm5	return value
	C mm6	lshift
	C mm7	rshift

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	(%ebx,%eax,4), %mm3	C
	psllq	%mm6, %mm1		C

	movq	%mm0, 16(%edx,%eax,4)
	movq	%mm3, %mm2		C

	psrlq	%mm7, %mm3		C
	subl	$4, %eax

	por	%mm1, %mm3		C
	jnc	L(unroll_loop)



L(finish):
	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining

	testb	$2, %al

	jz	L(finish_no_two)

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	subl	$2, %eax
L(finish_no_two):


	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
	C
	C mm2	src prev qword, from 48(%ebx,%eax,4)
	C mm3	dst qword, for 56(%edx,%eax,4)

	testb	$1, %al
	movd	%mm5, %eax		C retval

	popl	%edi
	jz	L(finish_zero)


	C One extra src limb, destination was aligned.
	C
	C                  source                  ebx
	C                  --+---------------+-------+
	C                    |      mm2      |       |
	C                  --+---------------+-------+
	C
	C dest         edx+12           edx+4     edx
	C --+---------------+---------------+-------+
	C   |      mm3      |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C One extra src limb, destination was unaligned.
	C
	C                  source                  ebx
	C                  --+---------------+-------+
	C                    |      mm2      |       |
	C                  --+---------------+-------+
	C
	C         dest         edx+12           edx+4
	C         --+---------------+---------------+
	C           |      mm3      |               |
	C         --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at 4(%edx), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.


	movd	(%ebx), %mm0
	psllq	%mm6, %mm2

	movq	%mm3, 12(%edx)
	psllq	$32, %mm0

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	psllq	%mm6, %mm1

	movq	%mm0, 4(%edx)
	psrlq	$32, %mm1

	andl	$32, %ecx
	popl	%ebx

	jz	L(finish_one_unaligned)

	movd	%mm1, (%edx)
L(finish_one_unaligned):

	emms

	ret


L(finish_zero):

	C No extra src limbs, destination was aligned.
	C
	C                  source          ebx
	C                  --+---------------+
	C                    |      mm2      |
	C                  --+---------------+
	C
	C dest          edx+8             edx
	C --+---------------+---------------+
	C   |      mm3      |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C No extra src limbs, destination was unaligned.
	C
	C                  source          ebx
	C                  --+---------------+
	C                    |      mm2      |
	C                  --+---------------+
	C
	C         dest          edx+8   edx+4
	C         --+---------------+-------+
	C           |      mm3      |       |
	C         --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movd for the unaligned case writes the same data to 4(%edx)
	C that the movq does for the aligned case.


	movq	%mm3, 8(%edx)
	andl	$32, %ecx

	psllq	%mm6, %mm2
	jz	L(finish_zero_unaligned)

	movq	%mm2, (%edx)
L(finish_zero_unaligned):

	psrlq	$32, %mm2
	popl	%ebx

	movd	%mm5, %eax		C retval

	movd	%mm2, 4(%edx)

	emms

	ret

EPILOGUE()