Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/rshift.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K7 mpn_rshift -- mpn right shift.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K7: 1.21 cycles/limb (at 16 limbs/loop).
! 26:
! 27:
! 28:
1.1 maekawa 29: dnl K7: UNROLL_COUNT cycles/limb
30: dnl 4 1.51
31: dnl 8 1.26
32: dnl 16 1.21
33: dnl 32 1.2
34: dnl Maximum possible with the current code is 64.
35:
36: deflit(UNROLL_COUNT, 16) C limbs handled per pass of the unrolled loop at L(top); see the timing table above
37:
38:
39: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
40: C unsigned shift);
41: C
42: C Shift src,size right by shift many bits and store the result in dst,size.
43: C Zeros are shifted in at the left. The bits shifted out at the right are
44: C the return value.
45: C
46: C This code uses 64-bit MMX operations, which makes it possible to handle
47: C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
48: C code, on the other hand, suffers from shrd being a vector path decode and
49: C running at 3 cycles back-to-back.
50: C
51: C Full speed depends on source and destination being aligned, and some hairy
52: C setups and finish-ups are done to arrange this for the loop.
53:
54: ifdef(`PIC',`
55: deflit(UNROLL_THRESHOLD, 10) C PIC build: sizes of 10 limbs or more take the unrolled loop
56: ',`
57: deflit(UNROLL_THRESHOLD, 10) C non-PIC build: currently the same threshold as the PIC case
58: ')
59: C Incoming arguments, in prototype order (dst, src, size, shift). NOTE(review): defframe is defined outside this file; offsets presumably count from esp at entry.
60: defframe(PARAM_SHIFT,16)
61: defframe(PARAM_SIZE, 12)
62: defframe(PARAM_SRC, 8)
63: defframe(PARAM_DST, 4)
64: C Slots for the callee-saved registers this routine uses, inside the SAVE_SIZE bytes taken from the stack at entry.
65: defframe(SAVE_EDI, -4)
66: defframe(SAVE_ESI, -8)
67: defframe(SAVE_EBX, -12)
68: deflit(SAVE_SIZE, 12) C three 4-byte save slots
69:
1.1.1.2 ! ohara 70: TEXT
1.1 maekawa 71: ALIGN(32)
72: C Entry: load the arguments, then handle size==1 with plain integer code.
73: PROLOGUE(mpn_rshift)
74: deflit(`FRAME',0)
75:
76: movl PARAM_SIZE, %eax C eax = size in limbs
77: movl PARAM_SRC, %edx C edx = src
78: subl $SAVE_SIZE, %esp C room for the SAVE_EDI/SAVE_ESI/SAVE_EBX slots
79: deflit(`FRAME',SAVE_SIZE) C NOTE(review): presumably keeps the defframe offsets in step with the moved esp; FRAME is interpreted outside this file
80:
81: movl PARAM_SHIFT, %ecx C ecx = shift count, used through cl below
82: movl %edi, SAVE_EDI C reloaded before each ret of the function
83:
84: movl PARAM_DST, %edi C edi = dst
85: decl %eax C eax = size-1, zero exactly when size is 1
86: jnz L(more_than_one_limb)
87:
88: movl (%edx), %edx C src limb
89:
90: shrdl( %cl, %edx, %eax) C eax was decremented to zero, so it ends up holding just the bits shifted out of the limb: the return value
91:
92: shrl %cl, %edx C the limb itself, zeros shifted in at the top
93:
94: movl %edx, (%edi) C dst limb
95: movl SAVE_EDI, %edi
96: addl $SAVE_SIZE, %esp C no emms on this path, no MMX instruction has run yet
97:
98: ret
99:
100:
101: C -----------------------------------------------------------------------------
102: L(more_than_one_limb):
103: C eax size-1
104: C ebx
105: C ecx shift
106: C edx src
107: C esi
108: C edi dst
109: C ebp
110: C Sizes below UNROLL_THRESHOLD use the simple loop that follows; larger ones branch to L(unroll).
111: movd PARAM_SHIFT, %mm6 C rshift
112: movd (%edx), %mm5 C src low limb
113: cmp $UNROLL_THRESHOLD-1, %eax C eax is size-1, so this compares size with UNROLL_THRESHOLD
114:
115: jae L(unroll) C unsigned size >= UNROLL_THRESHOLD
116: leal (%edx,%eax,4), %edx C &src[size-1]
117: leal -4(%edi,%eax,4), %edi C &dst[size-2]
118:
119: movd (%edx), %mm4 C src high limb
120: negl %eax C -(size-1), counted up to zero by the loop
121:
122:
123: L(simple_top):
124: C eax loop counter, limbs, negative
125: C ebx
126: C ecx shift
127: C edx &src[size-1]
128: C esi
129: C edi &dst[size-2]
130: C ebp
131: C
132: C mm0 scratch
133: C mm4 src high limb
134: C mm5 src low limb
135: C mm6 shift
136: C Each pass loads two adjacent src limbs as one qword, shifts it, and stores the low dword as one dst limb.
137: movq (%edx,%eax,4), %mm0 C the src limb for this dst limb together with the limb above it
138: incl %eax C sets ZF for the jnz; the MMX instructions in between do not touch the flags
139:
140: psrlq %mm6, %mm0 C low dword now holds the finished dst limb
141:
142: movd %mm0, (%edi,%eax,4) C eax is already incremented, hence the dst[size-2] base
143: jnz L(simple_top)
144:
145: C Loop done, size-1 limbs stored: the high dst limb and the return value remain.
146: psllq $32, %mm5 C src low limb up into the high dword
147: psrlq %mm6, %mm4 C high limb shifted, zeros coming in at the top
148:
149: psrlq %mm6, %mm5 C low dword = bits shifted out of the low limb
150: movd %mm4, 4(%edi) C dst high limb
151:
152: movd %mm5, %eax C return value
153:
154: movl SAVE_EDI, %edi
155: addl $SAVE_SIZE, %esp
156: emms C leave MMX mode before returning to the caller
157:
158: ret
159:
160:
161: C -----------------------------------------------------------------------------
162: ALIGN(16)
163: L(unroll):
164: C eax size-1
165: C ebx
166: C ecx shift
167: C edx src
168: C esi
169: C edi dst
170: C ebp
171: C
172: C mm5 src low limb
173: C mm6 rshift
174: C Setup for the unrolled loop: align src, then dst, to 8 bytes and compute where to enter the loop.
175: testb $4, %dl C src at 4 mod 8? The movl and psllq below leave the flags for the jz
176: movl %esi, SAVE_ESI
177: movl %ebx, SAVE_EBX
178:
179: psllq $32, %mm5 C src low limb to the high dword, shifted into the return value further down
180: jz L(start_src_aligned)
181:
182:
183: C src isn't aligned, process low limb separately (marked xxx) and
184: C step src and dst by one limb, making src aligned.
185: C
186: C source edx
187: C --+-------+-------+-------+
188: C | xxx |
189: C --+-------+-------+-------+
190: C 4mod8 0mod8 4mod8
191: C
192: C dest edi
193: C --+-------+-------+
194: C | | xxx |
195: C --+-------+-------+
196:
197: movq (%edx), %mm0 C src low two limbs
198: addl $4, %edx
199: movl %eax, PARAM_SIZE C size-1, replaces the argument; reloaded later for the odd/even test
200:
201: addl $4, %edi
202: decl %eax C size-2 is new size-1
203:
204: psrlq %mm6, %mm0
205: movl %edi, PARAM_DST C new dst
206:
207: movd %mm0, -4(%edi) C the low dst limb, written on its own
208: L(start_src_aligned):
209:
210:
211: movq (%edx), %mm1 C src low two limbs
212: decl %eax C size-2, two last limbs handled at end
213: testl $4, %edi C dst at 4 mod 8? The psrlq below leaves the flags for the jz
214:
215: psrlq %mm6, %mm5 C return value now in the low dword; mm6 still holds the plain shift here
216: jz L(start_dst_aligned)
217:
218:
219: C dst isn't aligned, add 4 to make it so, and pretend the shift is
220: C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
221: C
222: C source edx
223: C --+-------+-------+
224: C | mm1 |
225: C --+-------+-------+
226: C 4mod8 0mod8
227: C
228: C dest edi
229: C --+-------+-------+-------+
230: C | xxx |
231: C --+-------+-------+-------+
232: C 4mod8 0mod8 4mod8
233:
234: movq %mm1, %mm0 C keep an unshifted copy
235: psrlq %mm6, %mm1
236: addl $32, %ecx C shift+32
237:
238: movd %mm1, (%edi) C the low dst limb, written on its own
239: movq %mm0, %mm1 C unshifted qword back in mm1 as the carry
240: addl $4, %edi C new dst
241:
242: movd %ecx, %mm6 C mm6 = shift+32 from here on
243: L(start_dst_aligned):
244:
245:
246: movq %mm1, %mm2 C copy of src low two limbs
247: negl %ecx C first half of ecx = 64-rshift, finished by the addl below
248: andl $-2, %eax C round size down to even
249:
250: movl %eax, %ebx C ebx starts the loop counter calculation
251: negl %eax
252: addl $64, %ecx
253:
254: andl $UNROLL_MASK, %eax C limbs to skip in the first pass; NOTE(review): assumes UNROLL_MASK is UNROLL_COUNT-1, defined outside this file
255: decl %ebx
256:
257: shll %eax C doubled here and multiplied by 5 below: 10 code bytes per limb skipped
258:
259: movd %ecx, %mm7 C lshift = 64-rshift
260:
261: ifdef(`PIC',`
262: call L(pic_calc) C sets esi to the computed entry point and negates eax
263: L(here):
264: ',`
265: leal L(entry) (%eax,%eax,4), %esi C computed entry point: L(entry) plus 10 bytes per limb skipped
266: negl %eax C negative again, to pull edx and edi back for the skipped limbs
267: ')
268: shrl $UNROLL_LOG2, %ebx C loop counter
269:
270: leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx C the extra 8 steps past the qword already held in mm1 and mm2
271: leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi C eax is minus twice the limbs skipped, so this backs up 4 bytes per limb
272: movl PARAM_SIZE, %eax C for use at end
273:
274: jmp *%esi C enter the unrolled code part-way through its first pass
275:
276:
277: ifdef(`PIC',`
278: L(pic_calc):
1.1.1.2 ! ohara 279: C See mpn/x86/README about old gas bugs
1.1 maekawa 280: leal (%eax,%eax,4), %esi C 10 bytes per limb skipped, eax having been doubled already
281: addl $L(entry)-L(here), %esi C plus the distance from L(here) to L(entry)
282: addl (%esp), %esi C plus the return address, which is L(here)
283: negl %eax C same negation as in the non-PIC branch
284:
285: ret
286: ')
287:
288:
289: C -----------------------------------------------------------------------------
290: ALIGN(64)
291: L(top):
292: C eax size, for use at end
293: C ebx loop counter
294: C ecx lshift
295: C edx src
296: C esi was computed jump
297: C edi dst
298: C ebp
299: C
300: C mm0 scratch
301: C mm1 \ carry (alternating)
302: C mm2 /
303: C mm6 rshift
304: C mm7 lshift
305: C
306: C 10 code bytes/limb
307: C
308: C The two chunks differ in whether mm1 or mm2 hold the carry.
309: C The computed jump puts the initial carry in both mm1 and mm2.
310: C Each dst qword is (previous src qword >> rshift) | (next src qword << lshift).
311: L(entry):
312: deflit(CHUNK_COUNT, 4) C limbs per repetition of the macro body below, i.e. two qwords
313: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
314: deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
315: deflit(`disp1', eval(disp0 + 8))
316:
1.1.1.2 ! ohara 317: Zdisp( movq, disp0,(%edx), %mm0) C next src qword
1.1 maekawa 318: psrlq %mm6, %mm2 C carry: previous qword shifted down
319:
320: movq %mm0, %mm1 C unshifted copy becomes the next carry
321: psllq %mm7, %mm0 C low bits of the new qword moved up to the top
322:
323: por %mm2, %mm0
1.1.1.2 ! ohara 324: Zdisp( movq, %mm0, disp0,(%edi)) C one finished dst qword
1.1 maekawa 325:
326:
1.1.1.2 ! ohara 327: Zdisp( movq, disp1,(%edx), %mm0) C same steps again with the roles of mm1 and mm2 swapped
1.1 maekawa 328: psrlq %mm6, %mm1
329:
330: movq %mm0, %mm2
331: psllq %mm7, %mm0
332:
333: por %mm1, %mm0
1.1.1.2 ! ohara 334: Zdisp( movq, %mm0, disp1,(%edi))
1.1 maekawa 335: ')
336:
337: addl $UNROLL_BYTES, %edx C step both pointers by one full pass; UNROLL_BYTES is defined outside this file
338: addl $UNROLL_BYTES, %edi
339: decl %ebx
340:
341: jns L(top) C ebx counts down through zero, the loop ends once it goes negative
342:
343:
344: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) C -128 undoes the +128 pointer bias applied before the loop when UNROLL_BYTES is 256
345: deflit(`disp1', eval(disp0-0 + 8)) C the -0 keeps eval well formed when disp0 expands to nothing
346:
347: testb $1, %al C odd or even size? eax was reloaded from PARAM_SIZE; flags survive to the jz below
348: psrlq %mm6, %mm2 C wanted rshifted in all cases below
349: movl SAVE_ESI, %esi
350:
351: movd %mm5, %eax C return value
352:
353: movl SAVE_EBX, %ebx
354: jz L(end_even)
355:
356:
357: C Size odd, destination was aligned.
358: C
359: C source
360: C edx
361: C +-------+---------------+--
362: C | | mm2 |
363: C +-------+---------------+--
364: C
365: C dest edi
366: C +-------+---------------+---------------+--
367: C | | | written |
368: C +-------+---------------+---------------+--
369: C
370: C mm6 = shift
371: C mm7 = ecx = 64-shift
372:
373:
374: C Size odd, destination was unaligned.
375: C
376: C source
377: C edx
378: C +-------+---------------+--
379: C | | mm2 |
380: C +-------+---------------+--
381: C
382: C dest edi
383: C +---------------+---------------+--
384: C | | written |
385: C +---------------+---------------+--
386: C
387: C mm6 = shift+32
388: C mm7 = ecx = 64-(shift+32)
389:
390:
391: C In both cases there's one extra limb of src to fetch and combine
392: C with mm2 to make a qword to store, and in the aligned case there's
393: C a further extra limb of dst to be formed.
394:
395:
396: movd disp0(%edx), %mm0 C the one src limb not yet read
397: movq %mm0, %mm1 C copy, used for the extra dst limb in the aligned case
398:
399: psllq %mm7, %mm0
400: testb $32, %cl C ecx is 64-rshift: bit 5 set means dst was aligned, clear means the shift+32 variant. NOTE(review): relies on shift being 1 to 31, not checked in this file
401:
402: por %mm2, %mm0
403: psrlq %mm6, %mm1
404:
405: movq %mm0, disp0(%edi) C qword store, done in both cases
406: jz L(finish_odd_unaligned) C still the flags from the testb, the MMX instructions do not change them
407:
408: movd %mm1, disp1(%edi) C aligned case only: the further extra dst limb
409: L(finish_odd_unaligned):
410:
411: movl SAVE_EDI, %edi
412: addl $SAVE_SIZE, %esp
413: emms C leave MMX mode before returning to the caller
414:
415: ret
416:
417:
418: L(end_even):
419:
420: C Size even, destination was aligned.
421: C
422: C source
423: C +---------------+--
424: C | mm2 |
425: C +---------------+--
426: C
427: C dest edi
428: C +---------------+---------------+--
429: C | | mm3 |
430: C +---------------+---------------+--
431: C
432: C mm6 = shift
433: C mm7 = ecx = 64-shift
434:
435:
436: C Size even, destination was unaligned.
437: C
438: C source
439: C +---------------+--
440: C | mm2 |
441: C +---------------+--
442: C
443: C dest edi
444: C +-------+---------------+--
445: C | | mm3 |
446: C +-------+---------------+--
447: C
448: C mm6 = shift+32
449: C mm7 = 64-(shift+32)
450:
451:
452: C The movd for the unaligned case is the same data as the movq for
453: C the aligned case, it's just a choice between whether one or two
454: C limbs should be written.
455: C NOTE(review): the diagrams label the dst data mm3, but the stores below write mm2 (already rshifted before the branch here); mm3 looks stale.
456:
457: testb $32, %cl C ecx is 64-rshift: bit 5 set means dst was aligned. NOTE(review): relies on shift being 1 to 31, not checked in this file
458: movd %mm2, disp0(%edi) C one limb, all that the unaligned case needs
459:
460: jz L(end_even_unaligned) C flags from the testb, the movd does not change them
461:
462: movq %mm2, disp0(%edi) C aligned case: write both limbs, covering the movd above
463: L(end_even_unaligned):
464:
465: movl SAVE_EDI, %edi
466: addl $SAVE_SIZE, %esp
467: emms C leave MMX mode before returning to the caller
468:
469: ret
470:
471: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>