Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/rshift.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6 mpn_rshift -- mpn right shift.
2: dnl
3: dnl K6: 3.0 cycles/limb
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
30: C unsigned shift);
31: C
32: C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
33: C instructions. This is despite every second fetch being unaligned.
34:
35:
36: defframe(PARAM_SHIFT,16) C 4th argument: shift count
37: defframe(PARAM_SIZE, 12) C 3rd argument: number of limbs
38: defframe(PARAM_SRC, 8) C 2nd argument: source limb pointer
39: defframe(PARAM_DST, 4) C 1st argument: destination limb pointer
40: deflit(`FRAME',0) C NOTE(review): defframe/deflit come from config.m4 (not shown); the numbers look like byte offsets of the stack arguments at function entry, with FRAME as the running adjustment - confirm
41:
42: .text
43: ALIGN(32) C alignment of the function entry (ALIGN macro not shown here)
44:
45: PROLOGUE(mpn_rshift)
46: deflit(`FRAME',0)
47: C Shift the size-limb number at src right by shift bits, store it at dst, and return in eax the bits shifted out of the low limb, placed at the top of the return limb. ebx is saved and restored; ecx and edx are overwritten, as are mm0 and mm6 on the two_or_more path.
48: C The 1 limb case can be done without the push %ebx, but it's then
49: C still the same speed. The push is left as a free helping hand for
50: C the two_or_more code.
51: C NOTE(review): nothing here checks the arguments; the code appears to assume size >= 1 and 1 <= shift <= 31 - confirm against callers.
52: movl PARAM_SIZE, %eax C eax = size
53: pushl %ebx FRAME_pushl() C save ebx; FRAME_pushl() presumably records the push so PARAM_* offsets stay right (macro not shown)
54:
55: movl PARAM_SRC, %ebx C ebx = src
56: decl %eax C eax = size-1, ZF set when size is 1
57:
58: movl PARAM_SHIFT, %ecx C cl = shift; movl leaves the flags from decl intact
59: jnz L(two_or_more) C taken when size-1 is non-zero
60:
61: movl (%ebx), %edx C src limb
62: movl PARAM_DST, %ebx C ebx = dst, src pointer no longer needed
63:
64: shrdl( %cl, %edx, %eax) C return value: eax is 0 here, so it receives only the bits shifted out of edx, at its top end
65:
66: shrl %cl, %edx C edx = src limb >> shift
67:
68: movl %edx, (%ebx) C dst limb
69: popl %ebx C restore ebx
70:
71: ret
72:
73:
74: ALIGN(16) C avoid offset 0x1f
75: L(two_or_more):
76: C eax size-1
77: C ebx src
78: C ecx shift
79: C edx (not yet set)
80:
81: movl (%ebx), %edx C src low limb
82: negl %ecx C ecx = -shift
83:
84: addl $32, %ecx C 32-shift
85: movd PARAM_SHIFT, %mm6 C mm6 = shift, used as the psrlq count in the loop
86:
87: shll %cl, %edx C retval = low limb << (32-shift)
88: movl PARAM_DST, %ecx C ecx = dst
89:
90: leal (%ebx,%eax,4), %ebx C ebx = &src[size-1]
91:
92: leal -4(%ecx,%eax,4), %ecx C ecx = &dst[size-2]
93: negl %eax C eax = -(size-1), counts up towards 0
94:
95:
96: L(simple):
97: C eax counter (negative)
98: C ebx &src[size-1]
99: C ecx &dst[size-2] (one limb below ebx because dst is indexed after the incl)
100: C edx retval
101: C
102: C mm0 scratch
103: C mm6 shift
104:
105: Zdisp( movq, 0,(%ebx,%eax,4), %mm0) C mm0 = 64-bit load of two adjacent src limbs (Zdisp macro not shown; presumably emits the explicit 0 displacement)
106: incl %eax C advance counter; ZF set on the last pass
107:
108: psrlq %mm6, %mm0 C shift the 64-bit pair right by shift; MMX ops leave the flags from incl alone
109:
110: Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) C store the low 32 bits as the next dst limb
111: jnz L(simple) C loop until the counter reaches 0
112:
113: C Loop finished: mm0 still holds the last shifted pair, built from src[size-2] and src[size-1].
114: movq %mm0, (%ecx) C 64-bit store: rewrites dst[size-2] with the same value and writes dst[size-1] = top limb >> shift
115: movl %edx, %eax C return value
116:
117: popl %ebx C restore ebx
118:
119: emms C clear MMX state before returning to the caller
120: ret
121:
122: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>