Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/rshift.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6 mpn_rshift -- mpn right shift.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K6: 3.0 cycles/limb
! 26:
! 27:
1.1 maekawa 28: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C unsigned shift);
30: C Stores src[0..size-1] shifted right by shift bits at dst[0..size-1]; the return value is src[0] << (32-shift), i.e. the bits shifted out at the low end.
31: C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
32: C instructions. This is despite every second fetch being unaligned.
33: C NOTE(review): size == 0 would take the two_or_more path and read src[0], so size >= 1 is relied on; presumably 1 <= shift <= 31 as well -- confirm against callers.
34:
35: defframe(PARAM_SHIFT,16)
36: defframe(PARAM_SIZE, 12)
37: defframe(PARAM_SRC, 8)
38: defframe(PARAM_DST, 4)
39: deflit(`FRAME',0)
40:
1.1.1.2 ! ohara 41: TEXT
1.1 maekawa 42: ALIGN(32)
43:
44: PROLOGUE(mpn_rshift)
45: deflit(`FRAME',0)
46:
47: C The 1 limb case can be done without the push %ebx, but it's then
48: C still the same speed. The push is left as a free helping hand for
49: C the two_or_more code.
50:
51: movl PARAM_SIZE, %eax C eax = size
52: pushl %ebx FRAME_pushl() C ebx is restored by the popl before each ret
53:
54: movl PARAM_SRC, %ebx C ebx = src
55: decl %eax C eax = size-1, ZF set when size is 1
56:
57: movl PARAM_SHIFT, %ecx C movl leaves the flags from decl untouched
58: jnz L(two_or_more) C taken when size-1 is non-zero
59:
60: movl (%ebx), %edx C src limb
61: movl PARAM_DST, %ebx C ebx = dst, the src pointer is no longer needed
62:
63: shrdl( %cl, %edx, %eax) C return value: eax is 0 here and receives the low bits of edx at its top end
64:
65: shrl %cl, %edx C edx = src limb >> shift
66:
67: movl %edx, (%ebx) C dst limb
68: popl %ebx
69:
70: ret
71:
72:
73: ALIGN(16) C avoid offset 0x1f
74: L(two_or_more):
75: C eax size-1
76: C ebx src
77: C ecx shift
78: C edx
79:
80: movl (%ebx), %edx C src low limb
81: negl %ecx
82:
83: addl $32, %ecx C 32-shift
84: movd PARAM_SHIFT, %mm6 C mm6 = shift count used by psrlq
85:
86: shll %cl, %edx C retval = src[0] << (32-shift)
87: movl PARAM_DST, %ecx C ecx = dst, the 32-shift count is no longer needed
88:
89: leal (%ebx,%eax,4), %ebx C ebx = &src[size-1]
90:
91: leal -4(%ecx,%eax,4), %ecx C ecx = &dst[size-2]
92: negl %eax C eax = -(size-1), counts up towards zero
93:
94:
95: L(simple):
96: C eax counter (negative)
97: C ebx &src[size-1]
98: C ecx &dst[size-2]
99: C edx retval
100: C
101: C mm0 scratch
102: C mm6 shift
103:
104: Zdisp( movq, 0,(%ebx,%eax,4), %mm0) C mm0 = two adjacent src limbs, starting at index size-1+eax
105: incl %eax C sets the flags tested by the jnz below
106:
107: psrlq %mm6, %mm0 C low dword of mm0 is now a finished dst limb
108:
109: Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) C store that dst limb
110: jnz L(simple) C loop until the counter reaches zero
111:
112:
113: movq %mm0, (%ecx) C mm0 still holds the last shifted pair: rewrites dst[size-2] and stores the high limb dst[size-1]
114: movl %edx, %eax C return value
115:
116: popl %ebx
117:
118: emms C leave MMX state before returning
119: ret
120:
121: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>