Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/lshift.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6 mpn_lshift -- mpn left shift.
2: dnl
3: dnl K6: 3.0 cycles/limb
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
30: C unsigned shift);
31: C
C Shift the size limbs at src left by shift bits, store the size result
C limbs at dst, and return the bits shifted out of the top limb, i.e.
C src[size-1] >> (32-shift).  Limbs are 32 bits wide and are processed
C from the highest index down to index 0.
C
C NOTE(review): nothing here checks the arguments.  The code appears to
C assume size >= 1 (size == 0 would enter the loop with a counter of -1)
C and 1 <= shift <= 31 -- confirm against the callers.
C
32: C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
33: C instructions. This is despite every second fetch being unaligned.
34:
35:
36: defframe(PARAM_SHIFT,16)
37: defframe(PARAM_SIZE, 12)
38: defframe(PARAM_SRC, 8)
39: defframe(PARAM_DST, 4)
40:
41: .text
42: ALIGN(32)
43:
44: PROLOGUE(mpn_lshift)
45: deflit(`FRAME',0)
46:
47: C The 1 limb case can be done without the push %ebx, but it's then
48: C still the same speed. The push is left as a free helping hand for
49: C the two_or_more code.
C
C %ebx is saved here and restored by a popl on both return paths.
50:
51: movl PARAM_SIZE, %eax C eax = size
52: pushl %ebx FRAME_pushl()
53:
54: movl PARAM_SRC, %ebx C ebx = src
55: decl %eax C eax = size-1, ZF set when size == 1
56:
57: movl PARAM_SHIFT, %ecx C ecx = shift (movl leaves the decl flags alone)
58: jnz L(two_or_more) C tests the decl result: taken when size-1 != 0
59:
C Single limb: eax is 0 at this point (size-1).
60: movl (%ebx), %edx C src limb
61: movl PARAM_DST, %ebx C ebx = dst, src pointer no longer needed
62:
63: shldl( %cl, %edx, %eax) C return value: top shift bits of edx moved into eax
64:
65: shll %cl, %edx C edx = src limb << shift
66:
67: movl %edx, (%ebx) C dst limb
68: popl %ebx C restore the saved ebx
69:
70: ret
71:
72:
73: ALIGN(16) C avoid offset 0x1f
74: nop C avoid bad cache line crossing
75: L(two_or_more):
76: C eax size-1
77: C ebx src
78: C ecx shift
79: C edx
C
C Set up for the loop: edx gets the return value (computed from the high
C limb before anything is stored), mm6 = shift, mm7 = 32-shift, ecx = dst.
80:
81: movl (%ebx,%eax,4), %edx C src high limb
82: negl %ecx C ecx = -shift
83:
84: movd PARAM_SHIFT, %mm6 C mm6 = shift, read from the stack as ecx is being changed
85: addl $32, %ecx C 32-shift
86:
87: shrl %cl, %edx C edx = high limb >> (32-shift), the return value
88:
89: movd %ecx, %mm7 C mm7 = 32-shift
90: movl PARAM_DST, %ecx C ecx = dst
91:
92: L(top):
93: C eax counter, size-1 to 1
94: C ebx src
95: C ecx dst
96: C edx retval
97: C
98: C mm0 scratch
99: C mm6 shift
100: C mm7 32-shift
C
C Each pass, with i the value of eax on entry, produces
C dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift))
C by loading src[i-1],src[i] as one 64-bit value and shifting it right
C by 32-shift, then keeping the low 32 bits.
101:
102: movq -4(%ebx,%eax,4), %mm0 C mm0 = src[i] (high half) : src[i-1] (low half)
103: decl %eax C eax = i-1, sets the flags tested by jnz below
104:
105: psrlq %mm7, %mm0 C low 32 bits of mm0 are now the dst[i] value
106:
107: movd %mm0, 4(%ecx,%eax,4) C store dst[i]; +4 compensates for the decl
108: jnz L(top) C loop until the counter reaches 0
109:
110:
C Lowest limb: dst[0] = src[0] << shift, nothing to shift in from below.
111: movd (%ebx), %mm0 C mm0 = src[0]
112: popl %ebx C restore the saved ebx
113:
114: psllq %mm6, %mm0 C low 32 bits = src[0] << shift
115: movl %edx, %eax C return value
116:
117: movd %mm0, (%ecx) C store dst[0]
118:
119: emms C leave MMX state before returning
120: ret
121:
122: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>