Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C Pentium4: 1.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
29: C
30: C Enhancements:
31: C
32: C There might be a couple of cycles to save by using plain integer code for
33: C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
34: C about 46 (inclusive of some function call overheads).
35:
36: defframe(PARAM_SIZE, 8) C size, second argument of the prototype
37: defframe(PARAM_SRC, 4) C src, first argument of the prototype
38: C NOTE(review): defframe is defined outside this file (config.m4), presumably 4 and 8 are stack offsets of the arguments at entry -- confirm
39: dnl re-use parameter space
40: define(SAVE_EBX, `PARAM_SRC') C alias of the src slot, not referenced by the code visible here
41: define(SAVE_ESI, `PARAM_SIZE') C alias of the size slot, not referenced by the code visible here
42:
43: TEXT
44: ALIGN(16)
45: PROLOGUE(mpn_mod_34lsub1)
46: deflit(`FRAME',0)
47: C Returns in eax a value congruent to the size-limb number at src modulo 2^24-1, no final reduction is done
48: movl PARAM_SIZE, %ecx
49: movl PARAM_SRC, %edx
50: movl (%edx), %eax C src[0], returned unchanged on the L(one) path
51:
52: subl $2, %ecx C ecx = size-2, the flags pick the path below
53: ja L(three_or_more) C size > 2 (unsigned)
54: jne L(one) C not above and not equal, so size < 2, return src[0] as loaded
55: C size == 2, plain integer code, 2^24 == 1 mod 2^24-1 so add the pieces at their reduced weights
56: movl 4(%edx), %edx C src[1]
57: movl %eax, %ecx
58: shrl $24, %eax C src[0] high 8 bits, weight 2^24 == 1
59:
60: andl $0x00FFFFFF, %ecx C src[0] low 24 bits
61: addl %ecx, %eax
62:
63: movl %edx, %ecx
64: shll $8, %edx
65:
66: shrl $16, %ecx C src[1] high 16 bits, weight 2^48 == 1
67: addl %ecx, %eax
68:
69: andl $0x00FFFF00, %edx C src[1] low 16 bits moved up to bit 8, weight 2^32 == 2^8
70: addl %edx, %eax
71:
72: L(one):
73: ret
74:
75:
76: L(three_or_more):
77: pxor %mm0, %mm0 C clear the three 64-bit accumulators
78: pxor %mm1, %mm1
79: pxor %mm2, %mm2
80:
81: pcmpeqd %mm7, %mm7 C all ones
82: psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits
83:
84: pcmpeqd %mm6, %mm6 C all ones
85: psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits
86:
87: L(top):
88: C eax
89: C ebx
90: C ecx counter, size-2 to 0, -1 or -2
91: C edx src, incrementing
92: C
93: C mm0 sum 0mod3 (limbs at offset 0 of each group of three)
94: C mm1 sum 1mod3 (limbs at offset 4)
95: C mm2 sum 2mod3 (limbs at offset 8)
96: C mm3
97: C mm4
98: C mm5
99: C mm6 0x0000000000FFFFFF
100: C mm7 0x00000000FFFFFFFF
101:
102: movd (%edx), %mm3 C movd zero-extends the 32-bit limb to 64 bits
103: paddq %mm3, %mm0
104:
105: movd 4(%edx), %mm3
106: paddq %mm3, %mm1
107:
108: movd 8(%edx), %mm3
109: paddq %mm3, %mm2
110:
111: addl $12, %edx C step src past the 3 limbs just added
112: subl $3, %ecx
113: ja L(top) C taken only if the subtract neither borrowed nor reached zero
114:
115:
116: C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
117:
118: addl $1, %ecx C the flags set here feed both the js and the jz below
119: js L(combine) C 0 more
120:
121: movd (%edx), %mm3 C movd and paddq leave EFLAGS untouched
122: paddq %mm3, %mm0
123:
124: jz L(combine) C 1 more
125:
126: movd 4(%edx), %mm3
127: paddq %mm3, %mm1
128:
129: L(combine):
130: movq %mm7, %mm3 C low halves
131: pand %mm0, %mm3
132:
133: movq %mm7, %mm4
134: pand %mm1, %mm4
135:
136: movq %mm7, %mm5
137: pand %mm2, %mm5
138:
139: psrlq $32, %mm0 C high halves
140: psrlq $32, %mm1
141: psrlq $32, %mm2
142: C modulo 2^24-1 the high halves weigh 2^8 (mm0), 2^16 (mm1) and 1 (mm2), the same as the low halves now in mm4, mm5 and mm3
143: paddq %mm0, %mm4 C fold high halves to give 33 bits each
144: paddq %mm1, %mm5
145: paddq %mm2, %mm3
146:
147: psllq $8, %mm4 C combine at respective offsets
148: psllq $16, %mm5
149: paddq %mm4, %mm3
150: paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits
151:
152: pand %mm3, %mm6 C fold at 24 bits
153: psrlq $24, %mm3
154:
155: paddq %mm6, %mm3 C low 24 bits plus the bits above them
156: movd %mm3, %eax C return value is the low 32 bits of the folded sum
157:
158: ASSERT(z, C nothing left in high dword
159: `psrlq $32, %mm3
160: movd %mm3, %ecx
161: orl %ecx, %ecx')
162:
163: emms C empty the MMX state before returning
164: ret
165:
166: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>