Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/mode1o.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P4: 19.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
29: C mp_limb_t divisor);
30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
31: C mp_limb_t divisor, mp_limb_t carry);
32: C
33:
C Names for the four stack arguments.  The offsets 4, 8, 12 and 16 follow
C the argument order of the prototypes above: src, size, divisor, carry.
C PARAM_CARRY is read only by the mpn_modexact_1c_odd entry point.
C NOTE(review): defframe and FRAME are defined outside this file; presumably
C each name expands to an esp-relative operand adjusted by FRAME -- confirm.
34: defframe(PARAM_CARRY, 16)
35: defframe(PARAM_DIVISOR,12)
36: defframe(PARAM_SIZE, 8)
37: defframe(PARAM_SRC, 4)
38:
C NOTE(review): TEXT is defined outside this file; presumably it selects the
C code section for what follows -- confirm.
39: TEXT
40:
40:
C mpn_modexact_1c_odd -- entry point taking an initial carry.
C
C Loads the carry argument into mm1, the carry limb register, then jumps to
C L(start_1c) inside mpn_modexact_1_odd, landing just after the pxor that
C would otherwise have cleared mm1.  All remaining work is shared with that
C function, including the return.
41: ALIGN(16)
42: PROLOGUE(mpn_modexact_1c_odd)
43: deflit(`FRAME',0)
44:
45: movd PARAM_CARRY, %mm1 C carry limb = carry argument
46: jmp L(start_1c) C join the shared code, skipping the mm1 clear
47:
C PIC-only helper, placed behind the unconditional jmp above so execution
C never falls into it.  It is reached by the call in the PIC table lookup
C of mpn_modexact_1_odd and hands back the return address of that call in
C edx, which gives the caller its own code address.
48: ifdef(`PIC',`
49: L(movl_eip_edx):
50: movl (%esp), %edx C word at the top of the stack is the return address
51: ret
52: ')
53:
54: EPILOGUE()
55:
56:
C mpn_modexact_1_odd -- entry point without a carry-in.
C
C Clears the carry limb in mm1 and falls into L(start_1c), the code shared
C with mpn_modexact_1c_odd.  The shared code
C   1. builds inv, the inverse of the divisor modulo the limb size, from an
C      8-bit modlimb_invert_table lookup refined by two steps of
C      inv = 2*inv - inv*inv*d;
C   2. runs the L(top) loop once per limb, carrying a borrow bit in mm0 and
C      a carry limb in mm1;
C   3. returns cbit + climb in eax.
C
C Registers written: eax, ecx, edx (PIC build only), mm0, mm1, mm2, mm6, mm7
C and the flags.
C
C The loop tests the count after the body, so PARAM_SIZE must be at least 1.
C NOTE(review): the table index discards bit 0 of d, so the divisor is
C presumably required to be odd, as the function name says -- confirm
C against the caller contract.
57: ALIGN(16)
58: PROLOGUE(mpn_modexact_1_odd)
59: deflit(`FRAME',0)
60:
61: pxor %mm1, %mm1 C carry limb = 0, no carry-in on this entry
62: L(start_1c):
63: movl PARAM_DIVISOR, %eax
64:
65: movd PARAM_DIVISOR, %mm7 C mm7 = d, kept for the whole function
66:
67: shrl %eax
68:
69: andl $127, %eax C d/2, 7 bits, used as the table index
70:
71: ifdef(`PIC',`
72: call L(movl_eip_edx) C edx = address of the instruction after the call
73:
74: addl $_GLOBAL_OFFSET_TABLE_, %edx C turn that address into the GOT base
75:
76: movl modlimb_invert_table@GOT(%edx), %edx C table address from its GOT slot
77: C
78: movzbl (%eax,%edx), %eax C inv 8 bits
79: ',`
80: dnl non-PIC
81: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
82: ')
83:
84: C
85:
C First refinement step.  Two copies of the 8-bit inv are made: mm6 becomes
C inv*inv*d and mm0 becomes 2*inv, and their difference is the new inv.
86: movd %eax, %mm6 C inv
87:
88: movd %eax, %mm0 C inv
89:
90: pmuludq %mm6, %mm6 C inv*inv
91:
92: C
93:
94: pmuludq %mm7, %mm6 C inv*inv*d
95: paddd %mm0, %mm0 C 2*inv
96:
97: C
98:
99: psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
C Second refinement step, with the register roles swapped: mm6 takes a copy
C of the refined inv and ends up holding the final inverse.
100: pxor %mm6, %mm6 C mm6 = 0
101:
102: paddd %mm0, %mm6 C mm6 = 0 + inv, a copy of the refined inv
103: pmuludq %mm0, %mm0 C inv*inv
104:
105: C
106:
107: pmuludq %mm7, %mm0 C inv*inv*d
108: paddd %mm6, %mm6 C 2*inv
109:
110:
111: movl PARAM_SRC, %eax C eax = src pointer for the loop
112: movl PARAM_SIZE, %ecx C ecx = limb count for the loop
113:
114: C
115:
116: psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
117:
C Self-check of the finished inverse.  eax is saved and restored around it
C because it already holds the src pointer.
C NOTE(review): ASSERT, FRAME_pushl and FRAME_popl are defined outside this
C file; presumably the check is assembled only in assertion-enabled builds
C and the FRAME macros keep PARAM_DIVISOR valid across the push -- confirm.
118: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
119: pushl %eax FRAME_pushl()
120: movd %mm6, %eax
121: imul PARAM_DIVISOR, %eax
122: cmpl $1, %eax
123: popl %eax FRAME_popl()')
124:
125: pxor %mm0, %mm0 C carry bit = 0 before the first limb
126:
127:
128: C The dependent chain here is as follows.
129: C
130: C latency
131: C psubq s = (src-cbit) - climb 2
132: C pmuludq q = s*inverse 8
133: C pmuludq prod = q*divisor 8
134: C psrlq climb = high(prod) 2
135: C --
136: C 20
137: C
138: C Yet the loop measures 19.0 c/l, so obviously there's something gained
139: C there over a straight reading of the chip documentation.
140:
141: L(top):
142: C eax src, incrementing
143: C ebx
144: C ecx counter, limbs
145: C edx
146: C
147: C mm0 carry bit
148: C mm1 carry limb
149: C mm6 inverse
150: C mm7 divisor
151:
152: movd (%eax), %mm2 C next source limb, zero extended to 64 bits
153: addl $4, %eax C advance src by one 4-byte limb
154:
155: psubq %mm0, %mm2 C src - cbit
156:
157: psubq %mm1, %mm2 C src - cbit - climb
158: movq %mm2, %mm0 C copy s before mm2 is overwritten by the multiply
159: psrlq $63, %mm0 C new cbit = top bit of the 64-bit s, set on borrow
160:
161: pmuludq %mm6, %mm2 C s*inverse, low 32 bits of the result are q
162:
163: movq %mm7, %mm1 C mm1 = d, ready to be multiplied by q
164: pmuludq %mm2, %mm1 C q*divisor
165: psrlq $32, %mm1 C new climb = high 32 bits of q*divisor
166:
167: subl $1, %ecx C one limb consumed
168: jnz L(top) C repeat until the count reaches zero
169:
170:
C No jump to L(done) is visible in this file; it is reached by falling out
C of the loop.
171: L(done):
172: paddq %mm1, %mm0 C cbit + climb
173: movd %mm0, %eax C return value = low 32 bits of the sum
174: emms C leave the MMX state empty for the caller
175: ret
176:
177: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>