Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/dive_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P4: 19.0 cycles/limb
26:
27:
28: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C mp_limb_t divisor);
30: C
31: C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
32: C being on the dependent chain and there being plenty of cycles available,
33: C using an unaligned movq on every second iteration measured about 23 c/l.
34: C
35: C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul
36: C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
37: C that might be hidden by out-of-order execution, whereas divl is around 60.
38: C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
39: C faster.
40:
41: defframe(PARAM_DIVISOR,16)
42: defframe(PARAM_SIZE, 12)
43: defframe(PARAM_SRC, 8)
44: defframe(PARAM_DST, 4)
45:
46: TEXT
47:
48: ALIGN(16)
49: PROLOGUE(mpn_divexact_1)
50: deflit(`FRAME',0)
51:
52: movl PARAM_SIZE, %edx
53:
54: movl PARAM_SRC, %eax
55:
56: movl PARAM_DIVISOR, %ecx
57: subl $1, %edx
58: jnz L(two_or_more)
59:
60: movl (%eax), %eax
61: xorl %edx, %edx
62:
63: divl %ecx
64: movl PARAM_DST, %ecx
65:
66: movl %eax, (%ecx)
67: ret
68:
69:
70: L(two_or_more):
71: C eax src
72: C ebx
73: C ecx divisor
74: C edx size-1
75:
76: movl %ecx, %eax
77: bsfl %ecx, %ecx C trailing twos
78:
79: shrl %cl, %eax C d = divisor without twos
80: movd %eax, %mm6
81: movd %ecx, %mm7 C shift
82:
83: shrl %eax C d/2
84:
85: andl $127, %eax C d/2, 7 bits
86:
87: ifdef(`PIC',`
88: call L(movl_eip_ecx)
89:
90: addl $_GLOBAL_OFFSET_TABLE_, %ecx
91:
92: movl modlimb_invert_table@GOT(%ecx), %ecx
93: C
94: movzbl (%eax,%ecx), %eax C inv 8 bits
95: ',`
96: dnl non-PIC
97: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
98: ')
99:
100: C
101:
102: movd %eax, %mm5 C inv
103:
104: movd %eax, %mm0 C inv
105:
106: pmuludq %mm5, %mm5 C inv*inv
107:
108: C
109:
110: pmuludq %mm6, %mm5 C inv*inv*d
111: paddd %mm0, %mm0 C 2*inv
112:
113: C
114:
115: psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
116: pxor %mm5, %mm5
117:
118: paddd %mm0, %mm5
119: pmuludq %mm0, %mm0 C inv*inv
120:
121: pcmpeqd %mm4, %mm4
122: psrlq $32, %mm4 C 0x00000000FFFFFFFF
123:
124: C
125:
126: pmuludq %mm6, %mm0 C inv*inv*d
127: paddd %mm5, %mm5 C 2*inv
128:
129: movl PARAM_SRC, %eax
130: movl PARAM_DST, %ecx
131: pxor %mm1, %mm1 C initial carry limb
132:
133: C
134:
135: psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
136:
137: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
138: pushl %eax FRAME_pushl()
139: movq %mm6, %mm0
140: pmuludq %mm5, %mm0
141: movd %mm0, %eax
142: cmpl $1, %eax
143: popl %eax FRAME_popl()')
144:
145: pxor %mm0, %mm0 C initial carry bit
146:
147:
148: C The dependent chain here is as follows.
149: C
150: C latency
151: C psubq s = (src-cbit) - climb 2
152: C pmuludq q = s*inverse 8
153: C pmuludq prod = q*divisor 8
154: C psrlq climb = high(prod) 2
155: C --
156: C 20
157: C
158: C Yet the loop measures 19.0 c/l, so obviously there's something gained
159: C there over a straight reading of the chip documentation.
160:
161: L(top):
162: C eax src, incrementing
163: C ebx
164: C ecx dst, incrementing
165: C edx counter, size-1 iterations
166: C
167: C mm0 carry bit
168: C mm1 carry limb
169: C mm4 0x00000000FFFFFFFF
170: C mm5 inverse
171: C mm6 divisor
172: C mm7 shift
173:
174: movd (%eax), %mm2
175: movd 4(%eax), %mm3
176: addl $4, %eax
177: punpckldq %mm3, %mm2
178:
179: psrlq %mm7, %mm2
180: pand %mm4, %mm2 C src
181: psubq %mm0, %mm2 C src - cbit
182:
183: psubq %mm1, %mm2 C src - cbit - climb
184: movq %mm2, %mm0
185: psrlq $63, %mm0 C new cbit
186:
187: pmuludq %mm5, %mm2 C s*inverse
188: movd %mm2, (%ecx) C q
189: addl $4, %ecx
190:
191: movq %mm6, %mm1
192: pmuludq %mm2, %mm1 C q*divisor
193: psrlq $32, %mm1 C new climb
194:
195: subl $1, %edx
196: jnz L(top)
197:
198:
199: L(done):
200: movd (%eax), %mm2
201: psrlq %mm7, %mm2 C src
202: psubq %mm0, %mm2 C src - cbit
203:
204: psubq %mm1, %mm2 C src - cbit - climb
205:
206: pmuludq %mm5, %mm2 C s*inverse
207: movd %mm2, (%ecx) C q
208:
209: emms
210: ret
211:
212:
213: ifdef(`PIC',`
214: L(movl_eip_ecx):
215: movl (%esp), %ecx
216: ret
217: ')
218:
219: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>