Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/dive_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C            divisor
26: C          odd    even
27: C P54:     24.5   30.5   cycles/limb
28: C P55:     23.0   28.0
29:
30:
31: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
32: C mp_limb_t divisor);
33: C
34: C Plain divl is used for small sizes, since the inverse takes a while to
35: C set up. Multiplying works out faster for size>=3 when the divisor is odd,
36: C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or
37: C size==3 for even are about the same speed with either divl or mul, but the
38: C former is used since it will use up less code cache.
39: C
40: C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
41: C expected. On P54 in the even case the shrdl pairing nonsense (see
42: C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
43: C further 1.5 cycle slowdown for both odd and even.
44:
45: defframe(PARAM_DIVISOR,16) C mp_limb_t divisor, later overwritten with the divisor stripped of twos
46: defframe(PARAM_SIZE, 12) C mp_size_t size, in limbs
47: defframe(PARAM_SRC, 8) C mp_srcptr src
48: defframe(PARAM_DST, 4) C mp_ptr dst
49:
50: dnl re-use parameter space: the inverse is kept in the dst argument slot, which is free once dst has been loaded into edi
51: define(VAR_INVERSE,`PARAM_DST')
52:
53: TEXT
54: C mpn_divexact_1 writes size limbs at dst holding src divided by divisor, on the understanding that the division is exact. Small sizes use a plain divl loop, larger ones multiply by an inverse of the divisor.
55: ALIGN(32)
56: PROLOGUE(mpn_divexact_1)
57: deflit(`FRAME',0)
58: C No return value per the C prototype. Every register pushed below is popped again before each ret.
59: movl PARAM_DIVISOR, %eax
60: movl PARAM_SIZE, %ecx
61: C NOTE(review): FRAME_pushl presumably keeps the PARAM_ offsets valid after each push, it is defined in the included m4 files, confirm there.
62: pushl %esi FRAME_pushl()
63: push %edi FRAME_pushl() C NOTE(review): no l suffix here, unlike the other pushes in this routine
64:
65: movl PARAM_SRC, %esi
66: andl $1, %eax C eax = 1 if the divisor is odd, 0 if even
67:
68: movl PARAM_DST, %edi
69: addl %ecx, %eax C size if even, size+1 if odd
70:
71: cmpl $4, %eax
72: jae L(mul_by_inverse) C taken for size>=3 with an odd divisor, size>=4 with an even one
73:
74: C Small sizes: divide from the high limb down, edx carrying each remainder into the next lower limb. NOTE(review): the loop presumes size>=1, confirm with callers.
75: xorl %edx, %edx C nothing above the high limb
76: L(div_top):
77: movl -4(%esi,%ecx,4), %eax C src limb at index ecx-1
78:
79: divl PARAM_DIVISOR C edx:eax / divisor, quotient to eax, remainder to edx
80:
81: movl %eax, -4(%edi,%ecx,4) C dst limb at index ecx-1
82: decl %ecx
83:
84: jnz L(div_top)
85:
86: popl %edi
87: popl %esi
88:
89: ret
90:
91:
92: C Larger sizes: strip the factors of two from the divisor, form an inverse of the odd part, then produce each quotient limb with multiplies, low limb first.
93: L(mul_by_inverse):
94: movl PARAM_DIVISOR, %eax
95: movl $-1, %ecx C becomes 0 on the first pass through strip_twos
96:
97: L(strip_twos):
98: ASSERT(nz, `orl %eax, %eax')
99: shrl %eax C the bit shifted out lands in the carry flag
100: incl %ecx C shift count, incl leaves the carry flag untouched
101:
102: jnc L(strip_twos) C stop once a 1 bit has been shifted out
103:
104: leal 1(%eax,%eax), %edx C d, the divisor with its low zero bits removed, so odd
105: andl $127, %eax C d/2, 7 bits, used as the index into modlimb_invert_table
106:
107: pushl %ebx FRAME_pushl()
108: pushl %ebp FRAME_pushl()
109:
110: ifdef(`PIC',`
111: call L(here)
112: L(here):
113: popl %ebp C eip
114:
115: addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp C ebp = GOT address
116: C AGI
117: movl modlimb_invert_table@GOT(%ebp), %ebp C ebp = table address read from its GOT entry
118: C AGI
119: movzbl (%eax,%ebp), %eax C inv 8 bits
120: ',`
121:
122: dnl non-PIC
123: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
124: ')
125: C Two rounds of inv = 2*inv - inv*inv*d refine the 8 bit table value, the ASSERT further down expects the result to be a full limb inverse of d.
126: movl %eax, %ebp C inv
127: addl %eax, %eax C 2*inv
128:
129: imull %ebp, %ebp C inv*inv
130:
131: imull %edx, %ebp C inv*inv*d
132:
133: subl %ebp, %eax C inv = 2*inv - inv*inv*d
134: movl PARAM_SIZE, %ebx
135:
136: movl %eax, %ebp C inv, second round
137: addl %eax, %eax C 2*inv
138:
139: imull %ebp, %ebp C inv*inv
140:
141: imull %edx, %ebp C inv*inv*d
142:
143: subl %ebp, %eax C inv = 2*inv - inv*inv*d
144: movl %edx, PARAM_DIVISOR C d without twos
145:
146: leal (%esi,%ebx,4), %esi C src end
147: leal (%edi,%ebx,4), %edi C dst end
148:
149: negl %ebx C -size
150:
151: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
152: pushl %eax FRAME_pushl()
153: imull PARAM_DIVISOR, %eax
154: cmpl $1, %eax
155: popl %eax FRAME_popl()')
156:
157: movl %eax, VAR_INVERSE C stored in the dst argument slot, dst itself is already in edi
158: xorl %ebp, %ebp C initial carry bit
159:
160: movl (%esi,%ebx,4), %eax C src low limb
161: orl %ecx, %ecx C shift
162:
163: movl 4(%esi,%ebx,4), %edx C src second limb (for even)
164: jz L(odd_entry) C flags still from the orl, a zero shift means the divisor was odd
165:
166: shrdl( %cl, %edx, %eax) C eax = low limb of the source shifted right by cl, top bits filled from edx
167:
168: incl %ebx C the even code addresses the current limb at offset -4
169: jmp L(even_entry)
170:
171: C Odd divisor loop: one quotient limb per pass, entered at odd_entry for the low limb.
172: ALIGN(8)
173: L(odd_top):
174: C eax scratch
175: C ebx counter, limbs, negative
176: C ecx
177: C edx
178: C esi src end
179: C edi dst end
180: C ebp carry bit, 0 or -1
181:
182: mull PARAM_DIVISOR C eax holds the quotient limb just stored, edx = high limb of that times d
183:
184: movl (%esi,%ebx,4), %eax C next src limb
185: subl %ebp, %edx C ebp is 0 or -1, so this adds the carry bit to the carry limb
186:
187: subl %edx, %eax C src limb minus carry limb
188:
189: sbbl %ebp, %ebp C ebp = -1 if that subtract borrowed, else 0
190:
191: L(odd_entry):
192: imull VAR_INVERSE, %eax C quotient limb
193:
194: movl %eax, (%edi,%ebx,4)
195:
196: incl %ebx
197: jnz L(odd_top) C ebx reaches zero after the high limb
198:
199:
200: popl %ebp
201: popl %ebx
202:
203: popl %edi
204: popl %esi
205:
206: ret
207:
208: C Even divisor loop: as the odd loop, but each src limb is first shifted right by cl with bits brought in from the limb above.
209: L(even_top):
210: C eax scratch
211: C ebx counter, limbs, negative
212: C ecx twos
213: C edx
214: C esi src end
215: C edi dst end
216: C ebp carry bit, 0 or -1
217:
218: mull PARAM_DIVISOR C eax holds the quotient limb just stored, edx = high limb of that times d
219:
220: subl %ebp, %edx C carry bit
221: movl -4(%esi,%ebx,4), %eax C src limb
222:
223: movl (%esi,%ebx,4), %ebp C and one above it
224:
225: shrdl( %cl, %ebp, %eax) C shifted src limb, ebp is only a temporary here
226:
227: subl %edx, %eax C carry limb
228:
229: sbbl %ebp, %ebp C ebp = -1 if that subtract borrowed, else 0
230:
231: L(even_entry):
232: imull VAR_INVERSE, %eax C quotient limb
233:
234: movl %eax, -4(%edi,%ebx,4)
235: incl %ebx
236:
237: jnz L(even_top) C falls through with the high limb still to do
238:
239:
240: C High limb: there is no limb above it to shift in, so it is done here with a plain shrl.
241: mull PARAM_DIVISOR
242:
243: movl -4(%esi), %eax C src high limb
244: subl %ebp, %edx C add the carry bit to the carry limb
245:
246: shrl %cl, %eax
247:
248: subl %edx, %eax C no carry if division is exact
249:
250: imull VAR_INVERSE, %eax
251:
252: movl %eax, -4(%edi) C dst high limb
253: nop C protect against cache bank clash
254:
255: popl %ebp
256: popl %ebx
257:
258: popl %edi
259: popl %esi
260:
261: ret
262:
263: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>