Annotation of OpenXM_contrib/gmp/mpn/x86/divrem_1.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
1.1.1.2 ! ohara 22: include(`../config.m4')
1.1 maekawa 23:
24:
1.1.1.2 ! ohara 25: C cycles/limb
! 26: C 486 approx 43 maybe
! 27: C P5 44
! 28: C P6 39
! 29: C P6MMX 39
! 30: C K6 20
! 31: C K7 42
! 32: C P4 58
1.1 maekawa 33:
34:
35: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
36: C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
37: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
1.1.1.2 ! ohara 38: C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 39: C mp_limb_t carry);
1.1 maekawa 40: C
41: C Divide src,size by divisor and store the quotient in dst+xsize,size.
42: C Extend the division to fractional quotient limbs in dst,xsize. Return the
43: C remainder. Either or both xsize and size can be 0.
44: C
45: C mpn_divrem_1c takes a carry parameter which is an initial high limb,
46: C effectively one extra limb at the top of src,size. Must have
47: C carry<divisor.
48: C
49: C
50: C Essentially the code is the same as the division based part of
51: C mpn/generic/divrem_1.c, but has the following advantages.
52: C
53: C - If gcc isn't being used then divrem_1.c will get the generic C
54: C udiv_qrnnd() and be rather slow.
55: C
1.1.1.2 ! ohara 56: C - On K6, using the loop instruction is a 10% speedup, but gcc prior to 3.0
! 57: C doesn't generate that instruction.
1.1 maekawa 58: C
59: C A test is done to see if the high limb is less than the divisor, and if so
60: C one less div is done. A div is between 20 and 40 cycles on the various
61: C x86s, so assuming high<divisor about half the time, then this test saves
62: C half that amount. The branch misprediction penalty on each chip is less
63: C than half a div.
64: C
65: C
66: C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
67: C here, so it seems there's nothing to gain by rearranging the loop.
68: C Pairing the mov and loop instructions was found to gain nothing. (The
69: C same is true of the mpn/x86/mod_1.asm loop.)
70: C
71: C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
72: C The loop_or_decljnz macro is an easy way to get a 10% speedup.
73: C
74: C The fast K6 multiply might be thought to suit a multiply-by-inverse,
75: C but that algorithm has been found to suffer from the relatively poor
76: C carry handling on K6 and too many auxiliary instructions. The
77: C fractional part however could be done at about 13 c/l.
78: C
1.1.1.2 ! ohara 79: C P5: Again here the auxiliary instructions hinder a multiply-by-inverse,
1.1 maekawa 80: C though there might be a 10-15% speedup available.
1.1.1.2 ! ohara 81: C
! 82: C It might be thought that moving the load down to pair with the store
! 83: C would save 1 cycle, but that doesn't seem to happen in practice, and
! 84: C in any case would be a mere 2.2% saving, so it's hardly worth bothering
! 85: C about.
1.1 maekawa 86:
87: defframe(PARAM_CARRY, 24) C 6th argument (carry), present only for mpn_divrem_1c
88: defframe(PARAM_DIVISOR,20) C 5th argument (divisor)
89: defframe(PARAM_SIZE, 16) C 4th argument (size)
90: defframe(PARAM_SRC, 12) C 3rd argument (src)
91: defframe(PARAM_XSIZE, 8) C 2nd argument (xsize)
92: defframe(PARAM_DST, 4) C 1st argument (dst); offsets presumably from esp at entry, confirm in defframe
93:
1.1.1.2 ! ohara 94: TEXT
1.1 maekawa 95: ALIGN(16)
96:
97: PROLOGUE(mpn_divrem_1c)
98: deflit(`FRAME',0)
99: C Variant with an initial carry: set up registers, then join the mpn_divrem_1 loops.
100: movl PARAM_SIZE, %ecx C ecx = size
101: pushl %edi FRAME_pushl() C saved here, restored at L(done)
102:
103: movl PARAM_SRC, %edi C edi = src
104: pushl %esi FRAME_pushl() C saved here, restored at L(done)
105:
106: movl PARAM_DIVISOR, %esi C esi = divisor
107: pushl %ebx FRAME_pushl() C saved here, restored at L(done)
108:
109: movl PARAM_DST, %ebx C ebx = dst
110: pushl %ebp FRAME_pushl() C saved here, restored at L(done)
111:
112: movl PARAM_XSIZE, %ebp C ebp = xsize
113: orl %ecx, %ecx C set ZF if size is 0
114:
115: movl PARAM_CARRY, %edx C edx = carry, used as the initial remainder; mov leaves ZF intact
1.1.1.2 ! ohara 116: jz L(fraction) C size is 0: no integer limbs, go to the fraction part
1.1 maekawa 117:
118: leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
1.1.1.2 ! ohara 119: jmp L(integer_top) C enter the shared integer loop with edx = carry
1.1 maekawa 120:
121: EPILOGUE()
122:
123:
124: PROLOGUE(mpn_divrem_1)
125: deflit(`FRAME',0)
126: C Divide src,size by divisor into dst+xsize, then develop xsize fraction limbs into dst; returns the remainder in eax.
127: movl PARAM_SIZE, %ecx C ecx = size
128: pushl %edi FRAME_pushl() C saved here, restored before each ret
129:
130: movl PARAM_SRC, %edi C edi = src
131: pushl %esi FRAME_pushl() C saved here, restored before each ret
132:
133: movl PARAM_DIVISOR, %esi C esi = divisor
134: orl %ecx,%ecx C set ZF if size is 0
135:
136: jz L(size_zero) C size is 0: only edi and esi have been pushed at this point
137: pushl %ebx FRAME_pushl() C saved here, restored at L(done)
138:
139: movl -4(%edi,%ecx,4), %eax C src high limb
140: xorl %edx, %edx C edx = 0, the initial remainder (no carry in)
141:
142: movl PARAM_DST, %ebx C ebx = dst
143: pushl %ebp FRAME_pushl() C saved here, restored at L(done)
144:
145: movl PARAM_XSIZE, %ebp C ebp = xsize
146: cmpl %esi, %eax C compare src high limb against divisor
147:
148: leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
149: jae L(integer_entry) C high >= divisor (unsigned): divide it, eax is already loaded
150:
151:
152: C high<divisor, so high of dst is zero, and avoid one div
153:
154: movl %edx, (%ebx,%ecx,4) C store 0 (edx) as the top quotient limb
155: decl %ecx C one limb fewer to divide; sets ZF if none remain
156:
157: movl %eax, %edx C high limb becomes the running remainder; mov leaves ZF intact
158: jz L(fraction) C no integer limbs left, go to the fraction part
159:
160:
161: L(integer_top):
162: C eax scratch (quotient)
163: C ebx dst+4*xsize-4
164: C ecx counter
165: C edx scratch (remainder)
166: C esi divisor
167: C edi src
168: C ebp xsize
169:
170: movl -4(%edi,%ecx,4), %eax C eax = src limb at index ecx-1, working from the high end down
171: L(integer_entry):
172:
173: divl %esi C edx:eax / divisor: quotient to eax, remainder to edx
174:
175: movl %eax, (%ebx,%ecx,4) C store quotient limb at dst index xsize+ecx-1
176: loop_or_decljnz L(integer_top) C decrement ecx and repeat while non-zero (macro selects loop or decl/jnz)
177:
178:
179: L(fraction):
180: orl %ebp, %ecx C ecx is 0 on every path here, so ecx = xsize and ZF is set if xsize is 0
181: jz L(done) C no fraction limbs requested
182:
183: movl PARAM_DST, %ebx C ebx = dst, base of the fraction limbs
184:
185:
186: L(fraction_top):
187: C eax scratch (quotient)
188: C ebx dst
189: C ecx counter
190: C edx scratch (remainder)
191: C esi divisor
192: C edi
193: C ebp
194:
195: xorl %eax, %eax C low dividend limb is zero for every fraction limb
196:
197: divl %esi C edx:0 / divisor: quotient to eax, remainder to edx
198:
199: movl %eax, -4(%ebx,%ecx,4) C store fraction limb at dst index ecx-1, highest first
200: loop_or_decljnz L(fraction_top) C decrement ecx and repeat while non-zero
201:
202:
203: L(done):
204: popl %ebp C restore in reverse order of the four pushes
205: movl %edx, %eax C return value = final remainder
206: popl %ebx
207: popl %esi
208: popl %edi
209: ret
210:
211:
212: L(size_zero):
213: deflit(`FRAME',8)
214: movl PARAM_XSIZE, %ecx C ecx = xsize, the repeat count for rep stosl
215: xorl %eax, %eax C eax = 0: the fill value and also the returned remainder
216:
217: movl PARAM_DST, %edi C edi = dst, the stosl destination
218:
1.1.1.2 ! ohara 219: cld C better safe than sorry, see mpn/x86/README
1.1 maekawa 220:
221: rep
222: stosl C store eax (0) to xsize limbs starting at dst
223:
224: popl %esi C only edi and esi were pushed on this path
225: popl %edi
226: ret
227: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>